Check in pre-generated perlasm and error data files
This adds a tool for managing pre-generated files, aligning our CMake
and non-CMake builds. The plan is roughly:
The source of truth for the file lists will (eventually) be build.json.
This describes the build in terms of the files that we directly edit.
However, we have a two-phase build. First, a pregeneration step
transforms some of the less convenient inputs into checked-in files.
Notably, perlasm files get expanded. This produces an equivalent JSON
structure with fewer inputs. The same tool then outputs that structure
into whatever build systems we want.
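For example, a single aarch64 perlasm entry in build.json expands into
one checked-in assembly file per supported OS, roughly (the output
names here are illustrative):

    {"src": "crypto/fipsmodule/sha/asm/sha1-armv8.pl"}
      -> gen/bcm/sha1-armv8-apple.S
      -> gen/bcm/sha1-armv8-linux.S
      -> gen/bcm/sha1-armv8-win.S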
This initial version pre-generates err_data.c and perlasm files. I've
not wired up the various build formats, except for CMake (for the CMake
build to consume) and JSON (for generate_build_files.py to parse).
build.json is also, for now, only a subset of the build. Later changes
will move the rest of the build over.
The upshot of all this is we no longer have a Perl build dependency!
Perl is now only needed when working on BoringSSL. It nearly removes the
Go one, but Go is still needed to run and (for now) build the tests.
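Concretely, consumers should now be able to configure and build with
just CMake and a C/C++ toolchain, with no Perl installed; for example
(commands illustrative, see BUILDING.md):

    cmake -GNinja -B build
    ninja -C build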
To keep the generated files up-to-date, once this lands, I'll update our
CI to run `go run ./util/pregenerate -check`, which asserts that all
generated files are correct. From there we can land the later changes in
this patch series that use this more extensively. My eventual goal is to
replace generate_build_files.py and the "master-with-bazel" branch
altogether. Instead we'll just have sources.bzl, sources.gni, etc. all
checked into the tree directly. And then the normal branch will just
have both a CMake and a Bazel build in it.
Update-Note: generate_build_files.py no longer generates assembly files
or err_data.c. Those are now checked into the tree directly.
Bug: 542
Change-Id: I71f5ff7417be811f8b7888b345279474e6b38ee9
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/67288
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/BUILDING.md b/BUILDING.md
index fb28e89..e10d964 100644
--- a/BUILDING.md
+++ b/BUILDING.md
@@ -16,15 +16,6 @@
* [CMake](https://cmake.org/download/) 3.12 or later is required.
- * A recent version of Perl is required. On Windows,
- [Active State Perl](http://www.activestate.com/activeperl/) has been
- reported to work, as has MSYS Perl.
- [Strawberry Perl](http://strawberryperl.com/) also works but it adds GCC
- to `PATH`, which can confuse some build tools when identifying the compiler
- (removing `C:\Strawberry\c\bin` from `PATH` should resolve any problems).
- If Perl is not found by CMake, it may be configured explicitly by setting
- `PERL_EXECUTABLE`.
-
* Building with [Ninja](https://ninja-build.org/) instead of Make is
recommended, because it makes builds faster. On Windows, CMake's Visual
   Studio generator may also work, but it is not tested regularly and requires
@@ -211,3 +202,17 @@
Both sets of tests may also be run with `ninja -C build run_tests`, but CMake
3.2 or later is required to avoid Ninja's output buffering.
+
+# Pre-generated Files
+
+If modifying perlasm files or `util/pregenerate/build.json`, you will need to
+run `go run ./util/pregenerate` to refresh some pre-generated files. To do
+this, a recent version of Perl is required.
+
+On Windows, [Active State Perl](http://www.activestate.com/activeperl/) has been
+reported to work, as has MSYS Perl.
+[Strawberry Perl](http://strawberryperl.com/) also works but it adds GCC
+to `PATH`, which can confuse some build tools when identifying the compiler
+(removing `C:\Strawberry\c\bin` from `PATH` should resolve any problems).
+
+See [gen/README.md](./gen/README.md) for more details.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 59623e0..1410c43 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,7 +18,7 @@
include(sources.cmake)
include(cmake/go.cmake)
include(cmake/paths.cmake)
-include(cmake/perlasm.cmake)
+include(gen/sources.cmake)
enable_language(C)
enable_language(CXX)
@@ -43,8 +43,6 @@
endif()
endfunction()
-find_package(Perl REQUIRED)
-
if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND NOT CMAKE_CROSSCOMPILING)
find_package(PkgConfig QUIET)
if (PkgConfig_FOUND)
@@ -530,7 +528,13 @@
target_link_libraries(decrepit crypto ssl)
add_library(test_support_lib STATIC ${TEST_SUPPORT_SOURCES})
-if (LIBUNWIND_FOUND)
+if(OPENSSL_ASM)
+ target_sources(test_support_lib PRIVATE ${TEST_SUPPORT_SOURCES_ASM})
+endif()
+if(OPENSSL_NASM)
+ target_sources(test_support_lib PRIVATE ${TEST_SUPPORT_SOURCES_NASM})
+endif()
+if(LIBUNWIND_FOUND)
target_compile_options(test_support_lib PRIVATE ${LIBUNWIND_CFLAGS_OTHER})
target_include_directories(test_support_lib PRIVATE ${LIBUNWIND_INCLUDE_DIRS})
target_link_libraries(test_support_lib ${LIBUNWIND_LDFLAGS})
diff --git a/build.json b/build.json
new file mode 100644
index 0000000..0bf49a7
--- /dev/null
+++ b/build.json
@@ -0,0 +1,136 @@
+// This file defines BoringSSL's build, expressed in terms of the input source
+// files that BoringSSL developers edit. It is a JSON file with line comments,
+// which are stripped before parsing. It drives ./util/pregenerate, which
+// converts some of those inputs (e.g. perlasm files) into pre-generated
+// outputs. This produces a simpler build, which is then converted into build
+// files of various syntaxes.
+//
+// When modifying this file, run `go run ./util/pregenerate`. See gen/README.md
+// for more details, and util/pregenerate/build.go for the schema.
+//
+// TODO(crbug.com/boringssl/542): Moving build inputs to this file is still work
+// in progress, so this file is currently incomplete.
+{
+ "bcm": {
+ "perlasm_aarch64": [
+ {"src": "crypto/fipsmodule/aes/asm/aesv8-armx.pl", "dst": "aesv8-armv8"},
+ {"src": "crypto/fipsmodule/modes/asm/aesv8-gcm-armv8.pl"},
+ {"src": "crypto/fipsmodule/bn/asm/armv8-mont.pl"},
+ {"src": "crypto/fipsmodule/bn/asm/bn-armv8.pl"},
+ {"src": "crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl"},
+ {"src": "crypto/fipsmodule/modes/asm/ghashv8-armx.pl", "dst": "ghashv8-armv8"},
+ {"src": "crypto/fipsmodule/ec/asm/p256_beeu-armv8-asm.pl"},
+ {"src": "crypto/fipsmodule/ec/asm/p256-armv8-asm.pl"},
+ {"src": "crypto/fipsmodule/sha/asm/sha1-armv8.pl"},
+ {"src": "crypto/fipsmodule/sha/asm/sha512-armv8.pl", "args": ["sha256"], "dst": "sha256-armv8"},
+ {"src": "crypto/fipsmodule/sha/asm/sha512-armv8.pl", "args": ["sha512"]},
+ {"src": "crypto/fipsmodule/aes/asm/vpaes-armv8.pl"}
+ ],
+ "perlasm_arm": [
+ {"src": "crypto/fipsmodule/aes/asm/aesv8-armx.pl", "dst": "aesv8-armv7"},
+ {"src": "crypto/fipsmodule/bn/asm/armv4-mont.pl"},
+ {"src": "crypto/fipsmodule/aes/asm/bsaes-armv7.pl"},
+ {"src": "crypto/fipsmodule/modes/asm/ghash-armv4.pl"},
+ {"src": "crypto/fipsmodule/modes/asm/ghashv8-armx.pl", "dst": "ghashv8-armv7"},
+ {"src": "crypto/fipsmodule/sha/asm/sha1-armv4-large.pl"},
+ {"src": "crypto/fipsmodule/sha/asm/sha256-armv4.pl"},
+ {"src": "crypto/fipsmodule/sha/asm/sha512-armv4.pl"},
+ {"src": "crypto/fipsmodule/aes/asm/vpaes-armv7.pl"}
+ ],
+ "perlasm_x86": [
+ {"src": "crypto/fipsmodule/aes/asm/aesni-x86.pl"},
+ {"src": "crypto/fipsmodule/bn/asm/bn-586.pl"},
+ {"src": "crypto/fipsmodule/bn/asm/co-586.pl"},
+ {"src": "crypto/fipsmodule/modes/asm/ghash-ssse3-x86.pl"},
+ {"src": "crypto/fipsmodule/modes/asm/ghash-x86.pl"},
+ {"src": "crypto/fipsmodule/md5/asm/md5-586.pl"},
+ {"src": "crypto/fipsmodule/sha/asm/sha1-586.pl"},
+ {"src": "crypto/fipsmodule/sha/asm/sha256-586.pl"},
+ {"src": "crypto/fipsmodule/sha/asm/sha512-586.pl"},
+ {"src": "crypto/fipsmodule/aes/asm/vpaes-x86.pl"},
+ {"src": "crypto/fipsmodule/bn/asm/x86-mont.pl"}
+ ],
+ "perlasm_x86_64": [
+ {"src": "crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl"},
+ {"src": "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"},
+ {"src": "crypto/fipsmodule/modes/asm/ghash-ssse3-x86_64.pl"},
+ {"src": "crypto/fipsmodule/modes/asm/ghash-x86_64.pl"},
+ {"src": "crypto/fipsmodule/md5/asm/md5-x86_64.pl"},
+ {"src": "crypto/fipsmodule/ec/asm/p256_beeu-x86_64-asm.pl"},
+ {"src": "crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl"},
+ {"src": "crypto/fipsmodule/rand/asm/rdrand-x86_64.pl"},
+ {"src": "crypto/fipsmodule/bn/asm/rsaz-avx2.pl"},
+ {"src": "crypto/fipsmodule/sha/asm/sha1-x86_64.pl"},
+ {"src": "crypto/fipsmodule/sha/asm/sha512-x86_64.pl", "args": ["sha256"], "dst": "sha256-x86_64"},
+ {"src": "crypto/fipsmodule/sha/asm/sha512-x86_64.pl", "args": ["sha512"]},
+ {"src": "crypto/fipsmodule/aes/asm/vpaes-x86_64.pl"},
+ {"src": "crypto/fipsmodule/bn/asm/x86_64-mont.pl"},
+ {"src": "crypto/fipsmodule/bn/asm/x86_64-mont5.pl"}
+ ]
+ },
+ "crypto": {
+ "err_data": [
+ "crypto/err/asn1.errordata",
+ "crypto/err/bio.errordata",
+ "crypto/err/bn.errordata",
+ "crypto/err/cipher.errordata",
+ "crypto/err/conf.errordata",
+ "crypto/err/dh.errordata",
+ "crypto/err/digest.errordata",
+ "crypto/err/dsa.errordata",
+ "crypto/err/ecdh.errordata",
+ "crypto/err/ecdsa.errordata",
+ "crypto/err/ec.errordata",
+ "crypto/err/engine.errordata",
+ "crypto/err/evp.errordata",
+ "crypto/err/hkdf.errordata",
+ "crypto/err/obj.errordata",
+ "crypto/err/pem.errordata",
+ "crypto/err/pkcs7.errordata",
+ "crypto/err/pkcs8.errordata",
+ "crypto/err/rsa.errordata",
+ "crypto/err/ssl.errordata",
+ "crypto/err/trust_token.errordata",
+ "crypto/err/x509.errordata",
+ "crypto/err/x509v3.errordata"
+ ],
+ "asm": [
+ "crypto/curve25519/asm/x25519-asm-arm.S",
+ "crypto/hrss/asm/poly_rq_mul.S",
+ "crypto/poly1305/poly1305_arm_asm.S",
+ "third_party/fiat/asm/fiat_curve25519_adx_mul.S",
+ "third_party/fiat/asm/fiat_curve25519_adx_square.S",
+ "third_party/fiat/asm/fiat_p256_adx_mul.S",
+ "third_party/fiat/asm/fiat_p256_adx_sqr.S"
+ ],
+ "perlasm_aarch64": [
+ {"src": "crypto/chacha/asm/chacha-armv8.pl"},
+ {"src": "crypto/cipher_extra/asm/chacha20_poly1305_armv8.pl"}
+ ],
+ "perlasm_arm": [
+ {"src": "crypto/chacha/asm/chacha-armv4.pl"}
+ ],
+ "perlasm_x86": [
+ {"src": "crypto/chacha/asm/chacha-x86.pl"}
+ ],
+ "perlasm_x86_64": [
+ {"src": "crypto/chacha/asm/chacha-x86_64.pl"},
+ {"src": "crypto/cipher_extra/asm/aes128gcmsiv-x86_64.pl"},
+ {"src": "crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl"}
+ ]
+ },
+ "test_support": {
+ "perlasm_aarch64": [
+ {"src": "crypto/test/asm/trampoline-armv8.pl"}
+ ],
+ "perlasm_arm": [
+ {"src": "crypto/test/asm/trampoline-armv4.pl"}
+ ],
+ "perlasm_x86": [
+ {"src": "crypto/test/asm/trampoline-x86.pl"}
+ ],
+ "perlasm_x86_64": [
+ {"src": "crypto/test/asm/trampoline-x86_64.pl"}
+ ]
+ }
+}
diff --git a/cmake/perlasm.cmake b/cmake/perlasm.cmake
deleted file mode 100644
index 17a47b9..0000000
--- a/cmake/perlasm.cmake
+++ /dev/null
@@ -1,57 +0,0 @@
-macro(append_to_parent_scope var)
- list(APPEND ${var} ${ARGN})
- set(${var} "${${var}}" PARENT_SCOPE)
-endmacro()
-
-function(add_perlasm_target dest src)
- get_filename_component(dir ${dest} DIRECTORY)
- if(dir STREQUAL "")
- set(dir ".")
- endif()
-
- add_custom_command(
- OUTPUT ${dest}
- COMMAND ${CMAKE_COMMAND} -E make_directory ${dir}
- COMMAND ${PERL_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/${src} ${ARGN}
- ${dest}
- DEPENDS
- ${src}
- ${PROJECT_SOURCE_DIR}/crypto/perlasm/arm-xlate.pl
- ${PROJECT_SOURCE_DIR}/crypto/perlasm/x86_64-xlate.pl
- ${PROJECT_SOURCE_DIR}/crypto/perlasm/x86asm.pl
- ${PROJECT_SOURCE_DIR}/crypto/perlasm/x86gas.pl
- ${PROJECT_SOURCE_DIR}/crypto/perlasm/x86masm.pl
- ${PROJECT_SOURCE_DIR}/crypto/perlasm/x86nasm.pl
- WORKING_DIRECTORY .
- )
-endfunction()
-
-# perlasm generates perlasm output from a given file. arch specifies the
-# architecture. dest specifies the basename of the output file. The list of
-# generated files will be appended to ${var}_ASM and ${var}_NASM depending on
-# the assembler used. Extra arguments are passed to the perlasm script.
-function(perlasm var arch dest src)
- if(arch STREQUAL "aarch64")
- add_perlasm_target("${dest}-apple.S" ${src} ios64 ${ARGN})
- add_perlasm_target("${dest}-linux.S" ${src} linux64 ${ARGN})
- add_perlasm_target("${dest}-win.S" ${src} win64 ${ARGN})
- append_to_parent_scope("${var}_ASM" "${dest}-apple.S" "${dest}-linux.S" "${dest}-win.S")
- elseif(arch STREQUAL "arm")
- add_perlasm_target("${dest}-linux.S" ${src} linux32 ${ARGN})
- append_to_parent_scope("${var}_ASM" "${dest}-linux.S")
- elseif(arch STREQUAL "x86")
- add_perlasm_target("${dest}-apple.S" ${src} macosx -fPIC ${ARGN})
- add_perlasm_target("${dest}-linux.S" ${src} elf -fPIC ${ARGN})
- add_perlasm_target("${dest}-win.asm" ${src} win32n ${ARGN})
- append_to_parent_scope("${var}_ASM" "${dest}-apple.S" "${dest}-linux.S")
- append_to_parent_scope("${var}_NASM" "${dest}-win.asm")
- elseif(arch STREQUAL "x86_64")
- add_perlasm_target("${dest}-apple.S" ${src} macosx ${ARGN})
- add_perlasm_target("${dest}-linux.S" ${src} elf ${ARGN})
- add_perlasm_target("${dest}-win.asm" ${src} nasm ${ARGN})
- append_to_parent_scope("${var}_ASM" "${dest}-apple.S" "${dest}-linux.S")
- append_to_parent_scope("${var}_NASM" "${dest}-win.asm")
- else()
- message(FATAL_ERROR "Unknown perlasm architecture: $arch")
- endif()
-endfunction()
diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt
index bc32ef0..dbed8cb 100644
--- a/crypto/CMakeLists.txt
+++ b/crypto/CMakeLists.txt
@@ -1,72 +1,9 @@
-set(
- CRYPTO_SOURCES_ASM
- curve25519/asm/x25519-asm-arm.S
- hrss/asm/poly_rq_mul.S
- poly1305/poly1305_arm_asm.S
- ../third_party/fiat/asm/fiat_curve25519_adx_mul.S
- ../third_party/fiat/asm/fiat_curve25519_adx_square.S
- ../third_party/fiat/asm/fiat_p256_adx_mul.S
- ../third_party/fiat/asm/fiat_p256_adx_sqr.S
-)
-perlasm(CRYPTO_SOURCES aarch64 chacha/chacha-armv8 chacha/asm/chacha-armv8.pl)
-perlasm(CRYPTO_SOURCES aarch64 cipher_extra/chacha20_poly1305_armv8 cipher_extra/asm/chacha20_poly1305_armv8.pl)
-perlasm(CRYPTO_SOURCES aarch64 test/trampoline-armv8 test/asm/trampoline-armv8.pl)
-perlasm(CRYPTO_SOURCES arm chacha/chacha-armv4 chacha/asm/chacha-armv4.pl)
-perlasm(CRYPTO_SOURCES arm test/trampoline-armv4 test/asm/trampoline-armv4.pl)
-perlasm(CRYPTO_SOURCES x86 chacha/chacha-x86 chacha/asm/chacha-x86.pl)
-perlasm(CRYPTO_SOURCES x86 test/trampoline-x86 test/asm/trampoline-x86.pl)
-perlasm(CRYPTO_SOURCES x86_64 chacha/chacha-x86_64 chacha/asm/chacha-x86_64.pl)
-perlasm(CRYPTO_SOURCES x86_64 cipher_extra/aes128gcmsiv-x86_64 cipher_extra/asm/aes128gcmsiv-x86_64.pl)
-perlasm(CRYPTO_SOURCES x86_64 cipher_extra/chacha20_poly1305_x86_64 cipher_extra/asm/chacha20_poly1305_x86_64.pl)
-perlasm(CRYPTO_SOURCES x86_64 test/trampoline-x86_64 test/asm/trampoline-x86_64.pl)
-
-perlasm(BCM_SOURCES aarch64 fipsmodule/aesv8-armv8 fipsmodule/aes/asm/aesv8-armx.pl)
-perlasm(BCM_SOURCES aarch64 fipsmodule/aesv8-gcm-armv8 fipsmodule/modes/asm/aesv8-gcm-armv8.pl)
-perlasm(BCM_SOURCES aarch64 fipsmodule/armv8-mont fipsmodule/bn/asm/armv8-mont.pl)
-perlasm(BCM_SOURCES aarch64 fipsmodule/bn-armv8 fipsmodule/bn/asm/bn-armv8.pl)
-perlasm(BCM_SOURCES aarch64 fipsmodule/ghash-neon-armv8 fipsmodule/modes/asm/ghash-neon-armv8.pl)
-perlasm(BCM_SOURCES aarch64 fipsmodule/ghashv8-armv8 fipsmodule/modes/asm/ghashv8-armx.pl)
-perlasm(BCM_SOURCES aarch64 fipsmodule/p256_beeu-armv8-asm fipsmodule/ec/asm/p256_beeu-armv8-asm.pl)
-perlasm(BCM_SOURCES aarch64 fipsmodule/p256-armv8-asm fipsmodule/ec/asm/p256-armv8-asm.pl)
-perlasm(BCM_SOURCES aarch64 fipsmodule/sha1-armv8 fipsmodule/sha/asm/sha1-armv8.pl)
-perlasm(BCM_SOURCES aarch64 fipsmodule/sha256-armv8 fipsmodule/sha/asm/sha512-armv8.pl sha256)
-perlasm(BCM_SOURCES aarch64 fipsmodule/sha512-armv8 fipsmodule/sha/asm/sha512-armv8.pl sha512)
-perlasm(BCM_SOURCES aarch64 fipsmodule/vpaes-armv8 fipsmodule/aes/asm/vpaes-armv8.pl)
-perlasm(BCM_SOURCES arm fipsmodule/aesv8-armv7 fipsmodule/aes/asm/aesv8-armx.pl)
-perlasm(BCM_SOURCES arm fipsmodule/armv4-mont fipsmodule/bn/asm/armv4-mont.pl)
-perlasm(BCM_SOURCES arm fipsmodule/bsaes-armv7 fipsmodule/aes/asm/bsaes-armv7.pl)
-perlasm(BCM_SOURCES arm fipsmodule/ghash-armv4 fipsmodule/modes/asm/ghash-armv4.pl)
-perlasm(BCM_SOURCES arm fipsmodule/ghashv8-armv7 fipsmodule/modes/asm/ghashv8-armx.pl)
-perlasm(BCM_SOURCES arm fipsmodule/sha1-armv4-large fipsmodule/sha/asm/sha1-armv4-large.pl)
-perlasm(BCM_SOURCES arm fipsmodule/sha256-armv4 fipsmodule/sha/asm/sha256-armv4.pl)
-perlasm(BCM_SOURCES arm fipsmodule/sha512-armv4 fipsmodule/sha/asm/sha512-armv4.pl)
-perlasm(BCM_SOURCES arm fipsmodule/vpaes-armv7 fipsmodule/aes/asm/vpaes-armv7.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/aesni-x86 fipsmodule/aes/asm/aesni-x86.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/bn-586 fipsmodule/bn/asm/bn-586.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/co-586 fipsmodule/bn/asm/co-586.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/ghash-ssse3-x86 fipsmodule/modes/asm/ghash-ssse3-x86.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/ghash-x86 fipsmodule/modes/asm/ghash-x86.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/md5-586 fipsmodule/md5/asm/md5-586.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/sha1-586 fipsmodule/sha/asm/sha1-586.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/sha256-586 fipsmodule/sha/asm/sha256-586.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/sha512-586 fipsmodule/sha/asm/sha512-586.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/vpaes-x86 fipsmodule/aes/asm/vpaes-x86.pl)
-perlasm(BCM_SOURCES x86 fipsmodule/x86-mont fipsmodule/bn/asm/x86-mont.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/aesni-gcm-x86_64 fipsmodule/modes/asm/aesni-gcm-x86_64.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/aesni-x86_64 fipsmodule/aes/asm/aesni-x86_64.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/ghash-ssse3-x86_64 fipsmodule/modes/asm/ghash-ssse3-x86_64.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/ghash-x86_64 fipsmodule/modes/asm/ghash-x86_64.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/md5-x86_64 fipsmodule/md5/asm/md5-x86_64.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/p256_beeu-x86_64-asm fipsmodule/ec/asm/p256_beeu-x86_64-asm.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/p256-x86_64-asm fipsmodule/ec/asm/p256-x86_64-asm.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/rdrand-x86_64 fipsmodule/rand/asm/rdrand-x86_64.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/rsaz-avx2 fipsmodule/bn/asm/rsaz-avx2.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/sha1-x86_64 fipsmodule/sha/asm/sha1-x86_64.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/sha256-x86_64 fipsmodule/sha/asm/sha512-x86_64.pl sha256)
-perlasm(BCM_SOURCES x86_64 fipsmodule/sha512-x86_64 fipsmodule/sha/asm/sha512-x86_64.pl sha512)
-perlasm(BCM_SOURCES x86_64 fipsmodule/vpaes-x86_64 fipsmodule/aes/asm/vpaes-x86_64.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/x86_64-mont fipsmodule/bn/asm/x86_64-mont.pl)
-perlasm(BCM_SOURCES x86_64 fipsmodule/x86_64-mont5 fipsmodule/bn/asm/x86_64-mont5.pl)
+# TODO(crbug.com/boringssl/524): Avoid needing this transform by instead moving
+# this up a directory.
+list(TRANSFORM BCM_SOURCES_ASM PREPEND "../")
+list(TRANSFORM BCM_SOURCES_NASM PREPEND "../")
+list(TRANSFORM CRYPTO_SOURCES_ASM PREPEND "../")
+list(TRANSFORM CRYPTO_SOURCES_NASM PREPEND "../")
if(OPENSSL_ASM)
list(APPEND CRYPTO_SOURCES_ASM_USED ${CRYPTO_SOURCES_ASM})
@@ -77,37 +14,6 @@
list(APPEND BCM_SOURCES_ASM_USED ${BCM_SOURCES_NASM})
endif()
-add_custom_command(
- OUTPUT err_data.c
- COMMAND ${GO_EXECUTABLE} run err_data_generate.go > ${CMAKE_CURRENT_BINARY_DIR}/err_data.c
- DEPENDS
- err/err_data_generate.go
- err/asn1.errordata
- err/bio.errordata
- err/bn.errordata
- err/cipher.errordata
- err/conf.errordata
- err/dh.errordata
- err/digest.errordata
- err/dsa.errordata
- err/ecdh.errordata
- err/ecdsa.errordata
- err/ec.errordata
- err/engine.errordata
- err/evp.errordata
- err/hkdf.errordata
- err/obj.errordata
- err/pem.errordata
- err/pkcs7.errordata
- err/pkcs8.errordata
- err/rsa.errordata
- err/ssl.errordata
- err/trust_token.errordata
- err/x509.errordata
- err/x509v3.errordata
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/err
-)
-
if(FIPS_DELOCATE AND FIPS_SHARED)
message(FATAL_ERROR "Can't set both delocate and shared mode for FIPS build")
endif()
@@ -128,9 +34,9 @@
add_custom_command(
OUTPUT bcm-delocated.S
COMMAND
- ./delocate
+ ${CMAKE_CURRENT_BINARY_DIR}/delocate
-a $<TARGET_FILE:bcm_c_generated_asm>
- -o bcm-delocated.S
+ -o ${CMAKE_CURRENT_BINARY_DIR}/bcm-delocated.S
-cc ${CMAKE_ASM_COMPILER}
-cc-flags "${TARGET_FLAG} ${CMAKE_ASM_FLAGS}"
${PROJECT_SOURCE_DIR}/include/openssl/arm_arch.h
@@ -144,7 +50,7 @@
${PROJECT_SOURCE_DIR}/include/openssl/arm_arch.h
${PROJECT_SOURCE_DIR}/include/openssl/asm_base.h
${PROJECT_SOURCE_DIR}/include/openssl/target.h
- WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
add_library(bcm_hashunset STATIC bcm-delocated.S)
@@ -272,7 +178,6 @@
ec_extra/ec_derive.c
ec_extra/hash_to_curve.c
err/err.c
- err_data.c
engine/engine.c
evp/evp.c
evp/evp_asn1.c
@@ -419,6 +324,9 @@
x509/x509name.c
x509/x509rset.c
x509/x509spki.c
+  # TODO(crbug.com/boringssl/542): Pick up this and the rest of the source list
+ # from util/pregenerate.
+ ../gen/crypto/err_data.c
${CRYPTO_FIPS_OBJECTS}
${CRYPTO_SOURCES_ASM_USED}
diff --git a/gen/README.md b/gen/README.md
new file mode 100644
index 0000000..3ab6ec4
--- /dev/null
+++ b/gen/README.md
@@ -0,0 +1,26 @@
+# Pre-generated files
+
+This directory contains a number of pre-generated build artifacts. To simplify
+downstream builds, they are checked into the repository, rather than dynamically
+generated as part of the build.
+
+When developing on BoringSSL, if any inputs to these files are modified, you
+must run the following command to update the generated files:
+
+ go run ./util/pregenerate
+
+To check that files are up-to-date without updating them, run:
+
+ go run ./util/pregenerate -check
+
+This is run on CI to ensure the generated files remain up-to-date.
+
+To speed up local iteration, the tool accepts additional arguments to filter
+the files generated. For example, if editing `aesni-x86_64.pl`, this command
+will only update files with "aesni-x86_64" as a substring.
+
+ go run ./util/pregenerate aesni-x86_64
+
+For convenience, all files in this directory, including this README, are managed
+by the tool. This means the whole directory may be deleted and regenerated from
+scratch at any time.
diff --git a/gen/bcm/aesni-gcm-x86_64-apple.S b/gen/bcm/aesni-gcm-x86_64-apple.S
new file mode 100644
index 0000000..e1247bc
--- /dev/null
+++ b/gen/bcm/aesni-gcm-x86_64-apple.S
@@ -0,0 +1,868 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text
+
+
+.p2align 5
+_aesni_ctr32_ghash_6x:
+
+ vmovdqu 32(%r11),%xmm2
+ subq $6,%rdx
+ vpxor %xmm4,%xmm4,%xmm4
+ vmovdqu 0-128(%rcx),%xmm15
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovdqu %xmm4,16+8(%rsp)
+ jmp L$oop6x
+
+.p2align 5
+L$oop6x:
+ addl $100663296,%ebx
+ jc L$handle_ctr32
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm15,%xmm10,%xmm10
+ vpxor %xmm15,%xmm11,%xmm11
+
+L$resume_ctr32:
+ vmovdqu %xmm1,(%r8)
+ vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
+ vpxor %xmm15,%xmm12,%xmm12
+ vmovups 16-128(%rcx),%xmm2
+ vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ xorq %r12,%r12
+ cmpq %r14,%r15
+
+ vaesenc %xmm2,%xmm9,%xmm9
+ vmovdqu 48+8(%rsp),%xmm0
+ vpxor %xmm15,%xmm13,%xmm13
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
+ vaesenc %xmm2,%xmm10,%xmm10
+ vpxor %xmm15,%xmm14,%xmm14
+ setnc %r12b
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vmovdqu 16-32(%r9),%xmm3
+ negq %r12
+ vaesenc %xmm2,%xmm12,%xmm12
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
+ vpxor %xmm4,%xmm8,%xmm8
+ vaesenc %xmm2,%xmm13,%xmm13
+ vpxor %xmm5,%xmm1,%xmm4
+ andq $0x60,%r12
+ vmovups 32-128(%rcx),%xmm15
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
+ vaesenc %xmm2,%xmm14,%xmm14
+
+ vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
+ leaq (%r14,%r12,1),%r14
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
+ vmovdqu 64+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 88(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 80(%r14),%r12
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,32+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,40+8(%rsp)
+ vmovdqu 48-32(%r9),%xmm5
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 48-128(%rcx),%xmm15
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
+ vmovdqu 80+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqu 64-32(%r9),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 64-128(%rcx),%xmm15
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 72(%r14),%r13
+ vpxor %xmm5,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 64(%r14),%r12
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
+ vmovdqu 96+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,48+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,56+8(%rsp)
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 96-32(%r9),%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 80-128(%rcx),%xmm15
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 56(%r14),%r13
+ vpxor %xmm1,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
+ vpxor 112+8(%rsp),%xmm8,%xmm8
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 48(%r14),%r12
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,64+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,72+8(%rsp)
+ vpxor %xmm3,%xmm4,%xmm4
+ vmovdqu 112-32(%r9),%xmm3
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 96-128(%rcx),%xmm15
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 40(%r14),%r13
+ vpxor %xmm2,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 32(%r14),%r12
+ vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,80+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,88+8(%rsp)
+ vpxor %xmm5,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor %xmm1,%xmm6,%xmm6
+
+ vmovups 112-128(%rcx),%xmm15
+ vpslldq $8,%xmm6,%xmm5
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 16(%r11),%xmm3
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm8,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm5,%xmm4,%xmm4
+ movbeq 24(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 16(%r14),%r12
+ vpalignr $8,%xmm4,%xmm4,%xmm0
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ movq %r13,96+8(%rsp)
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r12,104+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ vmovups 128-128(%rcx),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 144-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm10,%xmm10
+ vpsrldq $8,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm11,%xmm11
+ vpxor %xmm6,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm12,%xmm12
+ vpxor %xmm0,%xmm4,%xmm4
+ movbeq 8(%r14),%r13
+ vaesenc %xmm1,%xmm13,%xmm13
+ movbeq 0(%r14),%r12
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 160-128(%rcx),%xmm1
+ cmpl $11,%r10d
+ jb L$enc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 176-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 192-128(%rcx),%xmm1
+ je L$enc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 208-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 224-128(%rcx),%xmm1
+ jmp L$enc_tail
+
+.p2align 5
+L$handle_ctr32:
+ vmovdqu (%r11),%xmm0
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm15,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm15,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpshufb %xmm0,%xmm1,%xmm1
+ jmp L$resume_ctr32
+
+.p2align 5
+L$enc_tail:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vmovdqu %xmm7,16+8(%rsp)
+ vpalignr $8,%xmm4,%xmm4,%xmm8
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ vpxor 0(%rdi),%xmm1,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 16(%rdi),%xmm1,%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 32(%rdi),%xmm1,%xmm5
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 48(%rdi),%xmm1,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 64(%rdi),%xmm1,%xmm7
+ vpxor 80(%rdi),%xmm1,%xmm3
+ vmovdqu (%r8),%xmm1
+
+ vaesenclast %xmm2,%xmm9,%xmm9
+ vmovdqu 32(%r11),%xmm2
+ vaesenclast %xmm0,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm1,%xmm0
+ movq %r13,112+8(%rsp)
+ leaq 96(%rdi),%rdi
+
+ prefetcht0 512(%rdi)
+ prefetcht0 576(%rdi)
+ vaesenclast %xmm5,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm0,%xmm5
+ movq %r12,120+8(%rsp)
+ leaq 96(%rsi),%rsi
+ vmovdqu 0-128(%rcx),%xmm15
+ vaesenclast %xmm6,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm5,%xmm6
+ vaesenclast %xmm7,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm6,%xmm7
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vpaddb %xmm2,%xmm7,%xmm3
+
+ addq $0x60,%rax
+ subq $0x6,%rdx
+ jc L$6x_done
+
+ vmovups %xmm9,-96(%rsi)
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovups %xmm10,-80(%rsi)
+ vmovdqa %xmm0,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vmovdqa %xmm5,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vmovdqa %xmm6,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vmovdqa %xmm7,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vmovdqa %xmm3,%xmm14
+ vmovdqu 32+8(%rsp),%xmm7
+ jmp L$oop6x
+
+L$6x_done:
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpxor %xmm4,%xmm8,%xmm8
+
+ ret
+
+
+.globl _aesni_gcm_decrypt
+.private_extern _aesni_gcm_decrypt
+
+.p2align 5
+_aesni_gcm_decrypt:
+
+
+_CET_ENDBR
+ xorq %rax,%rax
+
+
+
+ cmpq $0x60,%rdx
+ jb L$gcm_dec_abort
+
+ pushq %rbp
+
+
+ movq %rsp,%rbp
+
+ pushq %rbx
+
+
+ pushq %r12
+
+
+ pushq %r13
+
+
+ pushq %r14
+
+
+ pushq %r15
+
+
+ vzeroupper
+
+ movq 16(%rbp),%r12
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq L$bswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ vmovdqu (%r12),%xmm8
+ andq $-128,%rsp
+ vmovdqu (%r11),%xmm0
+ leaq 128(%rcx),%rcx
+ leaq 32(%r9),%r9
+ movl 240-128(%rcx),%r10d
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc L$dec_no_key_aliasing
+ cmpq $768,%r15
+ jnc L$dec_no_key_aliasing
+ subq %r15,%rsp
+L$dec_no_key_aliasing:
+
+ vmovdqu 80(%rdi),%xmm7
+ movq %rdi,%r14
+ vmovdqu 64(%rdi),%xmm4
+
+
+
+
+
+
+
+ leaq -192(%rdi,%rdx,1),%r15
+
+ vmovdqu 48(%rdi),%xmm5
+ shrq $4,%rdx
+ xorq %rax,%rax
+ vmovdqu 32(%rdi),%xmm6
+ vpshufb %xmm0,%xmm7,%xmm7
+ vmovdqu 16(%rdi),%xmm2
+ vpshufb %xmm0,%xmm4,%xmm4
+ vmovdqu (%rdi),%xmm3
+ vpshufb %xmm0,%xmm5,%xmm5
+ vmovdqu %xmm4,48(%rsp)
+ vpshufb %xmm0,%xmm6,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm2,%xmm2
+ vmovdqu %xmm6,80(%rsp)
+ vpshufb %xmm0,%xmm3,%xmm3
+ vmovdqu %xmm2,96(%rsp)
+ vmovdqu %xmm3,112(%rsp)
+
+ call _aesni_ctr32_ghash_6x
+
+ movq 16(%rbp),%r12
+ vmovups %xmm9,-96(%rsi)
+ vmovups %xmm10,-80(%rsi)
+ vmovups %xmm11,-64(%rsi)
+ vmovups %xmm12,-48(%rsi)
+ vmovups %xmm13,-32(%rsi)
+ vmovups %xmm14,-16(%rsi)
+
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,(%r12)
+
+ vzeroupper
+ leaq -40(%rbp),%rsp
+
+ popq %r15
+
+ popq %r14
+
+ popq %r13
+
+ popq %r12
+
+ popq %rbx
+
+ popq %rbp
+
+L$gcm_dec_abort:
+ ret
+
+
+
+
+.p2align 5
+_aesni_ctr32_6x:
+
+ vmovdqu 0-128(%rcx),%xmm4
+ vmovdqu 32(%r11),%xmm2
+ leaq -1(%r10),%r13
+ vmovups 16-128(%rcx),%xmm15
+ leaq 32-128(%rcx),%r12
+ vpxor %xmm4,%xmm1,%xmm9
+ addl $100663296,%ebx
+ jc L$handle_ctr32_2
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp L$oop_ctr32
+
+.p2align 4
+L$oop_ctr32:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+ vmovups (%r12),%xmm15
+ leaq 16(%r12),%r12
+ decl %r13d
+ jnz L$oop_ctr32
+
+ vmovdqu (%r12),%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 0(%rdi),%xmm3,%xmm4
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor 16(%rdi),%xmm3,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 32(%rdi),%xmm3,%xmm6
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 48(%rdi),%xmm3,%xmm8
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 64(%rdi),%xmm3,%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 80(%rdi),%xmm3,%xmm3
+ leaq 96(%rdi),%rdi
+
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vaesenclast %xmm5,%xmm10,%xmm10
+ vaesenclast %xmm6,%xmm11,%xmm11
+ vaesenclast %xmm8,%xmm12,%xmm12
+ vaesenclast %xmm2,%xmm13,%xmm13
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vmovups %xmm9,0(%rsi)
+ vmovups %xmm10,16(%rsi)
+ vmovups %xmm11,32(%rsi)
+ vmovups %xmm12,48(%rsi)
+ vmovups %xmm13,64(%rsi)
+ vmovups %xmm14,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ ret
+.p2align 5
+L$handle_ctr32_2:
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp L$oop_ctr32
+
+
+
+.globl _aesni_gcm_encrypt
+.private_extern _aesni_gcm_encrypt
+
+.p2align 5
+_aesni_gcm_encrypt:
+
+
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+
+ movb $1,_BORINGSSL_function_hit+2(%rip)
+#endif
+ xorq %rax,%rax
+
+
+
+
+ cmpq $288,%rdx
+ jb L$gcm_enc_abort
+
+ pushq %rbp
+
+
+ movq %rsp,%rbp
+
+ pushq %rbx
+
+
+ pushq %r12
+
+
+ pushq %r13
+
+
+ pushq %r14
+
+
+ pushq %r15
+
+
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq L$bswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ leaq 128(%rcx),%rcx
+ vmovdqu (%r11),%xmm0
+ andq $-128,%rsp
+ movl 240-128(%rcx),%r10d
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc L$enc_no_key_aliasing
+ cmpq $768,%r15
+ jnc L$enc_no_key_aliasing
+ subq %r15,%rsp
+L$enc_no_key_aliasing:
+
+ movq %rsi,%r14
+
+
+
+
+
+
+
+
+ leaq -192(%rsi,%rdx,1),%r15
+
+ shrq $4,%rdx
+
+ call _aesni_ctr32_6x
+ vpshufb %xmm0,%xmm9,%xmm8
+ vpshufb %xmm0,%xmm10,%xmm2
+ vmovdqu %xmm8,112(%rsp)
+ vpshufb %xmm0,%xmm11,%xmm4
+ vmovdqu %xmm2,96(%rsp)
+ vpshufb %xmm0,%xmm12,%xmm5
+ vmovdqu %xmm4,80(%rsp)
+ vpshufb %xmm0,%xmm13,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm14,%xmm7
+ vmovdqu %xmm6,48(%rsp)
+
+ call _aesni_ctr32_6x
+
+ movq 16(%rbp),%r12
+ leaq 32(%r9),%r9
+ vmovdqu (%r12),%xmm8
+ subq $12,%rdx
+ movq $192,%rax
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ call _aesni_ctr32_ghash_6x
+ vmovdqu 32(%rsp),%xmm7
+ vmovdqu (%r11),%xmm0
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm7,%xmm7,%xmm1
+ vmovdqu 32-32(%r9),%xmm15
+ vmovups %xmm9,-96(%rsi)
+ vpshufb %xmm0,%xmm9,%xmm9
+ vpxor %xmm7,%xmm1,%xmm1
+ vmovups %xmm10,-80(%rsi)
+ vpshufb %xmm0,%xmm10,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vpshufb %xmm0,%xmm11,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vpshufb %xmm0,%xmm12,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vpshufb %xmm0,%xmm13,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vpshufb %xmm0,%xmm14,%xmm14
+ vmovdqu %xmm9,16(%rsp)
+ vmovdqu 48(%rsp),%xmm6
+ vmovdqu 16-32(%r9),%xmm0
+ vpunpckhqdq %xmm6,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
+ vpxor %xmm6,%xmm2,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+
+ vmovdqu 64(%rsp),%xmm9
+ vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm9,%xmm9,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
+ vpxor %xmm9,%xmm5,%xmm5
+ vpxor %xmm7,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vmovdqu 80(%rsp),%xmm1
+ vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm4,%xmm7,%xmm7
+ vpunpckhqdq %xmm1,%xmm1,%xmm4
+ vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpxor %xmm6,%xmm9,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 96(%rsp),%xmm2
+ vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm7,%xmm6,%xmm6
+ vpunpckhqdq %xmm2,%xmm2,%xmm7
+ vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpxor %xmm9,%xmm1,%xmm1
+ vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm5,%xmm4,%xmm4
+
+ vpxor 112(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
+ vmovdqu 112-32(%r9),%xmm0
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm1,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
+ vpxor %xmm4,%xmm7,%xmm4
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm1
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
+ vpxor %xmm14,%xmm1,%xmm1
+ vpxor %xmm5,%xmm6,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
+ vmovdqu 32-32(%r9),%xmm15
+ vpxor %xmm2,%xmm8,%xmm7
+ vpxor %xmm4,%xmm9,%xmm6
+
+ vmovdqu 16-32(%r9),%xmm0
+ vpxor %xmm5,%xmm7,%xmm9
+ vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
+ vpxor %xmm9,%xmm6,%xmm6
+ vpunpckhqdq %xmm13,%xmm13,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
+ vpxor %xmm13,%xmm2,%xmm2
+ vpslldq $8,%xmm6,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+ vpxor %xmm9,%xmm5,%xmm8
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm12,%xmm12,%xmm9
+ vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
+ vpxor %xmm12,%xmm9,%xmm9
+ vpxor %xmm14,%xmm13,%xmm13
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm11,%xmm11,%xmm1
+ vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
+ vpxor %xmm11,%xmm1,%xmm1
+ vpxor %xmm13,%xmm12,%xmm12
+ vxorps 16(%rsp),%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm9,%xmm9
+
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm10,%xmm10,%xmm2
+ vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
+ vpxor %xmm10,%xmm2,%xmm2
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpxor %xmm12,%xmm11,%xmm11
+ vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm9,%xmm1,%xmm1
+
+ vxorps %xmm7,%xmm14,%xmm14
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
+ vmovdqu 112-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm11,%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
+ vpxor %xmm4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
+ vpxor %xmm10,%xmm7,%xmm7
+ vpxor %xmm2,%xmm6,%xmm6
+
+ vpxor %xmm5,%xmm7,%xmm4
+ vpxor %xmm4,%xmm6,%xmm6
+ vpslldq $8,%xmm6,%xmm1
+ vmovdqu 16(%r11),%xmm3
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm1,%xmm5,%xmm8
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm2,%xmm8,%xmm8
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm7,%xmm2,%xmm2
+ vpxor %xmm2,%xmm8,%xmm8
+ movq 16(%rbp),%r12
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,(%r12)
+
+ vzeroupper
+ leaq -40(%rbp),%rsp
+
+ popq %r15
+
+ popq %r14
+
+ popq %r13
+
+ popq %r12
+
+ popq %rbx
+
+ popq %rbp
+
+L$gcm_enc_abort:
+ ret
+
+
+
+.section __DATA,__const
+.p2align 6
+L$bswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+L$poly:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+L$one_msb:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+L$two_lsb:
+.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+L$one_lsb:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align 6
+.text
+#endif
diff --git a/gen/bcm/aesni-gcm-x86_64-linux.S b/gen/bcm/aesni-gcm-x86_64-linux.S
new file mode 100644
index 0000000..774a8d1
--- /dev/null
+++ b/gen/bcm/aesni-gcm-x86_64-linux.S
@@ -0,0 +1,883 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text
+
+.type _aesni_ctr32_ghash_6x,@function
+.align 32
+_aesni_ctr32_ghash_6x:
+.cfi_startproc
+ vmovdqu 32(%r11),%xmm2
+ subq $6,%rdx
+ vpxor %xmm4,%xmm4,%xmm4
+ vmovdqu 0-128(%rcx),%xmm15
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovdqu %xmm4,16+8(%rsp)
+ jmp .Loop6x
+
+.align 32
+.Loop6x:
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm15,%xmm10,%xmm10
+ vpxor %xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32:
+ vmovdqu %xmm1,(%r8)
+ vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
+ vpxor %xmm15,%xmm12,%xmm12
+ vmovups 16-128(%rcx),%xmm2
+ vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ xorq %r12,%r12
+ cmpq %r14,%r15
+
+ vaesenc %xmm2,%xmm9,%xmm9
+ vmovdqu 48+8(%rsp),%xmm0
+ vpxor %xmm15,%xmm13,%xmm13
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
+ vaesenc %xmm2,%xmm10,%xmm10
+ vpxor %xmm15,%xmm14,%xmm14
+ setnc %r12b
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vmovdqu 16-32(%r9),%xmm3
+ negq %r12
+ vaesenc %xmm2,%xmm12,%xmm12
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
+ vpxor %xmm4,%xmm8,%xmm8
+ vaesenc %xmm2,%xmm13,%xmm13
+ vpxor %xmm5,%xmm1,%xmm4
+ andq $0x60,%r12
+ vmovups 32-128(%rcx),%xmm15
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
+ vaesenc %xmm2,%xmm14,%xmm14
+
+ vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
+ leaq (%r14,%r12,1),%r14
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
+ vmovdqu 64+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 88(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 80(%r14),%r12
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,32+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,40+8(%rsp)
+ vmovdqu 48-32(%r9),%xmm5
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 48-128(%rcx),%xmm15
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
+ vmovdqu 80+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqu 64-32(%r9),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 64-128(%rcx),%xmm15
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 72(%r14),%r13
+ vpxor %xmm5,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 64(%r14),%r12
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
+ vmovdqu 96+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,48+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,56+8(%rsp)
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 96-32(%r9),%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 80-128(%rcx),%xmm15
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 56(%r14),%r13
+ vpxor %xmm1,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
+ vpxor 112+8(%rsp),%xmm8,%xmm8
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 48(%r14),%r12
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,64+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,72+8(%rsp)
+ vpxor %xmm3,%xmm4,%xmm4
+ vmovdqu 112-32(%r9),%xmm3
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 96-128(%rcx),%xmm15
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 40(%r14),%r13
+ vpxor %xmm2,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 32(%r14),%r12
+ vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,80+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,88+8(%rsp)
+ vpxor %xmm5,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor %xmm1,%xmm6,%xmm6
+
+ vmovups 112-128(%rcx),%xmm15
+ vpslldq $8,%xmm6,%xmm5
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 16(%r11),%xmm3
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm8,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm5,%xmm4,%xmm4
+ movbeq 24(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 16(%r14),%r12
+ vpalignr $8,%xmm4,%xmm4,%xmm0
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ movq %r13,96+8(%rsp)
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r12,104+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ vmovups 128-128(%rcx),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 144-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm10,%xmm10
+ vpsrldq $8,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm11,%xmm11
+ vpxor %xmm6,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm12,%xmm12
+ vpxor %xmm0,%xmm4,%xmm4
+ movbeq 8(%r14),%r13
+ vaesenc %xmm1,%xmm13,%xmm13
+ movbeq 0(%r14),%r12
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 160-128(%rcx),%xmm1
+ cmpl $11,%r10d
+ jb .Lenc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 176-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 192-128(%rcx),%xmm1
+ je .Lenc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 208-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 224-128(%rcx),%xmm1
+ jmp .Lenc_tail
+
+.align 32
+.Lhandle_ctr32:
+ vmovdqu (%r11),%xmm0
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm15,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm15,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpshufb %xmm0,%xmm1,%xmm1
+ jmp .Lresume_ctr32
+
+.align 32
+.Lenc_tail:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vmovdqu %xmm7,16+8(%rsp)
+ vpalignr $8,%xmm4,%xmm4,%xmm8
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ vpxor 0(%rdi),%xmm1,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 16(%rdi),%xmm1,%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 32(%rdi),%xmm1,%xmm5
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 48(%rdi),%xmm1,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 64(%rdi),%xmm1,%xmm7
+ vpxor 80(%rdi),%xmm1,%xmm3
+ vmovdqu (%r8),%xmm1
+
+ vaesenclast %xmm2,%xmm9,%xmm9
+ vmovdqu 32(%r11),%xmm2
+ vaesenclast %xmm0,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm1,%xmm0
+ movq %r13,112+8(%rsp)
+ leaq 96(%rdi),%rdi
+
+ prefetcht0 512(%rdi)
+ prefetcht0 576(%rdi)
+ vaesenclast %xmm5,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm0,%xmm5
+ movq %r12,120+8(%rsp)
+ leaq 96(%rsi),%rsi
+ vmovdqu 0-128(%rcx),%xmm15
+ vaesenclast %xmm6,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm5,%xmm6
+ vaesenclast %xmm7,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm6,%xmm7
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vpaddb %xmm2,%xmm7,%xmm3
+
+ addq $0x60,%rax
+ subq $0x6,%rdx
+ jc .L6x_done
+
+ vmovups %xmm9,-96(%rsi)
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovups %xmm10,-80(%rsi)
+ vmovdqa %xmm0,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vmovdqa %xmm5,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vmovdqa %xmm6,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vmovdqa %xmm7,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vmovdqa %xmm3,%xmm14
+ vmovdqu 32+8(%rsp),%xmm7
+ jmp .Loop6x
+
+.L6x_done:
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpxor %xmm4,%xmm8,%xmm8
+
+ ret
+.cfi_endproc
+.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+.globl aesni_gcm_decrypt
+.hidden aesni_gcm_decrypt
+.type aesni_gcm_decrypt,@function
+.align 32
+aesni_gcm_decrypt:
+.cfi_startproc
+
+_CET_ENDBR
+ xorq %rax,%rax
+
+
+
+ cmpq $0x60,%rdx
+ jb .Lgcm_dec_abort
+
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+
+ movq %rsp,%rbp
+.cfi_def_cfa_register %rbp
+ pushq %rbx
+.cfi_offset %rbx,-24
+
+ pushq %r12
+.cfi_offset %r12,-32
+
+ pushq %r13
+.cfi_offset %r13,-40
+
+ pushq %r14
+.cfi_offset %r14,-48
+
+ pushq %r15
+.cfi_offset %r15,-56
+
+ vzeroupper
+
+ movq 16(%rbp),%r12
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq .Lbswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ vmovdqu (%r12),%xmm8
+ andq $-128,%rsp
+ vmovdqu (%r11),%xmm0
+ leaq 128(%rcx),%rcx
+ leaq 32(%r9),%r9
+ movl 240-128(%rcx),%r10d
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Ldec_no_key_aliasing
+ cmpq $768,%r15
+ jnc .Ldec_no_key_aliasing
+ subq %r15,%rsp
+.Ldec_no_key_aliasing:
+
+ vmovdqu 80(%rdi),%xmm7
+ movq %rdi,%r14
+ vmovdqu 64(%rdi),%xmm4
+
+
+
+
+
+
+
+ leaq -192(%rdi,%rdx,1),%r15
+
+ vmovdqu 48(%rdi),%xmm5
+ shrq $4,%rdx
+ xorq %rax,%rax
+ vmovdqu 32(%rdi),%xmm6
+ vpshufb %xmm0,%xmm7,%xmm7
+ vmovdqu 16(%rdi),%xmm2
+ vpshufb %xmm0,%xmm4,%xmm4
+ vmovdqu (%rdi),%xmm3
+ vpshufb %xmm0,%xmm5,%xmm5
+ vmovdqu %xmm4,48(%rsp)
+ vpshufb %xmm0,%xmm6,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm2,%xmm2
+ vmovdqu %xmm6,80(%rsp)
+ vpshufb %xmm0,%xmm3,%xmm3
+ vmovdqu %xmm2,96(%rsp)
+ vmovdqu %xmm3,112(%rsp)
+
+ call _aesni_ctr32_ghash_6x
+
+ movq 16(%rbp),%r12
+ vmovups %xmm9,-96(%rsi)
+ vmovups %xmm10,-80(%rsi)
+ vmovups %xmm11,-64(%rsi)
+ vmovups %xmm12,-48(%rsi)
+ vmovups %xmm13,-32(%rsi)
+ vmovups %xmm14,-16(%rsi)
+
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,(%r12)
+
+ vzeroupper
+ leaq -40(%rbp),%rsp
+.cfi_def_cfa %rsp, 0x38
+ popq %r15
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r15
+ popq %r14
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r14
+ popq %r13
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r13
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ popq %rbx
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbx
+ popq %rbp
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbp
+.Lgcm_dec_abort:
+ ret
+
+.cfi_endproc
+.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
+.type _aesni_ctr32_6x,@function
+.align 32
+_aesni_ctr32_6x:
+.cfi_startproc
+ vmovdqu 0-128(%rcx),%xmm4
+ vmovdqu 32(%r11),%xmm2
+ leaq -1(%r10),%r13
+ vmovups 16-128(%rcx),%xmm15
+ leaq 32-128(%rcx),%r12
+ vpxor %xmm4,%xmm1,%xmm9
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32_2
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+
+.align 16
+.Loop_ctr32:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+ vmovups (%r12),%xmm15
+ leaq 16(%r12),%r12
+ decl %r13d
+ jnz .Loop_ctr32
+
+ vmovdqu (%r12),%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 0(%rdi),%xmm3,%xmm4
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor 16(%rdi),%xmm3,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 32(%rdi),%xmm3,%xmm6
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 48(%rdi),%xmm3,%xmm8
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 64(%rdi),%xmm3,%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 80(%rdi),%xmm3,%xmm3
+ leaq 96(%rdi),%rdi
+
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vaesenclast %xmm5,%xmm10,%xmm10
+ vaesenclast %xmm6,%xmm11,%xmm11
+ vaesenclast %xmm8,%xmm12,%xmm12
+ vaesenclast %xmm2,%xmm13,%xmm13
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vmovups %xmm9,0(%rsi)
+ vmovups %xmm10,16(%rsi)
+ vmovups %xmm11,32(%rsi)
+ vmovups %xmm12,48(%rsi)
+ vmovups %xmm13,64(%rsi)
+ vmovups %xmm14,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ ret
+.align 32
+.Lhandle_ctr32_2:
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+.cfi_endproc
+.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl aesni_gcm_encrypt
+.hidden aesni_gcm_encrypt
+.type aesni_gcm_encrypt,@function
+.align 32
+aesni_gcm_encrypt:
+.cfi_startproc
+
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+ movb $1,BORINGSSL_function_hit+2(%rip)
+#endif
+ xorq %rax,%rax
+
+
+
+
+ cmpq $288,%rdx
+ jb .Lgcm_enc_abort
+
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+
+ movq %rsp,%rbp
+.cfi_def_cfa_register %rbp
+ pushq %rbx
+.cfi_offset %rbx,-24
+
+ pushq %r12
+.cfi_offset %r12,-32
+
+ pushq %r13
+.cfi_offset %r13,-40
+
+ pushq %r14
+.cfi_offset %r14,-48
+
+ pushq %r15
+.cfi_offset %r15,-56
+
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq .Lbswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ leaq 128(%rcx),%rcx
+ vmovdqu (%r11),%xmm0
+ andq $-128,%rsp
+ movl 240-128(%rcx),%r10d
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Lenc_no_key_aliasing
+ cmpq $768,%r15
+ jnc .Lenc_no_key_aliasing
+ subq %r15,%rsp
+.Lenc_no_key_aliasing:
+
+ movq %rsi,%r14
+
+
+
+
+
+
+
+
+ leaq -192(%rsi,%rdx,1),%r15
+
+ shrq $4,%rdx
+
+ call _aesni_ctr32_6x
+ vpshufb %xmm0,%xmm9,%xmm8
+ vpshufb %xmm0,%xmm10,%xmm2
+ vmovdqu %xmm8,112(%rsp)
+ vpshufb %xmm0,%xmm11,%xmm4
+ vmovdqu %xmm2,96(%rsp)
+ vpshufb %xmm0,%xmm12,%xmm5
+ vmovdqu %xmm4,80(%rsp)
+ vpshufb %xmm0,%xmm13,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm14,%xmm7
+ vmovdqu %xmm6,48(%rsp)
+
+ call _aesni_ctr32_6x
+
+ movq 16(%rbp),%r12
+ leaq 32(%r9),%r9
+ vmovdqu (%r12),%xmm8
+ subq $12,%rdx
+ movq $192,%rax
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ call _aesni_ctr32_ghash_6x
+ vmovdqu 32(%rsp),%xmm7
+ vmovdqu (%r11),%xmm0
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm7,%xmm7,%xmm1
+ vmovdqu 32-32(%r9),%xmm15
+ vmovups %xmm9,-96(%rsi)
+ vpshufb %xmm0,%xmm9,%xmm9
+ vpxor %xmm7,%xmm1,%xmm1
+ vmovups %xmm10,-80(%rsi)
+ vpshufb %xmm0,%xmm10,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vpshufb %xmm0,%xmm11,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vpshufb %xmm0,%xmm12,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vpshufb %xmm0,%xmm13,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vpshufb %xmm0,%xmm14,%xmm14
+ vmovdqu %xmm9,16(%rsp)
+ vmovdqu 48(%rsp),%xmm6
+ vmovdqu 16-32(%r9),%xmm0
+ vpunpckhqdq %xmm6,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
+ vpxor %xmm6,%xmm2,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+
+ vmovdqu 64(%rsp),%xmm9
+ vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm9,%xmm9,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
+ vpxor %xmm9,%xmm5,%xmm5
+ vpxor %xmm7,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vmovdqu 80(%rsp),%xmm1
+ vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm4,%xmm7,%xmm7
+ vpunpckhqdq %xmm1,%xmm1,%xmm4
+ vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpxor %xmm6,%xmm9,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 96(%rsp),%xmm2
+ vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm7,%xmm6,%xmm6
+ vpunpckhqdq %xmm2,%xmm2,%xmm7
+ vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpxor %xmm9,%xmm1,%xmm1
+ vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm5,%xmm4,%xmm4
+
+ vpxor 112(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
+ vmovdqu 112-32(%r9),%xmm0
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm1,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
+ vpxor %xmm4,%xmm7,%xmm4
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm1
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
+ vpxor %xmm14,%xmm1,%xmm1
+ vpxor %xmm5,%xmm6,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
+ vmovdqu 32-32(%r9),%xmm15
+ vpxor %xmm2,%xmm8,%xmm7
+ vpxor %xmm4,%xmm9,%xmm6
+
+ vmovdqu 16-32(%r9),%xmm0
+ vpxor %xmm5,%xmm7,%xmm9
+ vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
+ vpxor %xmm9,%xmm6,%xmm6
+ vpunpckhqdq %xmm13,%xmm13,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
+ vpxor %xmm13,%xmm2,%xmm2
+ vpslldq $8,%xmm6,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+ vpxor %xmm9,%xmm5,%xmm8
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm12,%xmm12,%xmm9
+ vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
+ vpxor %xmm12,%xmm9,%xmm9
+ vpxor %xmm14,%xmm13,%xmm13
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm11,%xmm11,%xmm1
+ vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
+ vpxor %xmm11,%xmm1,%xmm1
+ vpxor %xmm13,%xmm12,%xmm12
+ vxorps 16(%rsp),%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm9,%xmm9
+
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm10,%xmm10,%xmm2
+ vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
+ vpxor %xmm10,%xmm2,%xmm2
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpxor %xmm12,%xmm11,%xmm11
+ vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm9,%xmm1,%xmm1
+
+ vxorps %xmm7,%xmm14,%xmm14
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
+ vmovdqu 112-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm11,%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
+ vpxor %xmm4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
+ vpxor %xmm10,%xmm7,%xmm7
+ vpxor %xmm2,%xmm6,%xmm6
+
+ vpxor %xmm5,%xmm7,%xmm4
+ vpxor %xmm4,%xmm6,%xmm6
+ vpslldq $8,%xmm6,%xmm1
+ vmovdqu 16(%r11),%xmm3
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm1,%xmm5,%xmm8
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm2,%xmm8,%xmm8
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm7,%xmm2,%xmm2
+ vpxor %xmm2,%xmm8,%xmm8
+ movq 16(%rbp),%r12
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,(%r12)
+
+ vzeroupper
+ leaq -40(%rbp),%rsp
+.cfi_def_cfa %rsp, 0x38
+ popq %r15
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r15
+ popq %r14
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r14
+ popq %r13
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r13
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ popq %rbx
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbx
+ popq %rbp
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbp
+.Lgcm_enc_abort:
+ ret
+
+.cfi_endproc
+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+.section .rodata
+.align 64
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+.text
+#endif
diff --git a/gen/bcm/aesni-gcm-x86_64-win.asm b/gen/bcm/aesni-gcm-x86_64-win.asm
new file mode 100644
index 0000000..d7a2665
--- /dev/null
+++ b/gen/bcm/aesni-gcm-x86_64-win.asm
@@ -0,0 +1,1101 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .text code align=64
+
+
+
+ALIGN 32
+_aesni_ctr32_ghash_6x:
+
+ vmovdqu xmm2,XMMWORD[32+r11]
+ sub r8,6
+ vpxor xmm4,xmm4,xmm4
+ vmovdqu xmm15,XMMWORD[((0-128))+r9]
+ vpaddb xmm10,xmm1,xmm2
+ vpaddb xmm11,xmm10,xmm2
+ vpaddb xmm12,xmm11,xmm2
+ vpaddb xmm13,xmm12,xmm2
+ vpaddb xmm14,xmm13,xmm2
+ vpxor xmm9,xmm1,xmm15
+ vmovdqu XMMWORD[(16+8)+rsp],xmm4
+ jmp NEAR $L$oop6x
+
+ALIGN 32
+$L$oop6x:
+ add ebx,100663296
+ jc NEAR $L$handle_ctr32
+ vmovdqu xmm3,XMMWORD[((0-32))+rsi]
+ vpaddb xmm1,xmm14,xmm2
+ vpxor xmm10,xmm10,xmm15
+ vpxor xmm11,xmm11,xmm15
+
+$L$resume_ctr32:
+ vmovdqu XMMWORD[rdi],xmm1
+ vpclmulqdq xmm5,xmm7,xmm3,0x10
+ vpxor xmm12,xmm12,xmm15
+ vmovups xmm2,XMMWORD[((16-128))+r9]
+ vpclmulqdq xmm6,xmm7,xmm3,0x01
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ xor r12,r12
+ cmp r15,r14
+
+ vaesenc xmm9,xmm9,xmm2
+ vmovdqu xmm0,XMMWORD[((48+8))+rsp]
+ vpxor xmm13,xmm13,xmm15
+ vpclmulqdq xmm1,xmm7,xmm3,0x00
+ vaesenc xmm10,xmm10,xmm2
+ vpxor xmm14,xmm14,xmm15
+ setnc r12b
+ vpclmulqdq xmm7,xmm7,xmm3,0x11
+ vaesenc xmm11,xmm11,xmm2
+ vmovdqu xmm3,XMMWORD[((16-32))+rsi]
+ neg r12
+ vaesenc xmm12,xmm12,xmm2
+ vpxor xmm6,xmm6,xmm5
+ vpclmulqdq xmm5,xmm0,xmm3,0x00
+ vpxor xmm8,xmm8,xmm4
+ vaesenc xmm13,xmm13,xmm2
+ vpxor xmm4,xmm1,xmm5
+ and r12,0x60
+ vmovups xmm15,XMMWORD[((32-128))+r9]
+ vpclmulqdq xmm1,xmm0,xmm3,0x10
+ vaesenc xmm14,xmm14,xmm2
+
+ vpclmulqdq xmm2,xmm0,xmm3,0x01
+ lea r14,[r12*1+r14]
+ vaesenc xmm9,xmm9,xmm15
+ vpxor xmm8,xmm8,XMMWORD[((16+8))+rsp]
+ vpclmulqdq xmm3,xmm0,xmm3,0x11
+ vmovdqu xmm0,XMMWORD[((64+8))+rsp]
+ vaesenc xmm10,xmm10,xmm15
+ movbe r13,QWORD[88+r14]
+ vaesenc xmm11,xmm11,xmm15
+ movbe r12,QWORD[80+r14]
+ vaesenc xmm12,xmm12,xmm15
+ mov QWORD[((32+8))+rsp],r13
+ vaesenc xmm13,xmm13,xmm15
+ mov QWORD[((40+8))+rsp],r12
+ vmovdqu xmm5,XMMWORD[((48-32))+rsi]
+ vaesenc xmm14,xmm14,xmm15
+
+ vmovups xmm15,XMMWORD[((48-128))+r9]
+ vpxor xmm6,xmm6,xmm1
+ vpclmulqdq xmm1,xmm0,xmm5,0x00
+ vaesenc xmm9,xmm9,xmm15
+ vpxor xmm6,xmm6,xmm2
+ vpclmulqdq xmm2,xmm0,xmm5,0x10
+ vaesenc xmm10,xmm10,xmm15
+ vpxor xmm7,xmm7,xmm3
+ vpclmulqdq xmm3,xmm0,xmm5,0x01
+ vaesenc xmm11,xmm11,xmm15
+ vpclmulqdq xmm5,xmm0,xmm5,0x11
+ vmovdqu xmm0,XMMWORD[((80+8))+rsp]
+ vaesenc xmm12,xmm12,xmm15
+ vaesenc xmm13,xmm13,xmm15
+ vpxor xmm4,xmm4,xmm1
+ vmovdqu xmm1,XMMWORD[((64-32))+rsi]
+ vaesenc xmm14,xmm14,xmm15
+
+ vmovups xmm15,XMMWORD[((64-128))+r9]
+ vpxor xmm6,xmm6,xmm2
+ vpclmulqdq xmm2,xmm0,xmm1,0x00
+ vaesenc xmm9,xmm9,xmm15
+ vpxor xmm6,xmm6,xmm3
+ vpclmulqdq xmm3,xmm0,xmm1,0x10
+ vaesenc xmm10,xmm10,xmm15
+ movbe r13,QWORD[72+r14]
+ vpxor xmm7,xmm7,xmm5
+ vpclmulqdq xmm5,xmm0,xmm1,0x01
+ vaesenc xmm11,xmm11,xmm15
+ movbe r12,QWORD[64+r14]
+ vpclmulqdq xmm1,xmm0,xmm1,0x11
+ vmovdqu xmm0,XMMWORD[((96+8))+rsp]
+ vaesenc xmm12,xmm12,xmm15
+ mov QWORD[((48+8))+rsp],r13
+ vaesenc xmm13,xmm13,xmm15
+ mov QWORD[((56+8))+rsp],r12
+ vpxor xmm4,xmm4,xmm2
+ vmovdqu xmm2,XMMWORD[((96-32))+rsi]
+ vaesenc xmm14,xmm14,xmm15
+
+ vmovups xmm15,XMMWORD[((80-128))+r9]
+ vpxor xmm6,xmm6,xmm3
+ vpclmulqdq xmm3,xmm0,xmm2,0x00
+ vaesenc xmm9,xmm9,xmm15
+ vpxor xmm6,xmm6,xmm5
+ vpclmulqdq xmm5,xmm0,xmm2,0x10
+ vaesenc xmm10,xmm10,xmm15
+ movbe r13,QWORD[56+r14]
+ vpxor xmm7,xmm7,xmm1
+ vpclmulqdq xmm1,xmm0,xmm2,0x01
+ vpxor xmm8,xmm8,XMMWORD[((112+8))+rsp]
+ vaesenc xmm11,xmm11,xmm15
+ movbe r12,QWORD[48+r14]
+ vpclmulqdq xmm2,xmm0,xmm2,0x11
+ vaesenc xmm12,xmm12,xmm15
+ mov QWORD[((64+8))+rsp],r13
+ vaesenc xmm13,xmm13,xmm15
+ mov QWORD[((72+8))+rsp],r12
+ vpxor xmm4,xmm4,xmm3
+ vmovdqu xmm3,XMMWORD[((112-32))+rsi]
+ vaesenc xmm14,xmm14,xmm15
+
+ vmovups xmm15,XMMWORD[((96-128))+r9]
+ vpxor xmm6,xmm6,xmm5
+ vpclmulqdq xmm5,xmm8,xmm3,0x10
+ vaesenc xmm9,xmm9,xmm15
+ vpxor xmm6,xmm6,xmm1
+ vpclmulqdq xmm1,xmm8,xmm3,0x01
+ vaesenc xmm10,xmm10,xmm15
+ movbe r13,QWORD[40+r14]
+ vpxor xmm7,xmm7,xmm2
+ vpclmulqdq xmm2,xmm8,xmm3,0x00
+ vaesenc xmm11,xmm11,xmm15
+ movbe r12,QWORD[32+r14]
+ vpclmulqdq xmm8,xmm8,xmm3,0x11
+ vaesenc xmm12,xmm12,xmm15
+ mov QWORD[((80+8))+rsp],r13
+ vaesenc xmm13,xmm13,xmm15
+ mov QWORD[((88+8))+rsp],r12
+ vpxor xmm6,xmm6,xmm5
+ vaesenc xmm14,xmm14,xmm15
+ vpxor xmm6,xmm6,xmm1
+
+ vmovups xmm15,XMMWORD[((112-128))+r9]
+ vpslldq xmm5,xmm6,8
+ vpxor xmm4,xmm4,xmm2
+ vmovdqu xmm3,XMMWORD[16+r11]
+
+ vaesenc xmm9,xmm9,xmm15
+ vpxor xmm7,xmm7,xmm8
+ vaesenc xmm10,xmm10,xmm15
+ vpxor xmm4,xmm4,xmm5
+ movbe r13,QWORD[24+r14]
+ vaesenc xmm11,xmm11,xmm15
+ movbe r12,QWORD[16+r14]
+ vpalignr xmm0,xmm4,xmm4,8
+ vpclmulqdq xmm4,xmm4,xmm3,0x10
+ mov QWORD[((96+8))+rsp],r13
+ vaesenc xmm12,xmm12,xmm15
+ mov QWORD[((104+8))+rsp],r12
+ vaesenc xmm13,xmm13,xmm15
+ vmovups xmm1,XMMWORD[((128-128))+r9]
+ vaesenc xmm14,xmm14,xmm15
+
+ vaesenc xmm9,xmm9,xmm1
+ vmovups xmm15,XMMWORD[((144-128))+r9]
+ vaesenc xmm10,xmm10,xmm1
+ vpsrldq xmm6,xmm6,8
+ vaesenc xmm11,xmm11,xmm1
+ vpxor xmm7,xmm7,xmm6
+ vaesenc xmm12,xmm12,xmm1
+ vpxor xmm4,xmm4,xmm0
+ movbe r13,QWORD[8+r14]
+ vaesenc xmm13,xmm13,xmm1
+ movbe r12,QWORD[r14]
+ vaesenc xmm14,xmm14,xmm1
+ vmovups xmm1,XMMWORD[((160-128))+r9]
+ cmp r10d,11
+ jb NEAR $L$enc_tail
+
+ vaesenc xmm9,xmm9,xmm15
+ vaesenc xmm10,xmm10,xmm15
+ vaesenc xmm11,xmm11,xmm15
+ vaesenc xmm12,xmm12,xmm15
+ vaesenc xmm13,xmm13,xmm15
+ vaesenc xmm14,xmm14,xmm15
+
+ vaesenc xmm9,xmm9,xmm1
+ vaesenc xmm10,xmm10,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+ vaesenc xmm13,xmm13,xmm1
+ vmovups xmm15,XMMWORD[((176-128))+r9]
+ vaesenc xmm14,xmm14,xmm1
+ vmovups xmm1,XMMWORD[((192-128))+r9]
+ je NEAR $L$enc_tail
+
+ vaesenc xmm9,xmm9,xmm15
+ vaesenc xmm10,xmm10,xmm15
+ vaesenc xmm11,xmm11,xmm15
+ vaesenc xmm12,xmm12,xmm15
+ vaesenc xmm13,xmm13,xmm15
+ vaesenc xmm14,xmm14,xmm15
+
+ vaesenc xmm9,xmm9,xmm1
+ vaesenc xmm10,xmm10,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+ vaesenc xmm13,xmm13,xmm1
+ vmovups xmm15,XMMWORD[((208-128))+r9]
+ vaesenc xmm14,xmm14,xmm1
+ vmovups xmm1,XMMWORD[((224-128))+r9]
+ jmp NEAR $L$enc_tail
+
+ALIGN 32
+$L$handle_ctr32:
+ vmovdqu xmm0,XMMWORD[r11]
+ vpshufb xmm6,xmm1,xmm0
+ vmovdqu xmm5,XMMWORD[48+r11]
+ vpaddd xmm10,xmm6,XMMWORD[64+r11]
+ vpaddd xmm11,xmm6,xmm5
+ vmovdqu xmm3,XMMWORD[((0-32))+rsi]
+ vpaddd xmm12,xmm10,xmm5
+ vpshufb xmm10,xmm10,xmm0
+ vpaddd xmm13,xmm11,xmm5
+ vpshufb xmm11,xmm11,xmm0
+ vpxor xmm10,xmm10,xmm15
+ vpaddd xmm14,xmm12,xmm5
+ vpshufb xmm12,xmm12,xmm0
+ vpxor xmm11,xmm11,xmm15
+ vpaddd xmm1,xmm13,xmm5
+ vpshufb xmm13,xmm13,xmm0
+ vpshufb xmm14,xmm14,xmm0
+ vpshufb xmm1,xmm1,xmm0
+ jmp NEAR $L$resume_ctr32
+
+ALIGN 32
+$L$enc_tail:
+ vaesenc xmm9,xmm9,xmm15
+ vmovdqu XMMWORD[(16+8)+rsp],xmm7
+ vpalignr xmm8,xmm4,xmm4,8
+ vaesenc xmm10,xmm10,xmm15
+ vpclmulqdq xmm4,xmm4,xmm3,0x10
+ vpxor xmm2,xmm1,XMMWORD[rcx]
+ vaesenc xmm11,xmm11,xmm15
+ vpxor xmm0,xmm1,XMMWORD[16+rcx]
+ vaesenc xmm12,xmm12,xmm15
+ vpxor xmm5,xmm1,XMMWORD[32+rcx]
+ vaesenc xmm13,xmm13,xmm15
+ vpxor xmm6,xmm1,XMMWORD[48+rcx]
+ vaesenc xmm14,xmm14,xmm15
+ vpxor xmm7,xmm1,XMMWORD[64+rcx]
+ vpxor xmm3,xmm1,XMMWORD[80+rcx]
+ vmovdqu xmm1,XMMWORD[rdi]
+
+ vaesenclast xmm9,xmm9,xmm2
+ vmovdqu xmm2,XMMWORD[32+r11]
+ vaesenclast xmm10,xmm10,xmm0
+ vpaddb xmm0,xmm1,xmm2
+ mov QWORD[((112+8))+rsp],r13
+ lea rcx,[96+rcx]
+
+ prefetcht0 [512+rcx]
+ prefetcht0 [576+rcx]
+ vaesenclast xmm11,xmm11,xmm5
+ vpaddb xmm5,xmm0,xmm2
+ mov QWORD[((120+8))+rsp],r12
+ lea rdx,[96+rdx]
+ vmovdqu xmm15,XMMWORD[((0-128))+r9]
+ vaesenclast xmm12,xmm12,xmm6
+ vpaddb xmm6,xmm5,xmm2
+ vaesenclast xmm13,xmm13,xmm7
+ vpaddb xmm7,xmm6,xmm2
+ vaesenclast xmm14,xmm14,xmm3
+ vpaddb xmm3,xmm7,xmm2
+
+ add rax,0x60
+ sub r8,0x6
+ jc NEAR $L$6x_done
+
+ vmovups XMMWORD[(-96)+rdx],xmm9
+ vpxor xmm9,xmm1,xmm15
+ vmovups XMMWORD[(-80)+rdx],xmm10
+ vmovdqa xmm10,xmm0
+ vmovups XMMWORD[(-64)+rdx],xmm11
+ vmovdqa xmm11,xmm5
+ vmovups XMMWORD[(-48)+rdx],xmm12
+ vmovdqa xmm12,xmm6
+ vmovups XMMWORD[(-32)+rdx],xmm13
+ vmovdqa xmm13,xmm7
+ vmovups XMMWORD[(-16)+rdx],xmm14
+ vmovdqa xmm14,xmm3
+ vmovdqu xmm7,XMMWORD[((32+8))+rsp]
+ jmp NEAR $L$oop6x
+
+$L$6x_done:
+ vpxor xmm8,xmm8,XMMWORD[((16+8))+rsp]
+ vpxor xmm8,xmm8,xmm4
+
+ ret
+
+
+global aesni_gcm_decrypt
+
+ALIGN 32
+aesni_gcm_decrypt:
+
+$L$SEH_begin_aesni_gcm_decrypt_1:
+_CET_ENDBR
+ xor rax,rax
+
+
+
+ cmp r8,0x60
+ jb NEAR $L$gcm_dec_abort
+
+ push rbp
+
+$L$SEH_prolog_aesni_gcm_decrypt_2:
+ mov rbp,rsp
+
+ push rbx
+
+$L$SEH_prolog_aesni_gcm_decrypt_3:
+ push r12
+
+$L$SEH_prolog_aesni_gcm_decrypt_4:
+ push r13
+
+$L$SEH_prolog_aesni_gcm_decrypt_5:
+ push r14
+
+$L$SEH_prolog_aesni_gcm_decrypt_6:
+ push r15
+
+$L$SEH_prolog_aesni_gcm_decrypt_7:
+ lea rsp,[((-168))+rsp]
+$L$SEH_prolog_aesni_gcm_decrypt_8:
+$L$SEH_prolog_aesni_gcm_decrypt_9:
+
+
+
+ mov QWORD[16+rbp],rdi
+$L$SEH_prolog_aesni_gcm_decrypt_10:
+ mov QWORD[24+rbp],rsi
+$L$SEH_prolog_aesni_gcm_decrypt_11:
+ mov rdi,QWORD[48+rbp]
+ mov rsi,QWORD[56+rbp]
+
+ movaps XMMWORD[(-208)+rbp],xmm6
+$L$SEH_prolog_aesni_gcm_decrypt_12:
+ movaps XMMWORD[(-192)+rbp],xmm7
+$L$SEH_prolog_aesni_gcm_decrypt_13:
+ movaps XMMWORD[(-176)+rbp],xmm8
+$L$SEH_prolog_aesni_gcm_decrypt_14:
+ movaps XMMWORD[(-160)+rbp],xmm9
+$L$SEH_prolog_aesni_gcm_decrypt_15:
+ movaps XMMWORD[(-144)+rbp],xmm10
+$L$SEH_prolog_aesni_gcm_decrypt_16:
+ movaps XMMWORD[(-128)+rbp],xmm11
+$L$SEH_prolog_aesni_gcm_decrypt_17:
+ movaps XMMWORD[(-112)+rbp],xmm12
+$L$SEH_prolog_aesni_gcm_decrypt_18:
+ movaps XMMWORD[(-96)+rbp],xmm13
+$L$SEH_prolog_aesni_gcm_decrypt_19:
+ movaps XMMWORD[(-80)+rbp],xmm14
+$L$SEH_prolog_aesni_gcm_decrypt_20:
+ movaps XMMWORD[(-64)+rbp],xmm15
+$L$SEH_prolog_aesni_gcm_decrypt_21:
+ vzeroupper
+
+ mov r12,QWORD[64+rbp]
+ vmovdqu xmm1,XMMWORD[rdi]
+ add rsp,-128
+ mov ebx,DWORD[12+rdi]
+ lea r11,[$L$bswap_mask]
+ lea r14,[((-128))+r9]
+ mov r15,0xf80
+ vmovdqu xmm8,XMMWORD[r12]
+ and rsp,-128
+ vmovdqu xmm0,XMMWORD[r11]
+ lea r9,[128+r9]
+ lea rsi,[32+rsi]
+ mov r10d,DWORD[((240-128))+r9]
+ vpshufb xmm8,xmm8,xmm0
+
+ and r14,r15
+ and r15,rsp
+ sub r15,r14
+ jc NEAR $L$dec_no_key_aliasing
+ cmp r15,768
+ jnc NEAR $L$dec_no_key_aliasing
+ sub rsp,r15
+$L$dec_no_key_aliasing:
+
+ vmovdqu xmm7,XMMWORD[80+rcx]
+ mov r14,rcx
+ vmovdqu xmm4,XMMWORD[64+rcx]
+
+
+
+
+
+
+
+ lea r15,[((-192))+r8*1+rcx]
+
+ vmovdqu xmm5,XMMWORD[48+rcx]
+ shr r8,4
+ xor rax,rax
+ vmovdqu xmm6,XMMWORD[32+rcx]
+ vpshufb xmm7,xmm7,xmm0
+ vmovdqu xmm2,XMMWORD[16+rcx]
+ vpshufb xmm4,xmm4,xmm0
+ vmovdqu xmm3,XMMWORD[rcx]
+ vpshufb xmm5,xmm5,xmm0
+ vmovdqu XMMWORD[48+rsp],xmm4
+ vpshufb xmm6,xmm6,xmm0
+ vmovdqu XMMWORD[64+rsp],xmm5
+ vpshufb xmm2,xmm2,xmm0
+ vmovdqu XMMWORD[80+rsp],xmm6
+ vpshufb xmm3,xmm3,xmm0
+ vmovdqu XMMWORD[96+rsp],xmm2
+ vmovdqu XMMWORD[112+rsp],xmm3
+
+ call _aesni_ctr32_ghash_6x
+
+ mov r12,QWORD[64+rbp]
+ vmovups XMMWORD[(-96)+rdx],xmm9
+ vmovups XMMWORD[(-80)+rdx],xmm10
+ vmovups XMMWORD[(-64)+rdx],xmm11
+ vmovups XMMWORD[(-48)+rdx],xmm12
+ vmovups XMMWORD[(-32)+rdx],xmm13
+ vmovups XMMWORD[(-16)+rdx],xmm14
+
+ vpshufb xmm8,xmm8,XMMWORD[r11]
+ vmovdqu XMMWORD[r12],xmm8
+
+ vzeroupper
+ movaps xmm6,XMMWORD[((-208))+rbp]
+ movaps xmm7,XMMWORD[((-192))+rbp]
+ movaps xmm8,XMMWORD[((-176))+rbp]
+ movaps xmm9,XMMWORD[((-160))+rbp]
+ movaps xmm10,XMMWORD[((-144))+rbp]
+ movaps xmm11,XMMWORD[((-128))+rbp]
+ movaps xmm12,XMMWORD[((-112))+rbp]
+ movaps xmm13,XMMWORD[((-96))+rbp]
+ movaps xmm14,XMMWORD[((-80))+rbp]
+ movaps xmm15,XMMWORD[((-64))+rbp]
+ mov rdi,QWORD[16+rbp]
+ mov rsi,QWORD[24+rbp]
+ lea rsp,[((-40))+rbp]
+
+ pop r15
+
+ pop r14
+
+ pop r13
+
+ pop r12
+
+ pop rbx
+
+ pop rbp
+
+$L$gcm_dec_abort:
+ ret
+$L$SEH_end_aesni_gcm_decrypt_22:
+
+
+
+ALIGN 32
+_aesni_ctr32_6x:
+
+ vmovdqu xmm4,XMMWORD[((0-128))+r9]
+ vmovdqu xmm2,XMMWORD[32+r11]
+ lea r13,[((-1))+r10]
+ vmovups xmm15,XMMWORD[((16-128))+r9]
+ lea r12,[((32-128))+r9]
+ vpxor xmm9,xmm1,xmm4
+ add ebx,100663296
+ jc NEAR $L$handle_ctr32_2
+ vpaddb xmm10,xmm1,xmm2
+ vpaddb xmm11,xmm10,xmm2
+ vpxor xmm10,xmm10,xmm4
+ vpaddb xmm12,xmm11,xmm2
+ vpxor xmm11,xmm11,xmm4
+ vpaddb xmm13,xmm12,xmm2
+ vpxor xmm12,xmm12,xmm4
+ vpaddb xmm14,xmm13,xmm2
+ vpxor xmm13,xmm13,xmm4
+ vpaddb xmm1,xmm14,xmm2
+ vpxor xmm14,xmm14,xmm4
+ jmp NEAR $L$oop_ctr32
+
+ALIGN 16
+$L$oop_ctr32:
+ vaesenc xmm9,xmm9,xmm15
+ vaesenc xmm10,xmm10,xmm15
+ vaesenc xmm11,xmm11,xmm15
+ vaesenc xmm12,xmm12,xmm15
+ vaesenc xmm13,xmm13,xmm15
+ vaesenc xmm14,xmm14,xmm15
+ vmovups xmm15,XMMWORD[r12]
+ lea r12,[16+r12]
+ dec r13d
+ jnz NEAR $L$oop_ctr32
+
+ vmovdqu xmm3,XMMWORD[r12]
+ vaesenc xmm9,xmm9,xmm15
+ vpxor xmm4,xmm3,XMMWORD[rcx]
+ vaesenc xmm10,xmm10,xmm15
+ vpxor xmm5,xmm3,XMMWORD[16+rcx]
+ vaesenc xmm11,xmm11,xmm15
+ vpxor xmm6,xmm3,XMMWORD[32+rcx]
+ vaesenc xmm12,xmm12,xmm15
+ vpxor xmm8,xmm3,XMMWORD[48+rcx]
+ vaesenc xmm13,xmm13,xmm15
+ vpxor xmm2,xmm3,XMMWORD[64+rcx]
+ vaesenc xmm14,xmm14,xmm15
+ vpxor xmm3,xmm3,XMMWORD[80+rcx]
+ lea rcx,[96+rcx]
+
+ vaesenclast xmm9,xmm9,xmm4
+ vaesenclast xmm10,xmm10,xmm5
+ vaesenclast xmm11,xmm11,xmm6
+ vaesenclast xmm12,xmm12,xmm8
+ vaesenclast xmm13,xmm13,xmm2
+ vaesenclast xmm14,xmm14,xmm3
+ vmovups XMMWORD[rdx],xmm9
+ vmovups XMMWORD[16+rdx],xmm10
+ vmovups XMMWORD[32+rdx],xmm11
+ vmovups XMMWORD[48+rdx],xmm12
+ vmovups XMMWORD[64+rdx],xmm13
+ vmovups XMMWORD[80+rdx],xmm14
+ lea rdx,[96+rdx]
+
+ ret
+ALIGN 32
+$L$handle_ctr32_2:
+ vpshufb xmm6,xmm1,xmm0
+ vmovdqu xmm5,XMMWORD[48+r11]
+ vpaddd xmm10,xmm6,XMMWORD[64+r11]
+ vpaddd xmm11,xmm6,xmm5
+ vpaddd xmm12,xmm10,xmm5
+ vpshufb xmm10,xmm10,xmm0
+ vpaddd xmm13,xmm11,xmm5
+ vpshufb xmm11,xmm11,xmm0
+ vpxor xmm10,xmm10,xmm4
+ vpaddd xmm14,xmm12,xmm5
+ vpshufb xmm12,xmm12,xmm0
+ vpxor xmm11,xmm11,xmm4
+ vpaddd xmm1,xmm13,xmm5
+ vpshufb xmm13,xmm13,xmm0
+ vpxor xmm12,xmm12,xmm4
+ vpshufb xmm14,xmm14,xmm0
+ vpxor xmm13,xmm13,xmm4
+ vpshufb xmm1,xmm1,xmm0
+ vpxor xmm14,xmm14,xmm4
+ jmp NEAR $L$oop_ctr32
+
+
+
+global aesni_gcm_encrypt
+
+ALIGN 32
+aesni_gcm_encrypt:
+
+$L$SEH_begin_aesni_gcm_encrypt_1:
+_CET_ENDBR
+%ifdef BORINGSSL_DISPATCH_TEST
+EXTERN BORINGSSL_function_hit
+ mov BYTE[((BORINGSSL_function_hit+2))],1
+%endif
+ xor rax,rax
+
+
+
+
+ cmp r8,0x60*3
+ jb NEAR $L$gcm_enc_abort
+
+ push rbp
+
+$L$SEH_prolog_aesni_gcm_encrypt_2:
+ mov rbp,rsp
+
+ push rbx
+
+$L$SEH_prolog_aesni_gcm_encrypt_3:
+ push r12
+
+$L$SEH_prolog_aesni_gcm_encrypt_4:
+ push r13
+
+$L$SEH_prolog_aesni_gcm_encrypt_5:
+ push r14
+
+$L$SEH_prolog_aesni_gcm_encrypt_6:
+ push r15
+
+$L$SEH_prolog_aesni_gcm_encrypt_7:
+ lea rsp,[((-168))+rsp]
+$L$SEH_prolog_aesni_gcm_encrypt_8:
+$L$SEH_prolog_aesni_gcm_encrypt_9:
+
+
+
+ mov QWORD[16+rbp],rdi
+$L$SEH_prolog_aesni_gcm_encrypt_10:
+ mov QWORD[24+rbp],rsi
+$L$SEH_prolog_aesni_gcm_encrypt_11:
+ mov rdi,QWORD[48+rbp]
+ mov rsi,QWORD[56+rbp]
+
+ movaps XMMWORD[(-208)+rbp],xmm6
+$L$SEH_prolog_aesni_gcm_encrypt_12:
+ movaps XMMWORD[(-192)+rbp],xmm7
+$L$SEH_prolog_aesni_gcm_encrypt_13:
+ movaps XMMWORD[(-176)+rbp],xmm8
+$L$SEH_prolog_aesni_gcm_encrypt_14:
+ movaps XMMWORD[(-160)+rbp],xmm9
+$L$SEH_prolog_aesni_gcm_encrypt_15:
+ movaps XMMWORD[(-144)+rbp],xmm10
+$L$SEH_prolog_aesni_gcm_encrypt_16:
+ movaps XMMWORD[(-128)+rbp],xmm11
+$L$SEH_prolog_aesni_gcm_encrypt_17:
+ movaps XMMWORD[(-112)+rbp],xmm12
+$L$SEH_prolog_aesni_gcm_encrypt_18:
+ movaps XMMWORD[(-96)+rbp],xmm13
+$L$SEH_prolog_aesni_gcm_encrypt_19:
+ movaps XMMWORD[(-80)+rbp],xmm14
+$L$SEH_prolog_aesni_gcm_encrypt_20:
+ movaps XMMWORD[(-64)+rbp],xmm15
+$L$SEH_prolog_aesni_gcm_encrypt_21:
+ vzeroupper
+
+ vmovdqu xmm1,XMMWORD[rdi]
+ add rsp,-128
+ mov ebx,DWORD[12+rdi]
+ lea r11,[$L$bswap_mask]
+ lea r14,[((-128))+r9]
+ mov r15,0xf80
+ lea r9,[128+r9]
+ vmovdqu xmm0,XMMWORD[r11]
+ and rsp,-128
+ mov r10d,DWORD[((240-128))+r9]
+
+ and r14,r15
+ and r15,rsp
+ sub r15,r14
+ jc NEAR $L$enc_no_key_aliasing
+ cmp r15,768
+ jnc NEAR $L$enc_no_key_aliasing
+ sub rsp,r15
+$L$enc_no_key_aliasing:
+
+ mov r14,rdx
+
+
+
+
+
+
+
+
+ lea r15,[((-192))+r8*1+rdx]
+
+ shr r8,4
+
+ call _aesni_ctr32_6x
+ vpshufb xmm8,xmm9,xmm0
+ vpshufb xmm2,xmm10,xmm0
+ vmovdqu XMMWORD[112+rsp],xmm8
+ vpshufb xmm4,xmm11,xmm0
+ vmovdqu XMMWORD[96+rsp],xmm2
+ vpshufb xmm5,xmm12,xmm0
+ vmovdqu XMMWORD[80+rsp],xmm4
+ vpshufb xmm6,xmm13,xmm0
+ vmovdqu XMMWORD[64+rsp],xmm5
+ vpshufb xmm7,xmm14,xmm0
+ vmovdqu XMMWORD[48+rsp],xmm6
+
+ call _aesni_ctr32_6x
+
+ mov r12,QWORD[64+rbp]
+ lea rsi,[32+rsi]
+ vmovdqu xmm8,XMMWORD[r12]
+ sub r8,12
+ mov rax,0x60*2
+ vpshufb xmm8,xmm8,xmm0
+
+ call _aesni_ctr32_ghash_6x
+ vmovdqu xmm7,XMMWORD[32+rsp]
+ vmovdqu xmm0,XMMWORD[r11]
+ vmovdqu xmm3,XMMWORD[((0-32))+rsi]
+ vpunpckhqdq xmm1,xmm7,xmm7
+ vmovdqu xmm15,XMMWORD[((32-32))+rsi]
+ vmovups XMMWORD[(-96)+rdx],xmm9
+ vpshufb xmm9,xmm9,xmm0
+ vpxor xmm1,xmm1,xmm7
+ vmovups XMMWORD[(-80)+rdx],xmm10
+ vpshufb xmm10,xmm10,xmm0
+ vmovups XMMWORD[(-64)+rdx],xmm11
+ vpshufb xmm11,xmm11,xmm0
+ vmovups XMMWORD[(-48)+rdx],xmm12
+ vpshufb xmm12,xmm12,xmm0
+ vmovups XMMWORD[(-32)+rdx],xmm13
+ vpshufb xmm13,xmm13,xmm0
+ vmovups XMMWORD[(-16)+rdx],xmm14
+ vpshufb xmm14,xmm14,xmm0
+ vmovdqu XMMWORD[16+rsp],xmm9
+ vmovdqu xmm6,XMMWORD[48+rsp]
+ vmovdqu xmm0,XMMWORD[((16-32))+rsi]
+ vpunpckhqdq xmm2,xmm6,xmm6
+ vpclmulqdq xmm5,xmm7,xmm3,0x00
+ vpxor xmm2,xmm2,xmm6
+ vpclmulqdq xmm7,xmm7,xmm3,0x11
+ vpclmulqdq xmm1,xmm1,xmm15,0x00
+
+ vmovdqu xmm9,XMMWORD[64+rsp]
+ vpclmulqdq xmm4,xmm6,xmm0,0x00
+ vmovdqu xmm3,XMMWORD[((48-32))+rsi]
+ vpxor xmm4,xmm4,xmm5
+ vpunpckhqdq xmm5,xmm9,xmm9
+ vpclmulqdq xmm6,xmm6,xmm0,0x11
+ vpxor xmm5,xmm5,xmm9
+ vpxor xmm6,xmm6,xmm7
+ vpclmulqdq xmm2,xmm2,xmm15,0x10
+ vmovdqu xmm15,XMMWORD[((80-32))+rsi]
+ vpxor xmm2,xmm2,xmm1
+
+ vmovdqu xmm1,XMMWORD[80+rsp]
+ vpclmulqdq xmm7,xmm9,xmm3,0x00
+ vmovdqu xmm0,XMMWORD[((64-32))+rsi]
+ vpxor xmm7,xmm7,xmm4
+ vpunpckhqdq xmm4,xmm1,xmm1
+ vpclmulqdq xmm9,xmm9,xmm3,0x11
+ vpxor xmm4,xmm4,xmm1
+ vpxor xmm9,xmm9,xmm6
+ vpclmulqdq xmm5,xmm5,xmm15,0x00
+ vpxor xmm5,xmm5,xmm2
+
+ vmovdqu xmm2,XMMWORD[96+rsp]
+ vpclmulqdq xmm6,xmm1,xmm0,0x00
+ vmovdqu xmm3,XMMWORD[((96-32))+rsi]
+ vpxor xmm6,xmm6,xmm7
+ vpunpckhqdq xmm7,xmm2,xmm2
+ vpclmulqdq xmm1,xmm1,xmm0,0x11
+ vpxor xmm7,xmm7,xmm2
+ vpxor xmm1,xmm1,xmm9
+ vpclmulqdq xmm4,xmm4,xmm15,0x10
+ vmovdqu xmm15,XMMWORD[((128-32))+rsi]
+ vpxor xmm4,xmm4,xmm5
+
+ vpxor xmm8,xmm8,XMMWORD[112+rsp]
+ vpclmulqdq xmm5,xmm2,xmm3,0x00
+ vmovdqu xmm0,XMMWORD[((112-32))+rsi]
+ vpunpckhqdq xmm9,xmm8,xmm8
+ vpxor xmm5,xmm5,xmm6
+ vpclmulqdq xmm2,xmm2,xmm3,0x11
+ vpxor xmm9,xmm9,xmm8
+ vpxor xmm2,xmm2,xmm1
+ vpclmulqdq xmm7,xmm7,xmm15,0x00
+ vpxor xmm4,xmm7,xmm4
+
+ vpclmulqdq xmm6,xmm8,xmm0,0x00
+ vmovdqu xmm3,XMMWORD[((0-32))+rsi]
+ vpunpckhqdq xmm1,xmm14,xmm14
+ vpclmulqdq xmm8,xmm8,xmm0,0x11
+ vpxor xmm1,xmm1,xmm14
+ vpxor xmm5,xmm6,xmm5
+ vpclmulqdq xmm9,xmm9,xmm15,0x10
+ vmovdqu xmm15,XMMWORD[((32-32))+rsi]
+ vpxor xmm7,xmm8,xmm2
+ vpxor xmm6,xmm9,xmm4
+
+ vmovdqu xmm0,XMMWORD[((16-32))+rsi]
+ vpxor xmm9,xmm7,xmm5
+ vpclmulqdq xmm4,xmm14,xmm3,0x00
+ vpxor xmm6,xmm6,xmm9
+ vpunpckhqdq xmm2,xmm13,xmm13
+ vpclmulqdq xmm14,xmm14,xmm3,0x11
+ vpxor xmm2,xmm2,xmm13
+ vpslldq xmm9,xmm6,8
+ vpclmulqdq xmm1,xmm1,xmm15,0x00
+ vpxor xmm8,xmm5,xmm9
+ vpsrldq xmm6,xmm6,8
+ vpxor xmm7,xmm7,xmm6
+
+ vpclmulqdq xmm5,xmm13,xmm0,0x00
+ vmovdqu xmm3,XMMWORD[((48-32))+rsi]
+ vpxor xmm5,xmm5,xmm4
+ vpunpckhqdq xmm9,xmm12,xmm12
+ vpclmulqdq xmm13,xmm13,xmm0,0x11
+ vpxor xmm9,xmm9,xmm12
+ vpxor xmm13,xmm13,xmm14
+ vpalignr xmm14,xmm8,xmm8,8
+ vpclmulqdq xmm2,xmm2,xmm15,0x10
+ vmovdqu xmm15,XMMWORD[((80-32))+rsi]
+ vpxor xmm2,xmm2,xmm1
+
+ vpclmulqdq xmm4,xmm12,xmm3,0x00
+ vmovdqu xmm0,XMMWORD[((64-32))+rsi]
+ vpxor xmm4,xmm4,xmm5
+ vpunpckhqdq xmm1,xmm11,xmm11
+ vpclmulqdq xmm12,xmm12,xmm3,0x11
+ vpxor xmm1,xmm1,xmm11
+ vpxor xmm12,xmm12,xmm13
+ vxorps xmm7,xmm7,XMMWORD[16+rsp]
+ vpclmulqdq xmm9,xmm9,xmm15,0x00
+ vpxor xmm9,xmm9,xmm2
+
+ vpclmulqdq xmm8,xmm8,XMMWORD[16+r11],0x10
+ vxorps xmm8,xmm8,xmm14
+
+ vpclmulqdq xmm5,xmm11,xmm0,0x00
+ vmovdqu xmm3,XMMWORD[((96-32))+rsi]
+ vpxor xmm5,xmm5,xmm4
+ vpunpckhqdq xmm2,xmm10,xmm10
+ vpclmulqdq xmm11,xmm11,xmm0,0x11
+ vpxor xmm2,xmm2,xmm10
+ vpalignr xmm14,xmm8,xmm8,8
+ vpxor xmm11,xmm11,xmm12
+ vpclmulqdq xmm1,xmm1,xmm15,0x10
+ vmovdqu xmm15,XMMWORD[((128-32))+rsi]
+ vpxor xmm1,xmm1,xmm9
+
+ vxorps xmm14,xmm14,xmm7
+ vpclmulqdq xmm8,xmm8,XMMWORD[16+r11],0x10
+ vxorps xmm8,xmm8,xmm14
+
+ vpclmulqdq xmm4,xmm10,xmm3,0x00
+ vmovdqu xmm0,XMMWORD[((112-32))+rsi]
+ vpxor xmm4,xmm4,xmm5
+ vpunpckhqdq xmm9,xmm8,xmm8
+ vpclmulqdq xmm10,xmm10,xmm3,0x11
+ vpxor xmm9,xmm9,xmm8
+ vpxor xmm10,xmm10,xmm11
+ vpclmulqdq xmm2,xmm2,xmm15,0x00
+ vpxor xmm2,xmm2,xmm1
+
+ vpclmulqdq xmm5,xmm8,xmm0,0x00
+ vpclmulqdq xmm7,xmm8,xmm0,0x11
+ vpxor xmm5,xmm5,xmm4
+ vpclmulqdq xmm6,xmm9,xmm15,0x10
+ vpxor xmm7,xmm7,xmm10
+ vpxor xmm6,xmm6,xmm2
+
+ vpxor xmm4,xmm7,xmm5
+ vpxor xmm6,xmm6,xmm4
+ vpslldq xmm1,xmm6,8
+ vmovdqu xmm3,XMMWORD[16+r11]
+ vpsrldq xmm6,xmm6,8
+ vpxor xmm8,xmm5,xmm1
+ vpxor xmm7,xmm7,xmm6
+
+ vpalignr xmm2,xmm8,xmm8,8
+ vpclmulqdq xmm8,xmm8,xmm3,0x10
+ vpxor xmm8,xmm8,xmm2
+
+ vpalignr xmm2,xmm8,xmm8,8
+ vpclmulqdq xmm8,xmm8,xmm3,0x10
+ vpxor xmm2,xmm2,xmm7
+ vpxor xmm8,xmm8,xmm2
+ mov r12,QWORD[64+rbp]
+ vpshufb xmm8,xmm8,XMMWORD[r11]
+ vmovdqu XMMWORD[r12],xmm8
+
+ vzeroupper
+ movaps xmm6,XMMWORD[((-208))+rbp]
+ movaps xmm7,XMMWORD[((-192))+rbp]
+ movaps xmm8,XMMWORD[((-176))+rbp]
+ movaps xmm9,XMMWORD[((-160))+rbp]
+ movaps xmm10,XMMWORD[((-144))+rbp]
+ movaps xmm11,XMMWORD[((-128))+rbp]
+ movaps xmm12,XMMWORD[((-112))+rbp]
+ movaps xmm13,XMMWORD[((-96))+rbp]
+ movaps xmm14,XMMWORD[((-80))+rbp]
+ movaps xmm15,XMMWORD[((-64))+rbp]
+ mov rdi,QWORD[16+rbp]
+ mov rsi,QWORD[24+rbp]
+ lea rsp,[((-40))+rbp]
+
+ pop r15
+
+ pop r14
+
+ pop r13
+
+ pop r12
+
+ pop rbx
+
+ pop rbp
+
+$L$gcm_enc_abort:
+ ret
+$L$SEH_end_aesni_gcm_encrypt_22:
+
+
+section .rdata rdata align=8
+ALIGN 64
+$L$bswap_mask:
+ DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+$L$poly:
+ DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+$L$one_msb:
+ DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+$L$two_lsb:
+ DB 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+$L$one_lsb:
+ DB 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ DB 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108
+ DB 101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82
+ DB 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
+ DB 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+ALIGN 64
+section .text
+
+section .pdata rdata align=4
+ALIGN 4
+ DD $L$SEH_begin_aesni_gcm_decrypt_1 wrt ..imagebase
+ DD $L$SEH_end_aesni_gcm_decrypt_22 wrt ..imagebase
+ DD $L$SEH_info_aesni_gcm_decrypt_0 wrt ..imagebase
+
+ DD $L$SEH_begin_aesni_gcm_encrypt_1 wrt ..imagebase
+ DD $L$SEH_end_aesni_gcm_encrypt_22 wrt ..imagebase
+ DD $L$SEH_info_aesni_gcm_encrypt_0 wrt ..imagebase
+
+
+section .xdata rdata align=8
+ALIGN 4
+$L$SEH_info_aesni_gcm_decrypt_0:
+ DB 1
+ DB $L$SEH_prolog_aesni_gcm_decrypt_21-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 33
+ DB 213
+ DB $L$SEH_prolog_aesni_gcm_decrypt_21-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 248
+ DW 9
+ DB $L$SEH_prolog_aesni_gcm_decrypt_20-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 232
+ DW 8
+ DB $L$SEH_prolog_aesni_gcm_decrypt_19-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 216
+ DW 7
+ DB $L$SEH_prolog_aesni_gcm_decrypt_18-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 200
+ DW 6
+ DB $L$SEH_prolog_aesni_gcm_decrypt_17-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 184
+ DW 5
+ DB $L$SEH_prolog_aesni_gcm_decrypt_16-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 168
+ DW 4
+ DB $L$SEH_prolog_aesni_gcm_decrypt_15-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 152
+ DW 3
+ DB $L$SEH_prolog_aesni_gcm_decrypt_14-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 136
+ DW 2
+ DB $L$SEH_prolog_aesni_gcm_decrypt_13-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 120
+ DW 1
+ DB $L$SEH_prolog_aesni_gcm_decrypt_12-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 104
+ DW 0
+ DB $L$SEH_prolog_aesni_gcm_decrypt_11-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 100
+ DW 29
+ DB $L$SEH_prolog_aesni_gcm_decrypt_10-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 116
+ DW 28
+ DB $L$SEH_prolog_aesni_gcm_decrypt_9-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 3
+ DB $L$SEH_prolog_aesni_gcm_decrypt_8-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 1
+ DW 21
+ DB $L$SEH_prolog_aesni_gcm_decrypt_7-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 240
+ DB $L$SEH_prolog_aesni_gcm_decrypt_6-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 224
+ DB $L$SEH_prolog_aesni_gcm_decrypt_5-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 208
+ DB $L$SEH_prolog_aesni_gcm_decrypt_4-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 192
+ DB $L$SEH_prolog_aesni_gcm_decrypt_3-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 48
+ DB $L$SEH_prolog_aesni_gcm_decrypt_2-$L$SEH_begin_aesni_gcm_decrypt_1
+ DB 80
+
+$L$SEH_info_aesni_gcm_encrypt_0:
+ DB 1
+ DB $L$SEH_prolog_aesni_gcm_encrypt_21-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 33
+ DB 213
+ DB $L$SEH_prolog_aesni_gcm_encrypt_21-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 248
+ DW 9
+ DB $L$SEH_prolog_aesni_gcm_encrypt_20-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 232
+ DW 8
+ DB $L$SEH_prolog_aesni_gcm_encrypt_19-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 216
+ DW 7
+ DB $L$SEH_prolog_aesni_gcm_encrypt_18-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 200
+ DW 6
+ DB $L$SEH_prolog_aesni_gcm_encrypt_17-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 184
+ DW 5
+ DB $L$SEH_prolog_aesni_gcm_encrypt_16-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 168
+ DW 4
+ DB $L$SEH_prolog_aesni_gcm_encrypt_15-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 152
+ DW 3
+ DB $L$SEH_prolog_aesni_gcm_encrypt_14-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 136
+ DW 2
+ DB $L$SEH_prolog_aesni_gcm_encrypt_13-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 120
+ DW 1
+ DB $L$SEH_prolog_aesni_gcm_encrypt_12-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 104
+ DW 0
+ DB $L$SEH_prolog_aesni_gcm_encrypt_11-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 100
+ DW 29
+ DB $L$SEH_prolog_aesni_gcm_encrypt_10-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 116
+ DW 28
+ DB $L$SEH_prolog_aesni_gcm_encrypt_9-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 3
+ DB $L$SEH_prolog_aesni_gcm_encrypt_8-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 1
+ DW 21
+ DB $L$SEH_prolog_aesni_gcm_encrypt_7-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 240
+ DB $L$SEH_prolog_aesni_gcm_encrypt_6-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 224
+ DB $L$SEH_prolog_aesni_gcm_encrypt_5-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 208
+ DB $L$SEH_prolog_aesni_gcm_encrypt_4-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 192
+ DB $L$SEH_prolog_aesni_gcm_encrypt_3-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 48
+ DB $L$SEH_prolog_aesni_gcm_encrypt_2-$L$SEH_begin_aesni_gcm_encrypt_1
+ DB 80
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/aesni-x86-apple.S b/gen/bcm/aesni-x86-apple.S
new file mode 100644
index 0000000..4467604
--- /dev/null
+++ b/gen/bcm/aesni-x86-apple.S
@@ -0,0 +1,2475 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+#ifdef BORINGSSL_DISPATCH_TEST
+#endif
+.globl _aes_hw_encrypt
+.private_extern _aes_hw_encrypt
+.align 4
+_aes_hw_encrypt:
+L_aes_hw_encrypt_begin:
+#ifdef BORINGSSL_DISPATCH_TEST
+ pushl %ebx
+ pushl %edx
+ call L000pic
+L000pic:
+ popl %ebx
+ leal _BORINGSSL_function_hit+1-L000pic(%ebx),%ebx
+ movl $1,%edx
+ movb %dl,(%ebx)
+ popl %edx
+ popl %ebx
+#endif
+ movl 4(%esp),%eax
+ movl 12(%esp),%edx
+ movups (%eax),%xmm2
+ movl 240(%edx),%ecx
+ movl 8(%esp),%eax
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+L001enc1_loop_1:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz L001enc1_loop_1
+.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
+ ret
+.globl _aes_hw_decrypt
+.private_extern _aes_hw_decrypt
+.align 4
+_aes_hw_decrypt:
+L_aes_hw_decrypt_begin:
+ movl 4(%esp),%eax
+ movl 12(%esp),%edx
+ movups (%eax),%xmm2
+ movl 240(%edx),%ecx
+ movl 8(%esp),%eax
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+L002dec1_loop_2:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz L002dec1_loop_2
+.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
+ ret
+.private_extern __aesni_encrypt2
+.align 4
+__aesni_encrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+L003enc2_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L003enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ ret
+.private_extern __aesni_decrypt2
+.align 4
+__aesni_decrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+L004dec2_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L004dec2_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+ ret
+.private_extern __aesni_encrypt3
+.align 4
+__aesni_encrypt3:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+L005enc3_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L005enc3_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+ ret
+.private_extern __aesni_decrypt3
+.align 4
+__aesni_decrypt3:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+L006dec3_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L006dec3_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+ ret
+.private_extern __aesni_encrypt4
+.align 4
+__aesni_encrypt4:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ shll $4,%ecx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+L007enc4_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L007enc4_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+ ret
+.private_extern __aesni_decrypt4
+.align 4
+__aesni_decrypt4:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ shll $4,%ecx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+L008dec4_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L008dec4_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+ ret
+.private_extern __aesni_encrypt6
+.align 4
+__aesni_encrypt6:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm7
+ movups (%edx,%ecx,1),%xmm0
+ addl $16,%ecx
+ jmp L009_aesni_encrypt6_inner
+.align 4,0x90
+L010enc6_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+L009_aesni_encrypt6_inner:
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+L_aesni_encrypt6_enter:
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L010enc6_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+ ret
+.private_extern __aesni_decrypt6
+.align 4
+__aesni_decrypt6:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+.byte 102,15,56,222,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm7
+ movups (%edx,%ecx,1),%xmm0
+ addl $16,%ecx
+ jmp L011_aesni_decrypt6_inner
+.align 4,0x90
+L012dec6_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+L011_aesni_decrypt6_inner:
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+L_aesni_decrypt6_enter:
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L012dec6_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+ ret
+.globl _aes_hw_ecb_encrypt
+.private_extern _aes_hw_ecb_encrypt
+.align 4
+_aes_hw_ecb_encrypt:
+L_aes_hw_ecb_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ andl $-16,%eax
+ jz L013ecb_ret
+ movl 240(%edx),%ecx
+ testl %ebx,%ebx
+ jz L014ecb_decrypt
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ cmpl $96,%eax
+ jb L015ecb_enc_tail
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+ subl $96,%eax
+ jmp L016ecb_enc_loop6_enter
+.align 4,0x90
+L017ecb_enc_loop6:
+ movups %xmm2,(%edi)
+ movdqu (%esi),%xmm2
+ movups %xmm3,16(%edi)
+ movdqu 16(%esi),%xmm3
+ movups %xmm4,32(%edi)
+ movdqu 32(%esi),%xmm4
+ movups %xmm5,48(%edi)
+ movdqu 48(%esi),%xmm5
+ movups %xmm6,64(%edi)
+ movdqu 64(%esi),%xmm6
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+L016ecb_enc_loop6_enter:
+ call __aesni_encrypt6
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ subl $96,%eax
+ jnc L017ecb_enc_loop6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ addl $96,%eax
+ jz L013ecb_ret
+L015ecb_enc_tail:
+ movups (%esi),%xmm2
+ cmpl $32,%eax
+ jb L018ecb_enc_one
+ movups 16(%esi),%xmm3
+ je L019ecb_enc_two
+ movups 32(%esi),%xmm4
+ cmpl $64,%eax
+ jb L020ecb_enc_three
+ movups 48(%esi),%xmm5
+ je L021ecb_enc_four
+ movups 64(%esi),%xmm6
+ xorps %xmm7,%xmm7
+ call __aesni_encrypt6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp L013ecb_ret
+.align 4,0x90
+L018ecb_enc_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+L022enc1_loop_3:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz L022enc1_loop_3
+.byte 102,15,56,221,209
+ movups %xmm2,(%edi)
+ jmp L013ecb_ret
+.align 4,0x90
+L019ecb_enc_two:
+ call __aesni_encrypt2
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp L013ecb_ret
+.align 4,0x90
+L020ecb_enc_three:
+ call __aesni_encrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp L013ecb_ret
+.align 4,0x90
+L021ecb_enc_four:
+ call __aesni_encrypt4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ jmp L013ecb_ret
+.align 4,0x90
+L014ecb_decrypt:
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ cmpl $96,%eax
+ jb L023ecb_dec_tail
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+ subl $96,%eax
+ jmp L024ecb_dec_loop6_enter
+.align 4,0x90
+L025ecb_dec_loop6:
+ movups %xmm2,(%edi)
+ movdqu (%esi),%xmm2
+ movups %xmm3,16(%edi)
+ movdqu 16(%esi),%xmm3
+ movups %xmm4,32(%edi)
+ movdqu 32(%esi),%xmm4
+ movups %xmm5,48(%edi)
+ movdqu 48(%esi),%xmm5
+ movups %xmm6,64(%edi)
+ movdqu 64(%esi),%xmm6
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+L024ecb_dec_loop6_enter:
+ call __aesni_decrypt6
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ subl $96,%eax
+ jnc L025ecb_dec_loop6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ addl $96,%eax
+ jz L013ecb_ret
+L023ecb_dec_tail:
+ movups (%esi),%xmm2
+ cmpl $32,%eax
+ jb L026ecb_dec_one
+ movups 16(%esi),%xmm3
+ je L027ecb_dec_two
+ movups 32(%esi),%xmm4
+ cmpl $64,%eax
+ jb L028ecb_dec_three
+ movups 48(%esi),%xmm5
+ je L029ecb_dec_four
+ movups 64(%esi),%xmm6
+ xorps %xmm7,%xmm7
+ call __aesni_decrypt6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp L013ecb_ret
+.align 4,0x90
+L026ecb_dec_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+L030dec1_loop_4:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz L030dec1_loop_4
+.byte 102,15,56,223,209
+ movups %xmm2,(%edi)
+ jmp L013ecb_ret
+.align 4,0x90
+L027ecb_dec_two:
+ call __aesni_decrypt2
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp L013ecb_ret
+.align 4,0x90
+L028ecb_dec_three:
+ call __aesni_decrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp L013ecb_ret
+.align 4,0x90
+L029ecb_dec_four:
+ call __aesni_decrypt4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+L013ecb_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _aes_hw_ccm64_encrypt_blocks
+.private_extern _aes_hw_ccm64_encrypt_blocks
+.align 4
+_aes_hw_ccm64_encrypt_blocks:
+L_aes_hw_ccm64_encrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl 40(%esp),%ecx
+ movl %esp,%ebp
+ subl $60,%esp
+ andl $-16,%esp
+ movl %ebp,48(%esp)
+ movdqu (%ebx),%xmm7
+ movdqu (%ecx),%xmm3
+ movl 240(%edx),%ecx
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
+ movl $1,%ebx
+ xorl %ebp,%ebp
+ movl %ebx,16(%esp)
+ movl %ebp,20(%esp)
+ movl %ebp,24(%esp)
+ movl %ebp,28(%esp)
+ shll $4,%ecx
+ movl $16,%ebx
+ leal (%edx),%ebp
+ movdqa (%esp),%xmm5
+ movdqa %xmm7,%xmm2
+ leal 32(%edx,%ecx,1),%edx
+ subl %ecx,%ebx
+.byte 102,15,56,0,253
+L031ccm64_enc_outer:
+ movups (%ebp),%xmm0
+ movl %ebx,%ecx
+ movups (%esi),%xmm6
+ xorps %xmm0,%xmm2
+ movups 16(%ebp),%xmm1
+ xorps %xmm6,%xmm0
+ xorps %xmm0,%xmm3
+ movups 32(%ebp),%xmm0
+L032ccm64_enc2_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L032ccm64_enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ paddq 16(%esp),%xmm7
+ decl %eax
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ leal 16(%esi),%esi
+ xorps %xmm2,%xmm6
+ movdqa %xmm7,%xmm2
+ movups %xmm6,(%edi)
+.byte 102,15,56,0,213
+ leal 16(%edi),%edi
+ jnz L031ccm64_enc_outer
+ movl 48(%esp),%esp
+ movl 40(%esp),%edi
+ movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _aes_hw_ccm64_decrypt_blocks
+.private_extern _aes_hw_ccm64_decrypt_blocks
+.align 4
+_aes_hw_ccm64_decrypt_blocks:
+L_aes_hw_ccm64_decrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl 40(%esp),%ecx
+ movl %esp,%ebp
+ subl $60,%esp
+ andl $-16,%esp
+ movl %ebp,48(%esp)
+ movdqu (%ebx),%xmm7
+ movdqu (%ecx),%xmm3
+ movl 240(%edx),%ecx
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
+ movl $1,%ebx
+ xorl %ebp,%ebp
+ movl %ebx,16(%esp)
+ movl %ebp,20(%esp)
+ movl %ebp,24(%esp)
+ movl %ebp,28(%esp)
+ movdqa (%esp),%xmm5
+ movdqa %xmm7,%xmm2
+ movl %edx,%ebp
+ movl %ecx,%ebx
+.byte 102,15,56,0,253
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+L033enc1_loop_5:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz L033enc1_loop_5
+.byte 102,15,56,221,209
+ shll $4,%ebx
+ movl $16,%ecx
+ movups (%esi),%xmm6
+ paddq 16(%esp),%xmm7
+ leal 16(%esi),%esi
+ subl %ebx,%ecx
+ leal 32(%ebp,%ebx,1),%edx
+ movl %ecx,%ebx
+ jmp L034ccm64_dec_outer
+.align 4,0x90
+L034ccm64_dec_outer:
+ xorps %xmm2,%xmm6
+ movdqa %xmm7,%xmm2
+ movups %xmm6,(%edi)
+ leal 16(%edi),%edi
+.byte 102,15,56,0,213
+ subl $1,%eax
+ jz L035ccm64_dec_break
+ movups (%ebp),%xmm0
+ movl %ebx,%ecx
+ movups 16(%ebp),%xmm1
+ xorps %xmm0,%xmm6
+ xorps %xmm0,%xmm2
+ xorps %xmm6,%xmm3
+ movups 32(%ebp),%xmm0
+L036ccm64_dec2_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz L036ccm64_dec2_loop
+ movups (%esi),%xmm6
+ paddq 16(%esp),%xmm7
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ leal 16(%esi),%esi
+ jmp L034ccm64_dec_outer
+.align 4,0x90
+L035ccm64_dec_break:
+ movl 240(%ebp),%ecx
+ movl %ebp,%edx
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm6
+ leal 32(%edx),%edx
+ xorps %xmm6,%xmm3
+L037enc1_loop_6:
+.byte 102,15,56,220,217
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz L037enc1_loop_6
+.byte 102,15,56,221,217
+ movl 48(%esp),%esp
+ movl 40(%esp),%edi
+ movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _aes_hw_ctr32_encrypt_blocks
+.private_extern _aes_hw_ctr32_encrypt_blocks
+.align 4
+_aes_hw_ctr32_encrypt_blocks:
+L_aes_hw_ctr32_encrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+#ifdef BORINGSSL_DISPATCH_TEST
+ pushl %ebx
+ pushl %edx
+ call L038pic
+L038pic:
+ popl %ebx
+ leal _BORINGSSL_function_hit+0-L038pic(%ebx),%ebx
+ movl $1,%edx
+ movb %dl,(%ebx)
+ popl %edx
+ popl %ebx
+#endif
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl %esp,%ebp
+ subl $88,%esp
+ andl $-16,%esp
+ movl %ebp,80(%esp)
+ cmpl $1,%eax
+ je L039ctr32_one_shortcut
+ movdqu (%ebx),%xmm7
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
+ movl $6,%ecx
+ xorl %ebp,%ebp
+ movl %ecx,16(%esp)
+ movl %ecx,20(%esp)
+ movl %ecx,24(%esp)
+ movl %ebp,28(%esp)
+.byte 102,15,58,22,251,3
+.byte 102,15,58,34,253,3
+ movl 240(%edx),%ecx
+ bswap %ebx
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movdqa (%esp),%xmm2
+.byte 102,15,58,34,195,0
+ leal 3(%ebx),%ebp
+.byte 102,15,58,34,205,0
+ incl %ebx
+.byte 102,15,58,34,195,1
+ incl %ebp
+.byte 102,15,58,34,205,1
+ incl %ebx
+.byte 102,15,58,34,195,2
+ incl %ebp
+.byte 102,15,58,34,205,2
+ movdqa %xmm0,48(%esp)
+.byte 102,15,56,0,194
+ movdqu (%edx),%xmm6
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
+ pshufd $192,%xmm0,%xmm2
+ pshufd $128,%xmm0,%xmm3
+ cmpl $6,%eax
+ jb L040ctr32_tail
+ pxor %xmm6,%xmm7
+ shll $4,%ecx
+ movl $16,%ebx
+ movdqa %xmm7,32(%esp)
+ movl %edx,%ebp
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ subl $6,%eax
+ jmp L041ctr32_loop6
+.align 4,0x90
+L041ctr32_loop6:
+ pshufd $64,%xmm0,%xmm4
+ movdqa 32(%esp),%xmm0
+ pshufd $192,%xmm1,%xmm5
+ pxor %xmm0,%xmm2
+ pshufd $128,%xmm1,%xmm6
+ pxor %xmm0,%xmm3
+ pshufd $64,%xmm1,%xmm7
+ movups 16(%ebp),%xmm1
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm6
+ pxor %xmm0,%xmm7
+.byte 102,15,56,220,217
+ movups 32(%ebp),%xmm0
+ movl %ebx,%ecx
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ call L_aesni_encrypt6_enter
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps %xmm1,%xmm2
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm3
+ movups %xmm2,(%edi)
+ movdqa 16(%esp),%xmm0
+ xorps %xmm1,%xmm4
+ movdqa 64(%esp),%xmm1
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ paddd %xmm0,%xmm1
+ paddd 48(%esp),%xmm0
+ movdqa (%esp),%xmm2
+ movups 48(%esi),%xmm3
+ movups 64(%esi),%xmm4
+ xorps %xmm3,%xmm5
+ movups 80(%esi),%xmm3
+ leal 96(%esi),%esi
+ movdqa %xmm0,48(%esp)
+.byte 102,15,56,0,194
+ xorps %xmm4,%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm3,%xmm7
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
+ movups %xmm6,64(%edi)
+ pshufd $192,%xmm0,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ pshufd $128,%xmm0,%xmm3
+ subl $6,%eax
+ jnc L041ctr32_loop6
+ addl $6,%eax
+ jz L042ctr32_ret
+ movdqu (%ebp),%xmm7
+ movl %ebp,%edx
+ pxor 32(%esp),%xmm7
+ movl 240(%ebp),%ecx
+L040ctr32_tail:
+ por %xmm7,%xmm2
+ cmpl $2,%eax
+ jb L043ctr32_one
+ pshufd $64,%xmm0,%xmm4
+ por %xmm7,%xmm3
+ je L044ctr32_two
+ pshufd $192,%xmm1,%xmm5
+ por %xmm7,%xmm4
+ cmpl $4,%eax
+ jb L045ctr32_three
+ pshufd $128,%xmm1,%xmm6
+ por %xmm7,%xmm5
+ je L046ctr32_four
+ por %xmm7,%xmm6
+ call __aesni_encrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps %xmm1,%xmm2
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm3
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm4
+ movups 64(%esi),%xmm1
+ xorps %xmm0,%xmm5
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp L042ctr32_ret
+.align 4,0x90
+L039ctr32_one_shortcut:
+ movups (%ebx),%xmm2
+ movl 240(%edx),%ecx
+L043ctr32_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+L047enc1_loop_7:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz L047enc1_loop_7
+.byte 102,15,56,221,209
+ movups (%esi),%xmm6
+ xorps %xmm2,%xmm6
+ movups %xmm6,(%edi)
+ jmp L042ctr32_ret
+.align 4,0x90
+L044ctr32_two:
+ call __aesni_encrypt2
+ movups (%esi),%xmm5
+ movups 16(%esi),%xmm6
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp L042ctr32_ret
+.align 4,0x90
+L045ctr32_three:
+ call __aesni_encrypt3
+ movups (%esi),%xmm5
+ movups 16(%esi),%xmm6
+ xorps %xmm5,%xmm2
+ movups 32(%esi),%xmm7
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm7,%xmm4
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp L042ctr32_ret
+.align 4,0x90
+L046ctr32_four:
+ call __aesni_encrypt4
+ movups (%esi),%xmm6
+ movups 16(%esi),%xmm7
+ movups 32(%esi),%xmm1
+ xorps %xmm6,%xmm2
+ movups 48(%esi),%xmm0
+ xorps %xmm7,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm4
+ movups %xmm3,16(%edi)
+ xorps %xmm0,%xmm5
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+L042ctr32_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movl 80(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _aes_hw_xts_encrypt
+.private_extern _aes_hw_xts_encrypt
+.align 4
+_aes_hw_xts_encrypt:
+L_aes_hw_xts_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 36(%esp),%edx
+ movl 40(%esp),%esi
+ movl 240(%edx),%ecx
+ movups (%esi),%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+L048enc1_loop_8:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz L048enc1_loop_8
+.byte 102,15,56,221,209
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl %esp,%ebp
+ subl $120,%esp
+ movl 240(%edx),%ecx
+ andl $-16,%esp
+ movl $135,96(%esp)
+ movl $0,100(%esp)
+ movl $1,104(%esp)
+ movl $0,108(%esp)
+ movl %eax,112(%esp)
+ movl %ebp,116(%esp)
+ movdqa %xmm2,%xmm1
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ pcmpgtd %xmm1,%xmm0
+ andl $-16,%eax
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ subl $96,%eax
+ jc L049xts_enc_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp L050xts_enc_loop6
+.align 4,0x90
+L050xts_enc_loop6:
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,16(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,32(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,64(%esp)
+ paddq %xmm1,%xmm1
+ movups (%ebp),%xmm0
+ pand %xmm3,%xmm7
+ movups (%esi),%xmm2
+ pxor %xmm1,%xmm7
+ movl %ebx,%ecx
+ movdqu 16(%esi),%xmm3
+ xorps %xmm0,%xmm2
+ movdqu 32(%esi),%xmm4
+ pxor %xmm0,%xmm3
+ movdqu 48(%esi),%xmm5
+ pxor %xmm0,%xmm4
+ movdqu 64(%esi),%xmm6
+ pxor %xmm0,%xmm5
+ movdqu 80(%esi),%xmm1
+ pxor %xmm0,%xmm6
+ leal 96(%esi),%esi
+ pxor (%esp),%xmm2
+ movdqa %xmm7,80(%esp)
+ pxor %xmm1,%xmm7
+ movups 16(%ebp),%xmm1
+ pxor 16(%esp),%xmm3
+ pxor 32(%esp),%xmm4
+.byte 102,15,56,220,209
+ pxor 48(%esp),%xmm5
+ pxor 64(%esp),%xmm6
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ call L_aesni_encrypt6_enter
+ movdqa 80(%esp),%xmm1
+ pxor %xmm0,%xmm0
+ xorps (%esp),%xmm2
+ pcmpgtd %xmm1,%xmm0
+ xorps 16(%esp),%xmm3
+ movups %xmm2,(%edi)
+ xorps 32(%esp),%xmm4
+ movups %xmm3,16(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm4,32(%edi)
+ xorps 64(%esp),%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm1,%xmm7
+ movups %xmm6,64(%edi)
+ pshufd $19,%xmm0,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqa 96(%esp),%xmm3
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ subl $96,%eax
+ jnc L050xts_enc_loop6
+ movl 240(%ebp),%ecx
+ movl %ebp,%edx
+ movl %ecx,%ebx
+L049xts_enc_short:
+ addl $96,%eax
+ jz L051xts_enc_done6x
+ movdqa %xmm1,%xmm5
+ cmpl $32,%eax
+ jb L052xts_enc_one
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ je L053xts_enc_two
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ cmpl $64,%eax
+ jb L054xts_enc_three
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm7
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,(%esp)
+ movdqa %xmm6,16(%esp)
+ je L055xts_enc_four
+ movdqa %xmm7,32(%esp)
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm7
+ pxor %xmm1,%xmm7
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ pxor (%esp),%xmm2
+ movdqu 48(%esi),%xmm5
+ pxor 16(%esp),%xmm3
+ movdqu 64(%esi),%xmm6
+ pxor 32(%esp),%xmm4
+ leal 80(%esi),%esi
+ pxor 48(%esp),%xmm5
+ movdqa %xmm7,64(%esp)
+ pxor %xmm7,%xmm6
+ call __aesni_encrypt6
+ movaps 64(%esp),%xmm1
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps 32(%esp),%xmm4
+ movups %xmm2,(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm3,16(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ jmp L056xts_enc_done
+.align 4,0x90
+L052xts_enc_one:
+ movups (%esi),%xmm2
+ leal 16(%esi),%esi
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+L057enc1_loop_9:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz L057enc1_loop_9
+.byte 102,15,56,221,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ movdqa %xmm5,%xmm1
+ jmp L056xts_enc_done
+.align 4,0x90
+L053xts_enc_two:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ leal 32(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ call __aesni_encrypt2
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 32(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp L056xts_enc_done
+.align 4,0x90
+L054xts_enc_three:
+ movaps %xmm1,%xmm7
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ leal 48(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ call __aesni_encrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ leal 48(%edi),%edi
+ movdqa %xmm7,%xmm1
+ jmp L056xts_enc_done
+.align 4,0x90
+L055xts_enc_four:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ xorps (%esp),%xmm2
+ movups 48(%esi),%xmm5
+ leal 64(%esi),%esi
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ xorps %xmm6,%xmm5
+ call __aesni_encrypt4
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ xorps %xmm6,%xmm5
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ leal 64(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp L056xts_enc_done
+.align 4,0x90
+L051xts_enc_done6x:
+ movl 112(%esp),%eax
+ andl $15,%eax
+ jz L058xts_enc_ret
+ movdqa %xmm1,%xmm5
+ movl %eax,112(%esp)
+ jmp L059xts_enc_steal
+.align 4,0x90
+L056xts_enc_done:
+ movl 112(%esp),%eax
+ pxor %xmm0,%xmm0
+ andl $15,%eax
+ jz L058xts_enc_ret
+ pcmpgtd %xmm1,%xmm0
+ movl %eax,112(%esp)
+ pshufd $19,%xmm0,%xmm5
+ paddq %xmm1,%xmm1
+ pand 96(%esp),%xmm5
+ pxor %xmm1,%xmm5
+L059xts_enc_steal:
+ movzbl (%esi),%ecx
+ movzbl -16(%edi),%edx
+ leal 1(%esi),%esi
+ movb %cl,-16(%edi)
+ movb %dl,(%edi)
+ leal 1(%edi),%edi
+ subl $1,%eax
+ jnz L059xts_enc_steal
+ subl 112(%esp),%edi
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups -16(%edi),%xmm2
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+L060enc1_loop_10:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz L060enc1_loop_10
+.byte 102,15,56,221,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,-16(%edi)
+L058xts_enc_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
+ movl 116(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _aes_hw_xts_decrypt
+.private_extern _aes_hw_xts_decrypt
+.align 4
+_aes_hw_xts_decrypt:
+L_aes_hw_xts_decrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 36(%esp),%edx
+ movl 40(%esp),%esi
+ movl 240(%edx),%ecx
+ movups (%esi),%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+L061enc1_loop_11:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz L061enc1_loop_11
+.byte 102,15,56,221,209
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl %esp,%ebp
+ subl $120,%esp
+ andl $-16,%esp
+ xorl %ebx,%ebx
+ testl $15,%eax
+ setnz %bl
+ shll $4,%ebx
+ subl %ebx,%eax
+ movl $135,96(%esp)
+ movl $0,100(%esp)
+ movl $1,104(%esp)
+ movl $0,108(%esp)
+ movl %eax,112(%esp)
+ movl %ebp,116(%esp)
+ movl 240(%edx),%ecx
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ movdqa %xmm2,%xmm1
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ pcmpgtd %xmm1,%xmm0
+ andl $-16,%eax
+ subl $96,%eax
+ jc L062xts_dec_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp L063xts_dec_loop6
+.align 4,0x90
+L063xts_dec_loop6:
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,16(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,32(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,64(%esp)
+ paddq %xmm1,%xmm1
+ movups (%ebp),%xmm0
+ pand %xmm3,%xmm7
+ movups (%esi),%xmm2
+ pxor %xmm1,%xmm7
+ movl %ebx,%ecx
+ movdqu 16(%esi),%xmm3
+ xorps %xmm0,%xmm2
+ movdqu 32(%esi),%xmm4
+ pxor %xmm0,%xmm3
+ movdqu 48(%esi),%xmm5
+ pxor %xmm0,%xmm4
+ movdqu 64(%esi),%xmm6
+ pxor %xmm0,%xmm5
+ movdqu 80(%esi),%xmm1
+ pxor %xmm0,%xmm6
+ leal 96(%esi),%esi
+ pxor (%esp),%xmm2
+ movdqa %xmm7,80(%esp)
+ pxor %xmm1,%xmm7
+ movups 16(%ebp),%xmm1
+ pxor 16(%esp),%xmm3
+ pxor 32(%esp),%xmm4
+.byte 102,15,56,222,209
+ pxor 48(%esp),%xmm5
+ pxor 64(%esp),%xmm6
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ call L_aesni_decrypt6_enter
+ movdqa 80(%esp),%xmm1
+ pxor %xmm0,%xmm0
+ xorps (%esp),%xmm2
+ pcmpgtd %xmm1,%xmm0
+ xorps 16(%esp),%xmm3
+ movups %xmm2,(%edi)
+ xorps 32(%esp),%xmm4
+ movups %xmm3,16(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm4,32(%edi)
+ xorps 64(%esp),%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm1,%xmm7
+ movups %xmm6,64(%edi)
+ pshufd $19,%xmm0,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqa 96(%esp),%xmm3
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ subl $96,%eax
+ jnc L063xts_dec_loop6
+ movl 240(%ebp),%ecx
+ movl %ebp,%edx
+ movl %ecx,%ebx
+L062xts_dec_short:
+ addl $96,%eax
+ jz L064xts_dec_done6x
+ movdqa %xmm1,%xmm5
+ cmpl $32,%eax
+ jb L065xts_dec_one
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ je L066xts_dec_two
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ cmpl $64,%eax
+ jb L067xts_dec_three
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm7
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,(%esp)
+ movdqa %xmm6,16(%esp)
+ je L068xts_dec_four
+ movdqa %xmm7,32(%esp)
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm7
+ pxor %xmm1,%xmm7
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ pxor (%esp),%xmm2
+ movdqu 48(%esi),%xmm5
+ pxor 16(%esp),%xmm3
+ movdqu 64(%esi),%xmm6
+ pxor 32(%esp),%xmm4
+ leal 80(%esi),%esi
+ pxor 48(%esp),%xmm5
+ movdqa %xmm7,64(%esp)
+ pxor %xmm7,%xmm6
+ call __aesni_decrypt6
+ movaps 64(%esp),%xmm1
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps 32(%esp),%xmm4
+ movups %xmm2,(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm3,16(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ jmp L069xts_dec_done
+.align 4,0x90
+L065xts_dec_one:
+ movups (%esi),%xmm2
+ leal 16(%esi),%esi
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+L070dec1_loop_12:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz L070dec1_loop_12
+.byte 102,15,56,223,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ movdqa %xmm5,%xmm1
+ jmp L069xts_dec_done
+.align 4,0x90
+L066xts_dec_two:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ leal 32(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ call __aesni_decrypt2
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 32(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp L069xts_dec_done
+.align 4,0x90
+L067xts_dec_three:
+ movaps %xmm1,%xmm7
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ leal 48(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ call __aesni_decrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ leal 48(%edi),%edi
+ movdqa %xmm7,%xmm1
+ jmp L069xts_dec_done
+.align 4,0x90
+L068xts_dec_four:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ xorps (%esp),%xmm2
+ movups 48(%esi),%xmm5
+ leal 64(%esi),%esi
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ xorps %xmm6,%xmm5
+ call __aesni_decrypt4
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ xorps %xmm6,%xmm5
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ leal 64(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp L069xts_dec_done
+.align 4,0x90
+L064xts_dec_done6x:
+ movl 112(%esp),%eax
+ andl $15,%eax
+ jz L071xts_dec_ret
+ movl %eax,112(%esp)
+ jmp L072xts_dec_only_one_more
+.align 4,0x90
+L069xts_dec_done:
+ movl 112(%esp),%eax
+ pxor %xmm0,%xmm0
+ andl $15,%eax
+ jz L071xts_dec_ret
+ pcmpgtd %xmm1,%xmm0
+ movl %eax,112(%esp)
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+L072xts_dec_only_one_more:
+ pshufd $19,%xmm0,%xmm5
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm5
+ pxor %xmm1,%xmm5
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups (%esi),%xmm2
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+L073dec1_loop_13:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz L073dec1_loop_13
+.byte 102,15,56,223,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+L074xts_dec_steal:
+ movzbl 16(%esi),%ecx
+ movzbl (%edi),%edx
+ leal 1(%esi),%esi
+ movb %cl,(%edi)
+ movb %dl,16(%edi)
+ leal 1(%edi),%edi
+ subl $1,%eax
+ jnz L074xts_dec_steal
+ subl 112(%esp),%edi
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups (%edi),%xmm2
+ xorps %xmm6,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+L075dec1_loop_14:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz L075dec1_loop_14
+.byte 102,15,56,223,209
+ xorps %xmm6,%xmm2
+ movups %xmm2,(%edi)
+L071xts_dec_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
+ movl 116(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _aes_hw_cbc_encrypt
+.private_extern _aes_hw_cbc_encrypt
+.align 4
+_aes_hw_cbc_encrypt:
+L_aes_hw_cbc_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl %esp,%ebx
+ movl 24(%esp),%edi
+ subl $24,%ebx
+ movl 28(%esp),%eax
+ andl $-16,%ebx
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebp
+ testl %eax,%eax
+ jz L076cbc_abort
+ cmpl $0,40(%esp)
+ xchgl %esp,%ebx
+ movups (%ebp),%xmm7
+ movl 240(%edx),%ecx
+ movl %edx,%ebp
+ movl %ebx,16(%esp)
+ movl %ecx,%ebx
+ je L077cbc_decrypt
+ movaps %xmm7,%xmm2
+ cmpl $16,%eax
+ jb L078cbc_enc_tail
+ subl $16,%eax
+ jmp L079cbc_enc_loop
+.align 4,0x90
+L079cbc_enc_loop:
+ movups (%esi),%xmm7
+ leal 16(%esi),%esi
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm7
+ leal 32(%edx),%edx
+ xorps %xmm7,%xmm2
+L080enc1_loop_15:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz L080enc1_loop_15
+.byte 102,15,56,221,209
+ movl %ebx,%ecx
+ movl %ebp,%edx
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ subl $16,%eax
+ jnc L079cbc_enc_loop
+ addl $16,%eax
+ jnz L078cbc_enc_tail
+ movaps %xmm2,%xmm7
+ pxor %xmm2,%xmm2
+ jmp L081cbc_ret
+L078cbc_enc_tail:
+ movl %eax,%ecx
+.long 2767451785
+ movl $16,%ecx
+ subl %eax,%ecx
+ xorl %eax,%eax
+.long 2868115081
+ leal -16(%edi),%edi
+ movl %ebx,%ecx
+ movl %edi,%esi
+ movl %ebp,%edx
+ jmp L079cbc_enc_loop
+.align 4,0x90
+L077cbc_decrypt:
+ cmpl $80,%eax
+ jbe L082cbc_dec_tail
+ movaps %xmm7,(%esp)
+ subl $80,%eax
+ jmp L083cbc_dec_loop6_enter
+.align 4,0x90
+L084cbc_dec_loop6:
+ movaps %xmm0,(%esp)
+ movups %xmm7,(%edi)
+ leal 16(%edi),%edi
+L083cbc_dec_loop6_enter:
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ call __aesni_decrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps (%esp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%esi),%xmm1
+ xorps %xmm0,%xmm6
+ movups 80(%esi),%xmm0
+ xorps %xmm1,%xmm7
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 96(%esi),%esi
+ movups %xmm4,32(%edi)
+ movl %ebx,%ecx
+ movups %xmm5,48(%edi)
+ movl %ebp,%edx
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ subl $96,%eax
+ ja L084cbc_dec_loop6
+ movaps %xmm7,%xmm2
+ movaps %xmm0,%xmm7
+ addl $80,%eax
+ jle L085cbc_dec_clear_tail_collected
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+L082cbc_dec_tail:
+ movups (%esi),%xmm2
+ movaps %xmm2,%xmm6
+ cmpl $16,%eax
+ jbe L086cbc_dec_one
+ movups 16(%esi),%xmm3
+ movaps %xmm3,%xmm5
+ cmpl $32,%eax
+ jbe L087cbc_dec_two
+ movups 32(%esi),%xmm4
+ cmpl $48,%eax
+ jbe L088cbc_dec_three
+ movups 48(%esi),%xmm5
+ cmpl $64,%eax
+ jbe L089cbc_dec_four
+ movups 64(%esi),%xmm6
+ movaps %xmm7,(%esp)
+ movups (%esi),%xmm2
+ xorps %xmm7,%xmm7
+ call __aesni_decrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps (%esp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%esi),%xmm7
+ xorps %xmm0,%xmm6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
+ movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
+ movups %xmm5,48(%edi)
+ pxor %xmm5,%xmm5
+ leal 64(%edi),%edi
+ movaps %xmm6,%xmm2
+ pxor %xmm6,%xmm6
+ subl $80,%eax
+ jmp L090cbc_dec_tail_collected
+.align 4,0x90
+L086cbc_dec_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+L091dec1_loop_16:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz L091dec1_loop_16
+.byte 102,15,56,223,209
+ xorps %xmm7,%xmm2
+ movaps %xmm6,%xmm7
+ subl $16,%eax
+ jmp L090cbc_dec_tail_collected
+.align 4,0x90
+L087cbc_dec_two:
+ call __aesni_decrypt2
+ xorps %xmm7,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movaps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ leal 16(%edi),%edi
+ movaps %xmm5,%xmm7
+ subl $32,%eax
+ jmp L090cbc_dec_tail_collected
+.align 4,0x90
+L088cbc_dec_three:
+ call __aesni_decrypt3
+ xorps %xmm7,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm5,%xmm4
+ movups %xmm2,(%edi)
+ movaps %xmm4,%xmm2
+ pxor %xmm4,%xmm4
+ movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
+ leal 32(%edi),%edi
+ movups 32(%esi),%xmm7
+ subl $48,%eax
+ jmp L090cbc_dec_tail_collected
+.align 4,0x90
+L089cbc_dec_four:
+ call __aesni_decrypt4
+ movups 16(%esi),%xmm1
+ movups 32(%esi),%xmm0
+ xorps %xmm7,%xmm2
+ movups 48(%esi),%xmm7
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm4
+ movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
+ xorps %xmm0,%xmm5
+ movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
+ leal 48(%edi),%edi
+ movaps %xmm5,%xmm2
+ pxor %xmm5,%xmm5
+ subl $64,%eax
+ jmp L090cbc_dec_tail_collected
+.align 4,0x90
+L085cbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+L090cbc_dec_tail_collected:
+ andl $15,%eax
+ jnz L092cbc_dec_tail_partial
+ movups %xmm2,(%edi)
+ pxor %xmm0,%xmm0
+ jmp L081cbc_ret
+.align 4,0x90
+L092cbc_dec_tail_partial:
+ movaps %xmm2,(%esp)
+ pxor %xmm0,%xmm0
+ movl $16,%ecx
+ movl %esp,%esi
+ subl %eax,%ecx
+.long 2767451785
+ movdqa %xmm2,(%esp)
+L081cbc_ret:
+ movl 16(%esp),%esp
+ movl 36(%esp),%ebp
+ pxor %xmm2,%xmm2
+ pxor %xmm1,%xmm1
+ movups %xmm7,(%ebp)
+ pxor %xmm7,%xmm7
+L076cbc_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.private_extern __aesni_set_encrypt_key
+.align 4
+__aesni_set_encrypt_key:
+ pushl %ebp
+ pushl %ebx
+ testl %eax,%eax
+ jz L093bad_pointer
+ testl %edx,%edx
+ jz L093bad_pointer
+ call L094pic
+L094pic:
+ popl %ebx
+ leal Lkey_const-L094pic(%ebx),%ebx
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-Lkey_const(%ebx),%ebp
+ movups (%eax),%xmm0
+ xorps %xmm4,%xmm4
+ movl 4(%ebp),%ebp
+ leal 16(%edx),%edx
+ andl $268437504,%ebp
+ cmpl $256,%ecx
+ je L09514rounds
+ cmpl $192,%ecx
+ je L09612rounds
+ cmpl $128,%ecx
+ jne L097bad_keybits
+.align 4,0x90
+L09810rounds:
+ cmpl $268435456,%ebp
+ je L09910rounds_alt
+ movl $9,%ecx
+ movups %xmm0,-16(%edx)
+.byte 102,15,58,223,200,1
+ call L100key_128_cold
+.byte 102,15,58,223,200,2
+ call L101key_128
+.byte 102,15,58,223,200,4
+ call L101key_128
+.byte 102,15,58,223,200,8
+ call L101key_128
+.byte 102,15,58,223,200,16
+ call L101key_128
+.byte 102,15,58,223,200,32
+ call L101key_128
+.byte 102,15,58,223,200,64
+ call L101key_128
+.byte 102,15,58,223,200,128
+ call L101key_128
+.byte 102,15,58,223,200,27
+ call L101key_128
+.byte 102,15,58,223,200,54
+ call L101key_128
+ movups %xmm0,(%edx)
+ movl %ecx,80(%edx)
+ jmp L102good_key
+.align 4,0x90
+L101key_128:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+L100key_128_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ ret
+.align 4,0x90
+L09910rounds_alt:
+ movdqa (%ebx),%xmm5
+ movl $8,%ecx
+ movdqa 32(%ebx),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,-16(%edx)
+L103loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leal 16(%edx),%edx
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%edx)
+ movdqa %xmm0,%xmm2
+ decl %ecx
+ jnz L103loop_key128
+ movdqa 48(%ebx),%xmm4
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%edx)
+ movl $9,%ecx
+ movl %ecx,96(%edx)
+ jmp L102good_key
+.align 4,0x90
+L09612rounds:
+ movq 16(%eax),%xmm2
+ cmpl $268435456,%ebp
+ je L10412rounds_alt
+ movl $11,%ecx
+ movups %xmm0,-16(%edx)
+.byte 102,15,58,223,202,1
+ call L105key_192a_cold
+.byte 102,15,58,223,202,2
+ call L106key_192b
+.byte 102,15,58,223,202,4
+ call L107key_192a
+.byte 102,15,58,223,202,8
+ call L106key_192b
+.byte 102,15,58,223,202,16
+ call L107key_192a
+.byte 102,15,58,223,202,32
+ call L106key_192b
+.byte 102,15,58,223,202,64
+ call L107key_192a
+.byte 102,15,58,223,202,128
+ call L106key_192b
+ movups %xmm0,(%edx)
+ movl %ecx,48(%edx)
+ jmp L102good_key
+.align 4,0x90
+L107key_192a:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+.align 4,0x90
+L105key_192a_cold:
+ movaps %xmm2,%xmm5
+L108key_192b_warm:
+ shufps $16,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ pslldq $4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd $85,%xmm1,%xmm1
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ ret
+.align 4,0x90
+L106key_192b:
+ movaps %xmm0,%xmm3
+ shufps $68,%xmm0,%xmm5
+ movups %xmm5,(%edx)
+ shufps $78,%xmm2,%xmm3
+ movups %xmm3,16(%edx)
+ leal 32(%edx),%edx
+ jmp L108key_192b_warm
+.align 4,0x90
+L10412rounds_alt:
+ movdqa 16(%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $8,%ecx
+ movdqu %xmm0,-16(%edx)
+L109loop_key192:
+ movq %xmm2,(%edx)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leal 24(%edx),%edx
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%edx)
+ decl %ecx
+ jnz L109loop_key192
+ movl $11,%ecx
+ movl %ecx,32(%edx)
+ jmp L102good_key
+.align 4,0x90
+L09514rounds:
+ movups 16(%eax),%xmm2
+ leal 16(%edx),%edx
+ cmpl $268435456,%ebp
+ je L11014rounds_alt
+ movl $13,%ecx
+ movups %xmm0,-32(%edx)
+ movups %xmm2,-16(%edx)
+.byte 102,15,58,223,202,1
+ call L111key_256a_cold
+.byte 102,15,58,223,200,1
+ call L112key_256b
+.byte 102,15,58,223,202,2
+ call L113key_256a
+.byte 102,15,58,223,200,2
+ call L112key_256b
+.byte 102,15,58,223,202,4
+ call L113key_256a
+.byte 102,15,58,223,200,4
+ call L112key_256b
+.byte 102,15,58,223,202,8
+ call L113key_256a
+.byte 102,15,58,223,200,8
+ call L112key_256b
+.byte 102,15,58,223,202,16
+ call L113key_256a
+.byte 102,15,58,223,200,16
+ call L112key_256b
+.byte 102,15,58,223,202,32
+ call L113key_256a
+.byte 102,15,58,223,200,32
+ call L112key_256b
+.byte 102,15,58,223,202,64
+ call L113key_256a
+ movups %xmm0,(%edx)
+ movl %ecx,16(%edx)
+ xorl %eax,%eax
+ jmp L102good_key
+.align 4,0x90
+L113key_256a:
+ movups %xmm2,(%edx)
+ leal 16(%edx),%edx
+L111key_256a_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ ret
+.align 4,0x90
+L112key_256b:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+ shufps $16,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $140,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $170,%xmm1,%xmm1
+ xorps %xmm1,%xmm2
+ ret
+.align 4,0x90
+L11014rounds_alt:
+ movdqa (%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $7,%ecx
+ movdqu %xmm0,-32(%edx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,-16(%edx)
+L114loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ decl %ecx
+ jz L115done_key256
+ pshufd $255,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%edx)
+ leal 32(%edx),%edx
+ movdqa %xmm2,%xmm1
+ jmp L114loop_key256
+L115done_key256:
+ movl $13,%ecx
+ movl %ecx,16(%edx)
+L102good_key:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ xorl %eax,%eax
+ popl %ebx
+ popl %ebp
+ ret
+.align 2,0x90
+L093bad_pointer:
+ movl $-1,%eax
+ popl %ebx
+ popl %ebp
+ ret
+.align 2,0x90
+L097bad_keybits:
+ pxor %xmm0,%xmm0
+ movl $-2,%eax
+ popl %ebx
+ popl %ebp
+ ret
+.globl _aes_hw_set_encrypt_key
+.private_extern _aes_hw_set_encrypt_key
+.align 4
+_aes_hw_set_encrypt_key:
+L_aes_hw_set_encrypt_key_begin:
+#ifdef BORINGSSL_DISPATCH_TEST
+ pushl %ebx
+ pushl %edx
+ call L116pic
+L116pic:
+ popl %ebx
+ leal _BORINGSSL_function_hit+3-L116pic(%ebx),%ebx
+ movl $1,%edx
+ movb %dl,(%ebx)
+ popl %edx
+ popl %ebx
+#endif
+ movl 4(%esp),%eax
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ call __aesni_set_encrypt_key
+ ret
+.globl _aes_hw_set_decrypt_key
+.private_extern _aes_hw_set_decrypt_key
+.align 4
+_aes_hw_set_decrypt_key:
+L_aes_hw_set_decrypt_key_begin:
+ movl 4(%esp),%eax
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ call __aesni_set_encrypt_key
+ movl 12(%esp),%edx
+ shll $4,%ecx
+ testl %eax,%eax
+ jnz L117dec_key_ret
+ leal 16(%edx,%ecx,1),%eax
+ movups (%edx),%xmm0
+ movups (%eax),%xmm1
+ movups %xmm0,(%eax)
+ movups %xmm1,(%edx)
+ leal 16(%edx),%edx
+ leal -16(%eax),%eax
+L118dec_key_inverse:
+ movups (%edx),%xmm0
+ movups (%eax),%xmm1
+.byte 102,15,56,219,192
+.byte 102,15,56,219,201
+ leal 16(%edx),%edx
+ leal -16(%eax),%eax
+ movups %xmm0,16(%eax)
+ movups %xmm1,-16(%edx)
+ cmpl %edx,%eax
+ ja L118dec_key_inverse
+ movups (%edx),%xmm0
+.byte 102,15,56,219,192
+ movups %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ xorl %eax,%eax
+L117dec_key_ret:
+ ret
+.align 6,0x90
+Lkey_const:
+.long 202313229,202313229,202313229,202313229
+.long 67569157,67569157,67569157,67569157
+.long 1,1,1,1
+.long 27,27,27,27
+.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
+.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
+.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
+.byte 115,108,46,111,114,103,62,0
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:
+.indirect_symbol _OPENSSL_ia32cap_P
+.long 0
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/aesni-x86-linux.S b/gen/bcm/aesni-x86-linux.S
new file mode 100644
index 0000000..54daf18
--- /dev/null
+++ b/gen/bcm/aesni-x86-linux.S
@@ -0,0 +1,2511 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+#ifdef BORINGSSL_DISPATCH_TEST
+#endif
+.globl aes_hw_encrypt
+.hidden aes_hw_encrypt
+.type aes_hw_encrypt,@function
+.align 16
+aes_hw_encrypt:
+.L_aes_hw_encrypt_begin:
+#ifdef BORINGSSL_DISPATCH_TEST
+ pushl %ebx
+ pushl %edx
+ call .L000pic
+.L000pic:
+ popl %ebx
+ leal BORINGSSL_function_hit+1-.L000pic(%ebx),%ebx
+ movl $1,%edx
+ movb %dl,(%ebx)
+ popl %edx
+ popl %ebx
+#endif
+ movl 4(%esp),%eax
+ movl 12(%esp),%edx
+ movups (%eax),%xmm2
+ movl 240(%edx),%ecx
+ movl 8(%esp),%eax
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L001enc1_loop_1:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L001enc1_loop_1
+.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
+ ret
+.size aes_hw_encrypt,.-.L_aes_hw_encrypt_begin
+.globl aes_hw_decrypt
+.hidden aes_hw_decrypt
+.type aes_hw_decrypt,@function
+.align 16
+aes_hw_decrypt:
+.L_aes_hw_decrypt_begin:
+ movl 4(%esp),%eax
+ movl 12(%esp),%edx
+ movups (%eax),%xmm2
+ movl 240(%edx),%ecx
+ movl 8(%esp),%eax
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L002dec1_loop_2:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L002dec1_loop_2
+.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
+ ret
+.size aes_hw_decrypt,.-.L_aes_hw_decrypt_begin
+.hidden _aesni_encrypt2
+.type _aesni_encrypt2,@function
+.align 16
+_aesni_encrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L003enc2_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L003enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ ret
+.size _aesni_encrypt2,.-_aesni_encrypt2
+.hidden _aesni_decrypt2
+.type _aesni_decrypt2,@function
+.align 16
+_aesni_decrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L004dec2_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L004dec2_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+ ret
+.size _aesni_decrypt2,.-_aesni_decrypt2
+.hidden _aesni_encrypt3
+.type _aesni_encrypt3,@function
+.align 16
+_aesni_encrypt3:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L005enc3_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L005enc3_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+ ret
+.size _aesni_encrypt3,.-_aesni_encrypt3
+.hidden _aesni_decrypt3
+.type _aesni_decrypt3,@function
+.align 16
+_aesni_decrypt3:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L006dec3_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L006dec3_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+ ret
+.size _aesni_decrypt3,.-_aesni_decrypt3
+.hidden _aesni_encrypt4
+.type _aesni_encrypt4,@function
+.align 16
+_aesni_encrypt4:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ shll $4,%ecx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+.L007enc4_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L007enc4_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+ ret
+.size _aesni_encrypt4,.-_aesni_encrypt4
+.hidden _aesni_decrypt4
+.type _aesni_decrypt4,@function
+.align 16
+_aesni_decrypt4:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ shll $4,%ecx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+.L008dec4_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L008dec4_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+ ret
+.size _aesni_decrypt4,.-_aesni_decrypt4
+.hidden _aesni_encrypt6
+.type _aesni_encrypt6,@function
+.align 16
+_aesni_encrypt6:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm7
+ movups (%edx,%ecx,1),%xmm0
+ addl $16,%ecx
+ jmp .L009_aesni_encrypt6_inner
+.align 16
+.L010enc6_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.L009_aesni_encrypt6_inner:
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.L_aesni_encrypt6_enter:
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L010enc6_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+ ret
+.size _aesni_encrypt6,.-_aesni_encrypt6
+.hidden _aesni_decrypt6
+.type _aesni_decrypt6,@function
+.align 16
+_aesni_decrypt6:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+.byte 102,15,56,222,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm7
+ movups (%edx,%ecx,1),%xmm0
+ addl $16,%ecx
+ jmp .L011_aesni_decrypt6_inner
+.align 16
+.L012dec6_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.L011_aesni_decrypt6_inner:
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.L_aesni_decrypt6_enter:
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L012dec6_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+ ret
+.size _aesni_decrypt6,.-_aesni_decrypt6
+.globl aes_hw_ecb_encrypt
+.hidden aes_hw_ecb_encrypt
+.type aes_hw_ecb_encrypt,@function
+.align 16
+aes_hw_ecb_encrypt:
+.L_aes_hw_ecb_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ andl $-16,%eax
+ jz .L013ecb_ret
+ movl 240(%edx),%ecx
+ testl %ebx,%ebx
+ jz .L014ecb_decrypt
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ cmpl $96,%eax
+ jb .L015ecb_enc_tail
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+ subl $96,%eax
+ jmp .L016ecb_enc_loop6_enter
+.align 16
+.L017ecb_enc_loop6:
+ movups %xmm2,(%edi)
+ movdqu (%esi),%xmm2
+ movups %xmm3,16(%edi)
+ movdqu 16(%esi),%xmm3
+ movups %xmm4,32(%edi)
+ movdqu 32(%esi),%xmm4
+ movups %xmm5,48(%edi)
+ movdqu 48(%esi),%xmm5
+ movups %xmm6,64(%edi)
+ movdqu 64(%esi),%xmm6
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+.L016ecb_enc_loop6_enter:
+ call _aesni_encrypt6
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ subl $96,%eax
+ jnc .L017ecb_enc_loop6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ addl $96,%eax
+ jz .L013ecb_ret
+.L015ecb_enc_tail:
+ movups (%esi),%xmm2
+ cmpl $32,%eax
+ jb .L018ecb_enc_one
+ movups 16(%esi),%xmm3
+ je .L019ecb_enc_two
+ movups 32(%esi),%xmm4
+ cmpl $64,%eax
+ jb .L020ecb_enc_three
+ movups 48(%esi),%xmm5
+ je .L021ecb_enc_four
+ movups 64(%esi),%xmm6
+ xorps %xmm7,%xmm7
+ call _aesni_encrypt6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp .L013ecb_ret
+.align 16
+.L018ecb_enc_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L022enc1_loop_3:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L022enc1_loop_3
+.byte 102,15,56,221,209
+ movups %xmm2,(%edi)
+ jmp .L013ecb_ret
+.align 16
+.L019ecb_enc_two:
+ call _aesni_encrypt2
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp .L013ecb_ret
+.align 16
+.L020ecb_enc_three:
+ call _aesni_encrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp .L013ecb_ret
+.align 16
+.L021ecb_enc_four:
+ call _aesni_encrypt4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ jmp .L013ecb_ret
+.align 16
+.L014ecb_decrypt:
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ cmpl $96,%eax
+ jb .L023ecb_dec_tail
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+ subl $96,%eax
+ jmp .L024ecb_dec_loop6_enter
+.align 16
+.L025ecb_dec_loop6:
+ movups %xmm2,(%edi)
+ movdqu (%esi),%xmm2
+ movups %xmm3,16(%edi)
+ movdqu 16(%esi),%xmm3
+ movups %xmm4,32(%edi)
+ movdqu 32(%esi),%xmm4
+ movups %xmm5,48(%edi)
+ movdqu 48(%esi),%xmm5
+ movups %xmm6,64(%edi)
+ movdqu 64(%esi),%xmm6
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+.L024ecb_dec_loop6_enter:
+ call _aesni_decrypt6
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ subl $96,%eax
+ jnc .L025ecb_dec_loop6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ addl $96,%eax
+ jz .L013ecb_ret
+.L023ecb_dec_tail:
+ movups (%esi),%xmm2
+ cmpl $32,%eax
+ jb .L026ecb_dec_one
+ movups 16(%esi),%xmm3
+ je .L027ecb_dec_two
+ movups 32(%esi),%xmm4
+ cmpl $64,%eax
+ jb .L028ecb_dec_three
+ movups 48(%esi),%xmm5
+ je .L029ecb_dec_four
+ movups 64(%esi),%xmm6
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp .L013ecb_ret
+.align 16
+.L026ecb_dec_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L030dec1_loop_4:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L030dec1_loop_4
+.byte 102,15,56,223,209
+ movups %xmm2,(%edi)
+ jmp .L013ecb_ret
+.align 16
+.L027ecb_dec_two:
+ call _aesni_decrypt2
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp .L013ecb_ret
+.align 16
+.L028ecb_dec_three:
+ call _aesni_decrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp .L013ecb_ret
+.align 16
+.L029ecb_dec_four:
+ call _aesni_decrypt4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+.L013ecb_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aes_hw_ecb_encrypt,.-.L_aes_hw_ecb_encrypt_begin
+.globl aes_hw_ccm64_encrypt_blocks
+.hidden aes_hw_ccm64_encrypt_blocks
+.type aes_hw_ccm64_encrypt_blocks,@function
+.align 16
+aes_hw_ccm64_encrypt_blocks:
+.L_aes_hw_ccm64_encrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl 40(%esp),%ecx
+ movl %esp,%ebp
+ subl $60,%esp
+ andl $-16,%esp
+ movl %ebp,48(%esp)
+ movdqu (%ebx),%xmm7
+ movdqu (%ecx),%xmm3
+ movl 240(%edx),%ecx
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
+ movl $1,%ebx
+ xorl %ebp,%ebp
+ movl %ebx,16(%esp)
+ movl %ebp,20(%esp)
+ movl %ebp,24(%esp)
+ movl %ebp,28(%esp)
+ shll $4,%ecx
+ movl $16,%ebx
+ leal (%edx),%ebp
+ movdqa (%esp),%xmm5
+ movdqa %xmm7,%xmm2
+ leal 32(%edx,%ecx,1),%edx
+ subl %ecx,%ebx
+.byte 102,15,56,0,253
+.L031ccm64_enc_outer:
+ movups (%ebp),%xmm0
+ movl %ebx,%ecx
+ movups (%esi),%xmm6
+ xorps %xmm0,%xmm2
+ movups 16(%ebp),%xmm1
+ xorps %xmm6,%xmm0
+ xorps %xmm0,%xmm3
+ movups 32(%ebp),%xmm0
+.L032ccm64_enc2_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L032ccm64_enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ paddq 16(%esp),%xmm7
+ decl %eax
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ leal 16(%esi),%esi
+ xorps %xmm2,%xmm6
+ movdqa %xmm7,%xmm2
+ movups %xmm6,(%edi)
+.byte 102,15,56,0,213
+ leal 16(%edi),%edi
+ jnz .L031ccm64_enc_outer
+ movl 48(%esp),%esp
+ movl 40(%esp),%edi
+ movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aes_hw_ccm64_encrypt_blocks,.-.L_aes_hw_ccm64_encrypt_blocks_begin
+.globl aes_hw_ccm64_decrypt_blocks
+.hidden aes_hw_ccm64_decrypt_blocks
+.type aes_hw_ccm64_decrypt_blocks,@function
+.align 16
+aes_hw_ccm64_decrypt_blocks:
+.L_aes_hw_ccm64_decrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl 40(%esp),%ecx
+ movl %esp,%ebp
+ subl $60,%esp
+ andl $-16,%esp
+ movl %ebp,48(%esp)
+ movdqu (%ebx),%xmm7
+ movdqu (%ecx),%xmm3
+ movl 240(%edx),%ecx
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
+ movl $1,%ebx
+ xorl %ebp,%ebp
+ movl %ebx,16(%esp)
+ movl %ebp,20(%esp)
+ movl %ebp,24(%esp)
+ movl %ebp,28(%esp)
+ movdqa (%esp),%xmm5
+ movdqa %xmm7,%xmm2
+ movl %edx,%ebp
+ movl %ecx,%ebx
+.byte 102,15,56,0,253
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L033enc1_loop_5:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L033enc1_loop_5
+.byte 102,15,56,221,209
+ shll $4,%ebx
+ movl $16,%ecx
+ movups (%esi),%xmm6
+ paddq 16(%esp),%xmm7
+ leal 16(%esi),%esi
+ subl %ebx,%ecx
+ leal 32(%ebp,%ebx,1),%edx
+ movl %ecx,%ebx
+ jmp .L034ccm64_dec_outer
+.align 16
+.L034ccm64_dec_outer:
+ xorps %xmm2,%xmm6
+ movdqa %xmm7,%xmm2
+ movups %xmm6,(%edi)
+ leal 16(%edi),%edi
+.byte 102,15,56,0,213
+ subl $1,%eax
+ jz .L035ccm64_dec_break
+ movups (%ebp),%xmm0
+ movl %ebx,%ecx
+ movups 16(%ebp),%xmm1
+ xorps %xmm0,%xmm6
+ xorps %xmm0,%xmm2
+ xorps %xmm6,%xmm3
+ movups 32(%ebp),%xmm0
+.L036ccm64_dec2_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L036ccm64_dec2_loop
+ movups (%esi),%xmm6
+ paddq 16(%esp),%xmm7
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ leal 16(%esi),%esi
+ jmp .L034ccm64_dec_outer
+.align 16
+.L035ccm64_dec_break:
+ movl 240(%ebp),%ecx
+ movl %ebp,%edx
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm6
+ leal 32(%edx),%edx
+ xorps %xmm6,%xmm3
+.L037enc1_loop_6:
+.byte 102,15,56,220,217
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L037enc1_loop_6
+.byte 102,15,56,221,217
+ movl 48(%esp),%esp
+ movl 40(%esp),%edi
+ movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aes_hw_ccm64_decrypt_blocks,.-.L_aes_hw_ccm64_decrypt_blocks_begin
+.globl aes_hw_ctr32_encrypt_blocks
+.hidden aes_hw_ctr32_encrypt_blocks
+.type aes_hw_ctr32_encrypt_blocks,@function
+.align 16
+aes_hw_ctr32_encrypt_blocks:
+.L_aes_hw_ctr32_encrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+#ifdef BORINGSSL_DISPATCH_TEST
+ pushl %ebx
+ pushl %edx
+ call .L038pic
+.L038pic:
+ popl %ebx
+ leal BORINGSSL_function_hit+0-.L038pic(%ebx),%ebx
+ movl $1,%edx
+ movb %dl,(%ebx)
+ popl %edx
+ popl %ebx
+#endif
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl %esp,%ebp
+ subl $88,%esp
+ andl $-16,%esp
+ movl %ebp,80(%esp)
+ cmpl $1,%eax
+ je .L039ctr32_one_shortcut
+ movdqu (%ebx),%xmm7
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
+ movl $6,%ecx
+ xorl %ebp,%ebp
+ movl %ecx,16(%esp)
+ movl %ecx,20(%esp)
+ movl %ecx,24(%esp)
+ movl %ebp,28(%esp)
+.byte 102,15,58,22,251,3
+.byte 102,15,58,34,253,3
+ movl 240(%edx),%ecx
+ bswap %ebx
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movdqa (%esp),%xmm2
+.byte 102,15,58,34,195,0
+ leal 3(%ebx),%ebp
+.byte 102,15,58,34,205,0
+ incl %ebx
+.byte 102,15,58,34,195,1
+ incl %ebp
+.byte 102,15,58,34,205,1
+ incl %ebx
+.byte 102,15,58,34,195,2
+ incl %ebp
+.byte 102,15,58,34,205,2
+ movdqa %xmm0,48(%esp)
+.byte 102,15,56,0,194
+ movdqu (%edx),%xmm6
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
+ pshufd $192,%xmm0,%xmm2
+ pshufd $128,%xmm0,%xmm3
+ cmpl $6,%eax
+ jb .L040ctr32_tail
+ pxor %xmm6,%xmm7
+ shll $4,%ecx
+ movl $16,%ebx
+ movdqa %xmm7,32(%esp)
+ movl %edx,%ebp
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ subl $6,%eax
+ jmp .L041ctr32_loop6
+.align 16
+.L041ctr32_loop6:
+ pshufd $64,%xmm0,%xmm4
+ movdqa 32(%esp),%xmm0
+ pshufd $192,%xmm1,%xmm5
+ pxor %xmm0,%xmm2
+ pshufd $128,%xmm1,%xmm6
+ pxor %xmm0,%xmm3
+ pshufd $64,%xmm1,%xmm7
+ movups 16(%ebp),%xmm1
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm6
+ pxor %xmm0,%xmm7
+.byte 102,15,56,220,217
+ movups 32(%ebp),%xmm0
+ movl %ebx,%ecx
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ call .L_aesni_encrypt6_enter
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps %xmm1,%xmm2
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm3
+ movups %xmm2,(%edi)
+ movdqa 16(%esp),%xmm0
+ xorps %xmm1,%xmm4
+ movdqa 64(%esp),%xmm1
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ paddd %xmm0,%xmm1
+ paddd 48(%esp),%xmm0
+ movdqa (%esp),%xmm2
+ movups 48(%esi),%xmm3
+ movups 64(%esi),%xmm4
+ xorps %xmm3,%xmm5
+ movups 80(%esi),%xmm3
+ leal 96(%esi),%esi
+ movdqa %xmm0,48(%esp)
+.byte 102,15,56,0,194
+ xorps %xmm4,%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm3,%xmm7
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
+ movups %xmm6,64(%edi)
+ pshufd $192,%xmm0,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ pshufd $128,%xmm0,%xmm3
+ subl $6,%eax
+ jnc .L041ctr32_loop6
+ addl $6,%eax
+ jz .L042ctr32_ret
+ movdqu (%ebp),%xmm7
+ movl %ebp,%edx
+ pxor 32(%esp),%xmm7
+ movl 240(%ebp),%ecx
+.L040ctr32_tail:
+ por %xmm7,%xmm2
+ cmpl $2,%eax
+ jb .L043ctr32_one
+ pshufd $64,%xmm0,%xmm4
+ por %xmm7,%xmm3
+ je .L044ctr32_two
+ pshufd $192,%xmm1,%xmm5
+ por %xmm7,%xmm4
+ cmpl $4,%eax
+ jb .L045ctr32_three
+ pshufd $128,%xmm1,%xmm6
+ por %xmm7,%xmm5
+ je .L046ctr32_four
+ por %xmm7,%xmm6
+ call _aesni_encrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps %xmm1,%xmm2
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm3
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm4
+ movups 64(%esi),%xmm1
+ xorps %xmm0,%xmm5
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp .L042ctr32_ret
+.align 16
+.L039ctr32_one_shortcut:
+ movups (%ebx),%xmm2
+ movl 240(%edx),%ecx
+.L043ctr32_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L047enc1_loop_7:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L047enc1_loop_7
+.byte 102,15,56,221,209
+ movups (%esi),%xmm6
+ xorps %xmm2,%xmm6
+ movups %xmm6,(%edi)
+ jmp .L042ctr32_ret
+.align 16
+.L044ctr32_two:
+ call _aesni_encrypt2
+ movups (%esi),%xmm5
+ movups 16(%esi),%xmm6
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp .L042ctr32_ret
+.align 16
+.L045ctr32_three:
+ call _aesni_encrypt3
+ movups (%esi),%xmm5
+ movups 16(%esi),%xmm6
+ xorps %xmm5,%xmm2
+ movups 32(%esi),%xmm7
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm7,%xmm4
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp .L042ctr32_ret
+.align 16
+.L046ctr32_four:
+ call _aesni_encrypt4
+ movups (%esi),%xmm6
+ movups 16(%esi),%xmm7
+ movups 32(%esi),%xmm1
+ xorps %xmm6,%xmm2
+ movups 48(%esi),%xmm0
+ xorps %xmm7,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm4
+ movups %xmm3,16(%edi)
+ xorps %xmm0,%xmm5
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+.L042ctr32_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movl 80(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aes_hw_ctr32_encrypt_blocks,.-.L_aes_hw_ctr32_encrypt_blocks_begin
+.globl aes_hw_xts_encrypt
+.hidden aes_hw_xts_encrypt
+.type aes_hw_xts_encrypt,@function
+.align 16
+aes_hw_xts_encrypt:
+.L_aes_hw_xts_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 36(%esp),%edx
+ movl 40(%esp),%esi
+ movl 240(%edx),%ecx
+ movups (%esi),%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L048enc1_loop_8:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L048enc1_loop_8
+.byte 102,15,56,221,209
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl %esp,%ebp
+ subl $120,%esp
+ movl 240(%edx),%ecx
+ andl $-16,%esp
+ movl $135,96(%esp)
+ movl $0,100(%esp)
+ movl $1,104(%esp)
+ movl $0,108(%esp)
+ movl %eax,112(%esp)
+ movl %ebp,116(%esp)
+ movdqa %xmm2,%xmm1
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ pcmpgtd %xmm1,%xmm0
+ andl $-16,%eax
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ subl $96,%eax
+ jc .L049xts_enc_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp .L050xts_enc_loop6
+.align 16
+.L050xts_enc_loop6:
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,16(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,32(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,64(%esp)
+ paddq %xmm1,%xmm1
+ movups (%ebp),%xmm0
+ pand %xmm3,%xmm7
+ movups (%esi),%xmm2
+ pxor %xmm1,%xmm7
+ movl %ebx,%ecx
+ movdqu 16(%esi),%xmm3
+ xorps %xmm0,%xmm2
+ movdqu 32(%esi),%xmm4
+ pxor %xmm0,%xmm3
+ movdqu 48(%esi),%xmm5
+ pxor %xmm0,%xmm4
+ movdqu 64(%esi),%xmm6
+ pxor %xmm0,%xmm5
+ movdqu 80(%esi),%xmm1
+ pxor %xmm0,%xmm6
+ leal 96(%esi),%esi
+ pxor (%esp),%xmm2
+ movdqa %xmm7,80(%esp)
+ pxor %xmm1,%xmm7
+ movups 16(%ebp),%xmm1
+ pxor 16(%esp),%xmm3
+ pxor 32(%esp),%xmm4
+.byte 102,15,56,220,209
+ pxor 48(%esp),%xmm5
+ pxor 64(%esp),%xmm6
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ call .L_aesni_encrypt6_enter
+ movdqa 80(%esp),%xmm1
+ pxor %xmm0,%xmm0
+ xorps (%esp),%xmm2
+ pcmpgtd %xmm1,%xmm0
+ xorps 16(%esp),%xmm3
+ movups %xmm2,(%edi)
+ xorps 32(%esp),%xmm4
+ movups %xmm3,16(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm4,32(%edi)
+ xorps 64(%esp),%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm1,%xmm7
+ movups %xmm6,64(%edi)
+ pshufd $19,%xmm0,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqa 96(%esp),%xmm3
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ subl $96,%eax
+ jnc .L050xts_enc_loop6
+ movl 240(%ebp),%ecx
+ movl %ebp,%edx
+ movl %ecx,%ebx
+.L049xts_enc_short:
+ addl $96,%eax
+ jz .L051xts_enc_done6x
+ movdqa %xmm1,%xmm5
+ cmpl $32,%eax
+ jb .L052xts_enc_one
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ je .L053xts_enc_two
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ cmpl $64,%eax
+ jb .L054xts_enc_three
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm7
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,(%esp)
+ movdqa %xmm6,16(%esp)
+ je .L055xts_enc_four
+ movdqa %xmm7,32(%esp)
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm7
+ pxor %xmm1,%xmm7
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ pxor (%esp),%xmm2
+ movdqu 48(%esi),%xmm5
+ pxor 16(%esp),%xmm3
+ movdqu 64(%esi),%xmm6
+ pxor 32(%esp),%xmm4
+ leal 80(%esi),%esi
+ pxor 48(%esp),%xmm5
+ movdqa %xmm7,64(%esp)
+ pxor %xmm7,%xmm6
+ call _aesni_encrypt6
+ movaps 64(%esp),%xmm1
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps 32(%esp),%xmm4
+ movups %xmm2,(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm3,16(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ jmp .L056xts_enc_done
+.align 16
+.L052xts_enc_one:
+ movups (%esi),%xmm2
+ leal 16(%esi),%esi
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L057enc1_loop_9:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L057enc1_loop_9
+.byte 102,15,56,221,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ movdqa %xmm5,%xmm1
+ jmp .L056xts_enc_done
+.align 16
+.L053xts_enc_two:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ leal 32(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ call _aesni_encrypt2
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 32(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L056xts_enc_done
+.align 16
+.L054xts_enc_three:
+ movaps %xmm1,%xmm7
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ leal 48(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ call _aesni_encrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ leal 48(%edi),%edi
+ movdqa %xmm7,%xmm1
+ jmp .L056xts_enc_done
+.align 16
+.L055xts_enc_four:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ xorps (%esp),%xmm2
+ movups 48(%esi),%xmm5
+ leal 64(%esi),%esi
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ xorps %xmm6,%xmm5
+ call _aesni_encrypt4
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ xorps %xmm6,%xmm5
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ leal 64(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L056xts_enc_done
+.align 16
+.L051xts_enc_done6x:
+ movl 112(%esp),%eax
+ andl $15,%eax
+ jz .L058xts_enc_ret
+ movdqa %xmm1,%xmm5
+ movl %eax,112(%esp)
+ jmp .L059xts_enc_steal
+.align 16
+.L056xts_enc_done:
+ movl 112(%esp),%eax
+ pxor %xmm0,%xmm0
+ andl $15,%eax
+ jz .L058xts_enc_ret
+ pcmpgtd %xmm1,%xmm0
+ movl %eax,112(%esp)
+ pshufd $19,%xmm0,%xmm5
+ paddq %xmm1,%xmm1
+ pand 96(%esp),%xmm5
+ pxor %xmm1,%xmm5
+.L059xts_enc_steal:
+ movzbl (%esi),%ecx
+ movzbl -16(%edi),%edx
+ leal 1(%esi),%esi
+ movb %cl,-16(%edi)
+ movb %dl,(%edi)
+ leal 1(%edi),%edi
+ subl $1,%eax
+ jnz .L059xts_enc_steal
+ subl 112(%esp),%edi
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups -16(%edi),%xmm2
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L060enc1_loop_10:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L060enc1_loop_10
+.byte 102,15,56,221,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,-16(%edi)
+.L058xts_enc_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
+ movl 116(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aes_hw_xts_encrypt,.-.L_aes_hw_xts_encrypt_begin
+.globl aes_hw_xts_decrypt
+.hidden aes_hw_xts_decrypt
+.type aes_hw_xts_decrypt,@function
+.align 16
+aes_hw_xts_decrypt:
+.L_aes_hw_xts_decrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 36(%esp),%edx
+ movl 40(%esp),%esi
+ movl 240(%edx),%ecx
+ movups (%esi),%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L061enc1_loop_11:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L061enc1_loop_11
+.byte 102,15,56,221,209
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl %esp,%ebp
+ subl $120,%esp
+ andl $-16,%esp
+ xorl %ebx,%ebx
+ testl $15,%eax
+ setnz %bl
+ shll $4,%ebx
+ subl %ebx,%eax
+ movl $135,96(%esp)
+ movl $0,100(%esp)
+ movl $1,104(%esp)
+ movl $0,108(%esp)
+ movl %eax,112(%esp)
+ movl %ebp,116(%esp)
+ movl 240(%edx),%ecx
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ movdqa %xmm2,%xmm1
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ pcmpgtd %xmm1,%xmm0
+ andl $-16,%eax
+ subl $96,%eax
+ jc .L062xts_dec_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp .L063xts_dec_loop6
+.align 16
+.L063xts_dec_loop6:
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,16(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,32(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,64(%esp)
+ paddq %xmm1,%xmm1
+ movups (%ebp),%xmm0
+ pand %xmm3,%xmm7
+ movups (%esi),%xmm2
+ pxor %xmm1,%xmm7
+ movl %ebx,%ecx
+ movdqu 16(%esi),%xmm3
+ xorps %xmm0,%xmm2
+ movdqu 32(%esi),%xmm4
+ pxor %xmm0,%xmm3
+ movdqu 48(%esi),%xmm5
+ pxor %xmm0,%xmm4
+ movdqu 64(%esi),%xmm6
+ pxor %xmm0,%xmm5
+ movdqu 80(%esi),%xmm1
+ pxor %xmm0,%xmm6
+ leal 96(%esi),%esi
+ pxor (%esp),%xmm2
+ movdqa %xmm7,80(%esp)
+ pxor %xmm1,%xmm7
+ movups 16(%ebp),%xmm1
+ pxor 16(%esp),%xmm3
+ pxor 32(%esp),%xmm4
+.byte 102,15,56,222,209
+ pxor 48(%esp),%xmm5
+ pxor 64(%esp),%xmm6
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ call .L_aesni_decrypt6_enter
+ movdqa 80(%esp),%xmm1
+ pxor %xmm0,%xmm0
+ xorps (%esp),%xmm2
+ pcmpgtd %xmm1,%xmm0
+ xorps 16(%esp),%xmm3
+ movups %xmm2,(%edi)
+ xorps 32(%esp),%xmm4
+ movups %xmm3,16(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm4,32(%edi)
+ xorps 64(%esp),%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm1,%xmm7
+ movups %xmm6,64(%edi)
+ pshufd $19,%xmm0,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqa 96(%esp),%xmm3
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ subl $96,%eax
+ jnc .L063xts_dec_loop6
+ movl 240(%ebp),%ecx
+ movl %ebp,%edx
+ movl %ecx,%ebx
+.L062xts_dec_short:
+ addl $96,%eax
+ jz .L064xts_dec_done6x
+ movdqa %xmm1,%xmm5
+ cmpl $32,%eax
+ jb .L065xts_dec_one
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ je .L066xts_dec_two
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ cmpl $64,%eax
+ jb .L067xts_dec_three
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm7
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,(%esp)
+ movdqa %xmm6,16(%esp)
+ je .L068xts_dec_four
+ movdqa %xmm7,32(%esp)
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm7
+ pxor %xmm1,%xmm7
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ pxor (%esp),%xmm2
+ movdqu 48(%esi),%xmm5
+ pxor 16(%esp),%xmm3
+ movdqu 64(%esi),%xmm6
+ pxor 32(%esp),%xmm4
+ leal 80(%esi),%esi
+ pxor 48(%esp),%xmm5
+ movdqa %xmm7,64(%esp)
+ pxor %xmm7,%xmm6
+ call _aesni_decrypt6
+ movaps 64(%esp),%xmm1
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps 32(%esp),%xmm4
+ movups %xmm2,(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm3,16(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ jmp .L069xts_dec_done
+.align 16
+.L065xts_dec_one:
+ movups (%esi),%xmm2
+ leal 16(%esi),%esi
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L070dec1_loop_12:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L070dec1_loop_12
+.byte 102,15,56,223,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ movdqa %xmm5,%xmm1
+ jmp .L069xts_dec_done
+.align 16
+.L066xts_dec_two:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ leal 32(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ call _aesni_decrypt2
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 32(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L069xts_dec_done
+.align 16
+.L067xts_dec_three:
+ movaps %xmm1,%xmm7
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ leal 48(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ call _aesni_decrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ leal 48(%edi),%edi
+ movdqa %xmm7,%xmm1
+ jmp .L069xts_dec_done
+.align 16
+.L068xts_dec_four:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ xorps (%esp),%xmm2
+ movups 48(%esi),%xmm5
+ leal 64(%esi),%esi
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ xorps %xmm6,%xmm5
+ call _aesni_decrypt4
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ xorps %xmm6,%xmm5
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ leal 64(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L069xts_dec_done
+.align 16
+.L064xts_dec_done6x:
+ movl 112(%esp),%eax
+ andl $15,%eax
+ jz .L071xts_dec_ret
+ movl %eax,112(%esp)
+ jmp .L072xts_dec_only_one_more
+.align 16
+.L069xts_dec_done:
+ movl 112(%esp),%eax
+ pxor %xmm0,%xmm0
+ andl $15,%eax
+ jz .L071xts_dec_ret
+ pcmpgtd %xmm1,%xmm0
+ movl %eax,112(%esp)
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+.L072xts_dec_only_one_more:
+ pshufd $19,%xmm0,%xmm5
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm5
+ pxor %xmm1,%xmm5
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups (%esi),%xmm2
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L073dec1_loop_13:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L073dec1_loop_13
+.byte 102,15,56,223,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+.L074xts_dec_steal:
+ movzbl 16(%esi),%ecx
+ movzbl (%edi),%edx
+ leal 1(%esi),%esi
+ movb %cl,(%edi)
+ movb %dl,16(%edi)
+ leal 1(%edi),%edi
+ subl $1,%eax
+ jnz .L074xts_dec_steal
+ subl 112(%esp),%edi
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups (%edi),%xmm2
+ xorps %xmm6,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L075dec1_loop_14:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L075dec1_loop_14
+.byte 102,15,56,223,209
+ xorps %xmm6,%xmm2
+ movups %xmm2,(%edi)
+.L071xts_dec_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
+ movl 116(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aes_hw_xts_decrypt,.-.L_aes_hw_xts_decrypt_begin
+.globl aes_hw_cbc_encrypt
+.hidden aes_hw_cbc_encrypt
+.type aes_hw_cbc_encrypt,@function
+.align 16
+aes_hw_cbc_encrypt:
+.L_aes_hw_cbc_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl %esp,%ebx
+ movl 24(%esp),%edi
+ subl $24,%ebx
+ movl 28(%esp),%eax
+ andl $-16,%ebx
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebp
+ testl %eax,%eax
+ jz .L076cbc_abort
+ cmpl $0,40(%esp)
+ xchgl %esp,%ebx
+ movups (%ebp),%xmm7
+ movl 240(%edx),%ecx
+ movl %edx,%ebp
+ movl %ebx,16(%esp)
+ movl %ecx,%ebx
+ je .L077cbc_decrypt
+ movaps %xmm7,%xmm2
+ cmpl $16,%eax
+ jb .L078cbc_enc_tail
+ subl $16,%eax
+ jmp .L079cbc_enc_loop
+.align 16
+.L079cbc_enc_loop:
+ movups (%esi),%xmm7
+ leal 16(%esi),%esi
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm7
+ leal 32(%edx),%edx
+ xorps %xmm7,%xmm2
+.L080enc1_loop_15:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L080enc1_loop_15
+.byte 102,15,56,221,209
+ movl %ebx,%ecx
+ movl %ebp,%edx
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ subl $16,%eax
+ jnc .L079cbc_enc_loop
+ addl $16,%eax
+ jnz .L078cbc_enc_tail
+ movaps %xmm2,%xmm7
+ pxor %xmm2,%xmm2
+ jmp .L081cbc_ret
+.L078cbc_enc_tail:
+ movl %eax,%ecx
+.long 2767451785
+ movl $16,%ecx
+ subl %eax,%ecx
+ xorl %eax,%eax
+.long 2868115081
+ leal -16(%edi),%edi
+ movl %ebx,%ecx
+ movl %edi,%esi
+ movl %ebp,%edx
+ jmp .L079cbc_enc_loop
+.align 16
+.L077cbc_decrypt:
+ cmpl $80,%eax
+ jbe .L082cbc_dec_tail
+ movaps %xmm7,(%esp)
+ subl $80,%eax
+ jmp .L083cbc_dec_loop6_enter
+.align 16
+.L084cbc_dec_loop6:
+ movaps %xmm0,(%esp)
+ movups %xmm7,(%edi)
+ leal 16(%edi),%edi
+.L083cbc_dec_loop6_enter:
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ call _aesni_decrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps (%esp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%esi),%xmm1
+ xorps %xmm0,%xmm6
+ movups 80(%esi),%xmm0
+ xorps %xmm1,%xmm7
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 96(%esi),%esi
+ movups %xmm4,32(%edi)
+ movl %ebx,%ecx
+ movups %xmm5,48(%edi)
+ movl %ebp,%edx
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ subl $96,%eax
+ ja .L084cbc_dec_loop6
+ movaps %xmm7,%xmm2
+ movaps %xmm0,%xmm7
+ addl $80,%eax
+ jle .L085cbc_dec_clear_tail_collected
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+.L082cbc_dec_tail:
+ movups (%esi),%xmm2
+ movaps %xmm2,%xmm6
+ cmpl $16,%eax
+ jbe .L086cbc_dec_one
+ movups 16(%esi),%xmm3
+ movaps %xmm3,%xmm5
+ cmpl $32,%eax
+ jbe .L087cbc_dec_two
+ movups 32(%esi),%xmm4
+ cmpl $48,%eax
+ jbe .L088cbc_dec_three
+ movups 48(%esi),%xmm5
+ cmpl $64,%eax
+ jbe .L089cbc_dec_four
+ movups 64(%esi),%xmm6
+ movaps %xmm7,(%esp)
+ movups (%esi),%xmm2
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps (%esp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%esi),%xmm7
+ xorps %xmm0,%xmm6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
+ movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
+ movups %xmm5,48(%edi)
+ pxor %xmm5,%xmm5
+ leal 64(%edi),%edi
+ movaps %xmm6,%xmm2
+ pxor %xmm6,%xmm6
+ subl $80,%eax
+ jmp .L090cbc_dec_tail_collected
+.align 16
+.L086cbc_dec_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L091dec1_loop_16:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L091dec1_loop_16
+.byte 102,15,56,223,209
+ xorps %xmm7,%xmm2
+ movaps %xmm6,%xmm7
+ subl $16,%eax
+ jmp .L090cbc_dec_tail_collected
+.align 16
+.L087cbc_dec_two:
+ call _aesni_decrypt2
+ xorps %xmm7,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movaps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ leal 16(%edi),%edi
+ movaps %xmm5,%xmm7
+ subl $32,%eax
+ jmp .L090cbc_dec_tail_collected
+.align 16
+.L088cbc_dec_three:
+ call _aesni_decrypt3
+ xorps %xmm7,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm5,%xmm4
+ movups %xmm2,(%edi)
+ movaps %xmm4,%xmm2
+ pxor %xmm4,%xmm4
+ movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
+ leal 32(%edi),%edi
+ movups 32(%esi),%xmm7
+ subl $48,%eax
+ jmp .L090cbc_dec_tail_collected
+.align 16
+.L089cbc_dec_four:
+ call _aesni_decrypt4
+ movups 16(%esi),%xmm1
+ movups 32(%esi),%xmm0
+ xorps %xmm7,%xmm2
+ movups 48(%esi),%xmm7
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm4
+ movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
+ xorps %xmm0,%xmm5
+ movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
+ leal 48(%edi),%edi
+ movaps %xmm5,%xmm2
+ pxor %xmm5,%xmm5
+ subl $64,%eax
+ jmp .L090cbc_dec_tail_collected
+.align 16
+.L085cbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+.L090cbc_dec_tail_collected:
+ andl $15,%eax
+ jnz .L092cbc_dec_tail_partial
+ movups %xmm2,(%edi)
+ pxor %xmm0,%xmm0
+ jmp .L081cbc_ret
+.align 16
+.L092cbc_dec_tail_partial:
+ movaps %xmm2,(%esp)
+ pxor %xmm0,%xmm0
+ movl $16,%ecx
+ movl %esp,%esi
+ subl %eax,%ecx
+.long 2767451785
+ movdqa %xmm2,(%esp)
+.L081cbc_ret:
+ movl 16(%esp),%esp
+ movl 36(%esp),%ebp
+ pxor %xmm2,%xmm2
+ pxor %xmm1,%xmm1
+ movups %xmm7,(%ebp)
+ pxor %xmm7,%xmm7
+.L076cbc_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aes_hw_cbc_encrypt,.-.L_aes_hw_cbc_encrypt_begin
+.hidden _aesni_set_encrypt_key
+.type _aesni_set_encrypt_key,@function
+.align 16
+_aesni_set_encrypt_key:
+ pushl %ebp
+ pushl %ebx
+ testl %eax,%eax
+ jz .L093bad_pointer
+ testl %edx,%edx
+ jz .L093bad_pointer
+ call .L094pic
+.L094pic:
+ popl %ebx
+ leal .Lkey_const-.L094pic(%ebx),%ebx
+ leal OPENSSL_ia32cap_P-.Lkey_const(%ebx),%ebp
+ movups (%eax),%xmm0
+ xorps %xmm4,%xmm4
+ movl 4(%ebp),%ebp
+ leal 16(%edx),%edx
+ andl $268437504,%ebp
+ cmpl $256,%ecx
+ je .L09514rounds
+ cmpl $192,%ecx
+ je .L09612rounds
+ cmpl $128,%ecx
+ jne .L097bad_keybits
+.align 16
+.L09810rounds:
+ cmpl $268435456,%ebp
+ je .L09910rounds_alt
+ movl $9,%ecx
+ movups %xmm0,-16(%edx)
+.byte 102,15,58,223,200,1
+ call .L100key_128_cold
+.byte 102,15,58,223,200,2
+ call .L101key_128
+.byte 102,15,58,223,200,4
+ call .L101key_128
+.byte 102,15,58,223,200,8
+ call .L101key_128
+.byte 102,15,58,223,200,16
+ call .L101key_128
+.byte 102,15,58,223,200,32
+ call .L101key_128
+.byte 102,15,58,223,200,64
+ call .L101key_128
+.byte 102,15,58,223,200,128
+ call .L101key_128
+.byte 102,15,58,223,200,27
+ call .L101key_128
+.byte 102,15,58,223,200,54
+ call .L101key_128
+ movups %xmm0,(%edx)
+ movl %ecx,80(%edx)
+ jmp .L102good_key
+.align 16
+.L101key_128:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+.L100key_128_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ ret
+.align 16
+.L09910rounds_alt:
+ movdqa (%ebx),%xmm5
+ movl $8,%ecx
+ movdqa 32(%ebx),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,-16(%edx)
+.L103loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leal 16(%edx),%edx
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%edx)
+ movdqa %xmm0,%xmm2
+ decl %ecx
+ jnz .L103loop_key128
+ movdqa 48(%ebx),%xmm4
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%edx)
+ movl $9,%ecx
+ movl %ecx,96(%edx)
+ jmp .L102good_key
+.align 16
+.L09612rounds:
+ movq 16(%eax),%xmm2
+ cmpl $268435456,%ebp
+ je .L10412rounds_alt
+ movl $11,%ecx
+ movups %xmm0,-16(%edx)
+.byte 102,15,58,223,202,1
+ call .L105key_192a_cold
+.byte 102,15,58,223,202,2
+ call .L106key_192b
+.byte 102,15,58,223,202,4
+ call .L107key_192a
+.byte 102,15,58,223,202,8
+ call .L106key_192b
+.byte 102,15,58,223,202,16
+ call .L107key_192a
+.byte 102,15,58,223,202,32
+ call .L106key_192b
+.byte 102,15,58,223,202,64
+ call .L107key_192a
+.byte 102,15,58,223,202,128
+ call .L106key_192b
+ movups %xmm0,(%edx)
+ movl %ecx,48(%edx)
+ jmp .L102good_key
+.align 16
+.L107key_192a:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+.align 16
+.L105key_192a_cold:
+ movaps %xmm2,%xmm5
+.L108key_192b_warm:
+ shufps $16,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ pslldq $4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd $85,%xmm1,%xmm1
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ ret
+.align 16
+.L106key_192b:
+ movaps %xmm0,%xmm3
+ shufps $68,%xmm0,%xmm5
+ movups %xmm5,(%edx)
+ shufps $78,%xmm2,%xmm3
+ movups %xmm3,16(%edx)
+ leal 32(%edx),%edx
+ jmp .L108key_192b_warm
+.align 16
+.L10412rounds_alt:
+ movdqa 16(%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $8,%ecx
+ movdqu %xmm0,-16(%edx)
+.L109loop_key192:
+ movq %xmm2,(%edx)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leal 24(%edx),%edx
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%edx)
+ decl %ecx
+ jnz .L109loop_key192
+ movl $11,%ecx
+ movl %ecx,32(%edx)
+ jmp .L102good_key
+.align 16
+.L09514rounds:
+ movups 16(%eax),%xmm2
+ leal 16(%edx),%edx
+ cmpl $268435456,%ebp
+ je .L11014rounds_alt
+ movl $13,%ecx
+ movups %xmm0,-32(%edx)
+ movups %xmm2,-16(%edx)
+.byte 102,15,58,223,202,1
+ call .L111key_256a_cold
+.byte 102,15,58,223,200,1
+ call .L112key_256b
+.byte 102,15,58,223,202,2
+ call .L113key_256a
+.byte 102,15,58,223,200,2
+ call .L112key_256b
+.byte 102,15,58,223,202,4
+ call .L113key_256a
+.byte 102,15,58,223,200,4
+ call .L112key_256b
+.byte 102,15,58,223,202,8
+ call .L113key_256a
+.byte 102,15,58,223,200,8
+ call .L112key_256b
+.byte 102,15,58,223,202,16
+ call .L113key_256a
+.byte 102,15,58,223,200,16
+ call .L112key_256b
+.byte 102,15,58,223,202,32
+ call .L113key_256a
+.byte 102,15,58,223,200,32
+ call .L112key_256b
+.byte 102,15,58,223,202,64
+ call .L113key_256a
+ movups %xmm0,(%edx)
+ movl %ecx,16(%edx)
+ xorl %eax,%eax
+ jmp .L102good_key
+.align 16
+.L113key_256a:
+ movups %xmm2,(%edx)
+ leal 16(%edx),%edx
+.L111key_256a_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ ret
+.align 16
+.L112key_256b:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+ shufps $16,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $140,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $170,%xmm1,%xmm1
+ xorps %xmm1,%xmm2
+ ret
+.align 16
+.L11014rounds_alt:
+ movdqa (%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $7,%ecx
+ movdqu %xmm0,-32(%edx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,-16(%edx)
+.L114loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ decl %ecx
+ jz .L115done_key256
+ pshufd $255,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%edx)
+ leal 32(%edx),%edx
+ movdqa %xmm2,%xmm1
+ jmp .L114loop_key256
+.L115done_key256:
+ movl $13,%ecx
+ movl %ecx,16(%edx)
+.L102good_key:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ xorl %eax,%eax
+ popl %ebx
+ popl %ebp
+ ret
+.align 4
+.L093bad_pointer:
+ movl $-1,%eax
+ popl %ebx
+ popl %ebp
+ ret
+.align 4
+.L097bad_keybits:
+ pxor %xmm0,%xmm0
+ movl $-2,%eax
+ popl %ebx
+ popl %ebp
+ ret
+.size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key
+.globl aes_hw_set_encrypt_key
+.hidden aes_hw_set_encrypt_key
+.type aes_hw_set_encrypt_key,@function
+.align 16
+aes_hw_set_encrypt_key:
+.L_aes_hw_set_encrypt_key_begin:
+#ifdef BORINGSSL_DISPATCH_TEST
+ pushl %ebx
+ pushl %edx
+ call .L116pic
+.L116pic:
+ popl %ebx
+ leal BORINGSSL_function_hit+3-.L116pic(%ebx),%ebx
+ movl $1,%edx
+ movb %dl,(%ebx)
+ popl %edx
+ popl %ebx
+#endif
+ movl 4(%esp),%eax
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ call _aesni_set_encrypt_key
+ ret
+.size aes_hw_set_encrypt_key,.-.L_aes_hw_set_encrypt_key_begin
+.globl aes_hw_set_decrypt_key
+.hidden aes_hw_set_decrypt_key
+.type aes_hw_set_decrypt_key,@function
+.align 16
+aes_hw_set_decrypt_key:
+.L_aes_hw_set_decrypt_key_begin:
+ movl 4(%esp),%eax
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ call _aesni_set_encrypt_key
+ movl 12(%esp),%edx
+ shll $4,%ecx
+ testl %eax,%eax
+ jnz .L117dec_key_ret
+ leal 16(%edx,%ecx,1),%eax
+ movups (%edx),%xmm0
+ movups (%eax),%xmm1
+ movups %xmm0,(%eax)
+ movups %xmm1,(%edx)
+ leal 16(%edx),%edx
+ leal -16(%eax),%eax
+.L118dec_key_inverse:
+ movups (%edx),%xmm0
+ movups (%eax),%xmm1
+.byte 102,15,56,219,192
+.byte 102,15,56,219,201
+ leal 16(%edx),%edx
+ leal -16(%eax),%eax
+ movups %xmm0,16(%eax)
+ movups %xmm1,-16(%edx)
+ cmpl %edx,%eax
+ ja .L118dec_key_inverse
+ movups (%edx),%xmm0
+.byte 102,15,56,219,192
+ movups %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ xorl %eax,%eax
+.L117dec_key_ret:
+ ret
+.size aes_hw_set_decrypt_key,.-.L_aes_hw_set_decrypt_key_begin
+.align 64
+.Lkey_const:
+.long 202313229,202313229,202313229,202313229
+.long 67569157,67569157,67569157,67569157
+.long 1,1,1,1
+.long 27,27,27,27
+.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
+.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
+.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
+.byte 115,108,46,111,114,103,62,0
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/aesni-x86-win.asm b/gen/bcm/aesni-x86-win.asm
new file mode 100644
index 0000000..19b1d98
--- /dev/null
+++ b/gen/bcm/aesni-x86-win.asm
@@ -0,0 +1,2466 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section code use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section .text code align=64
+%else
+section .text code
+%endif
+;extern _OPENSSL_ia32cap_P
+%ifdef BORINGSSL_DISPATCH_TEST
+extern _BORINGSSL_function_hit
+%endif
+global _aes_hw_encrypt
+align 16
+_aes_hw_encrypt:
+L$_aes_hw_encrypt_begin:
+%ifdef BORINGSSL_DISPATCH_TEST
+ push ebx
+ push edx
+ call L$000pic
+L$000pic:
+ pop ebx
+ lea ebx,[(_BORINGSSL_function_hit+1-L$000pic)+ebx]
+ mov edx,1
+ mov BYTE [ebx],dl
+ pop edx
+ pop ebx
+%endif
+ mov eax,DWORD [4+esp]
+ mov edx,DWORD [12+esp]
+ movups xmm2,[eax]
+ mov ecx,DWORD [240+edx]
+ mov eax,DWORD [8+esp]
+ movups xmm0,[edx]
+ movups xmm1,[16+edx]
+ lea edx,[32+edx]
+ xorps xmm2,xmm0
+L$001enc1_loop_1:
+db 102,15,56,220,209
+ dec ecx
+ movups xmm1,[edx]
+ lea edx,[16+edx]
+ jnz NEAR L$001enc1_loop_1
+db 102,15,56,221,209
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ movups [eax],xmm2
+ pxor xmm2,xmm2
+ ret
+global _aes_hw_decrypt
+align 16
+_aes_hw_decrypt:
+L$_aes_hw_decrypt_begin:
+ mov eax,DWORD [4+esp]
+ mov edx,DWORD [12+esp]
+ movups xmm2,[eax]
+ mov ecx,DWORD [240+edx]
+ mov eax,DWORD [8+esp]
+ movups xmm0,[edx]
+ movups xmm1,[16+edx]
+ lea edx,[32+edx]
+ xorps xmm2,xmm0
+L$002dec1_loop_2:
+db 102,15,56,222,209
+ dec ecx
+ movups xmm1,[edx]
+ lea edx,[16+edx]
+ jnz NEAR L$002dec1_loop_2
+db 102,15,56,223,209
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ movups [eax],xmm2
+ pxor xmm2,xmm2
+ ret
+align 16
+__aesni_encrypt2:
+ movups xmm0,[edx]
+ shl ecx,4
+ movups xmm1,[16+edx]
+ xorps xmm2,xmm0
+ pxor xmm3,xmm0
+ movups xmm0,[32+edx]
+ lea edx,[32+ecx*1+edx]
+ neg ecx
+ add ecx,16
+L$003enc2_loop:
+db 102,15,56,220,209
+db 102,15,56,220,217
+ movups xmm1,[ecx*1+edx]
+ add ecx,32
+db 102,15,56,220,208
+db 102,15,56,220,216
+ movups xmm0,[ecx*1+edx-16]
+ jnz NEAR L$003enc2_loop
+db 102,15,56,220,209
+db 102,15,56,220,217
+db 102,15,56,221,208
+db 102,15,56,221,216
+ ret
+align 16
+__aesni_decrypt2:
+ movups xmm0,[edx]
+ shl ecx,4
+ movups xmm1,[16+edx]
+ xorps xmm2,xmm0
+ pxor xmm3,xmm0
+ movups xmm0,[32+edx]
+ lea edx,[32+ecx*1+edx]
+ neg ecx
+ add ecx,16
+L$004dec2_loop:
+db 102,15,56,222,209
+db 102,15,56,222,217
+ movups xmm1,[ecx*1+edx]
+ add ecx,32
+db 102,15,56,222,208
+db 102,15,56,222,216
+ movups xmm0,[ecx*1+edx-16]
+ jnz NEAR L$004dec2_loop
+db 102,15,56,222,209
+db 102,15,56,222,217
+db 102,15,56,223,208
+db 102,15,56,223,216
+ ret
+align 16
+__aesni_encrypt3:
+ movups xmm0,[edx]
+ shl ecx,4
+ movups xmm1,[16+edx]
+ xorps xmm2,xmm0
+ pxor xmm3,xmm0
+ pxor xmm4,xmm0
+ movups xmm0,[32+edx]
+ lea edx,[32+ecx*1+edx]
+ neg ecx
+ add ecx,16
+L$005enc3_loop:
+db 102,15,56,220,209
+db 102,15,56,220,217
+db 102,15,56,220,225
+ movups xmm1,[ecx*1+edx]
+ add ecx,32
+db 102,15,56,220,208
+db 102,15,56,220,216
+db 102,15,56,220,224
+ movups xmm0,[ecx*1+edx-16]
+ jnz NEAR L$005enc3_loop
+db 102,15,56,220,209
+db 102,15,56,220,217
+db 102,15,56,220,225
+db 102,15,56,221,208
+db 102,15,56,221,216
+db 102,15,56,221,224
+ ret
+align 16
+__aesni_decrypt3:
+ movups xmm0,[edx]
+ shl ecx,4
+ movups xmm1,[16+edx]
+ xorps xmm2,xmm0
+ pxor xmm3,xmm0
+ pxor xmm4,xmm0
+ movups xmm0,[32+edx]
+ lea edx,[32+ecx*1+edx]
+ neg ecx
+ add ecx,16
+L$006dec3_loop:
+db 102,15,56,222,209
+db 102,15,56,222,217
+db 102,15,56,222,225
+ movups xmm1,[ecx*1+edx]
+ add ecx,32
+db 102,15,56,222,208
+db 102,15,56,222,216
+db 102,15,56,222,224
+ movups xmm0,[ecx*1+edx-16]
+ jnz NEAR L$006dec3_loop
+db 102,15,56,222,209
+db 102,15,56,222,217
+db 102,15,56,222,225
+db 102,15,56,223,208
+db 102,15,56,223,216
+db 102,15,56,223,224
+ ret
+align 16
+__aesni_encrypt4:
+ movups xmm0,[edx]
+ movups xmm1,[16+edx]
+ shl ecx,4
+ xorps xmm2,xmm0
+ pxor xmm3,xmm0
+ pxor xmm4,xmm0
+ pxor xmm5,xmm0
+ movups xmm0,[32+edx]
+ lea edx,[32+ecx*1+edx]
+ neg ecx
+db 15,31,64,0
+ add ecx,16
+L$007enc4_loop:
+db 102,15,56,220,209
+db 102,15,56,220,217
+db 102,15,56,220,225
+db 102,15,56,220,233
+ movups xmm1,[ecx*1+edx]
+ add ecx,32
+db 102,15,56,220,208
+db 102,15,56,220,216
+db 102,15,56,220,224
+db 102,15,56,220,232
+ movups xmm0,[ecx*1+edx-16]
+ jnz NEAR L$007enc4_loop
+db 102,15,56,220,209
+db 102,15,56,220,217
+db 102,15,56,220,225
+db 102,15,56,220,233
+db 102,15,56,221,208
+db 102,15,56,221,216
+db 102,15,56,221,224
+db 102,15,56,221,232
+ ret
+align 16
+__aesni_decrypt4:
+ movups xmm0,[edx]
+ movups xmm1,[16+edx]
+ shl ecx,4
+ xorps xmm2,xmm0
+ pxor xmm3,xmm0
+ pxor xmm4,xmm0
+ pxor xmm5,xmm0
+ movups xmm0,[32+edx]
+ lea edx,[32+ecx*1+edx]
+ neg ecx
+db 15,31,64,0
+ add ecx,16
+L$008dec4_loop:
+db 102,15,56,222,209
+db 102,15,56,222,217
+db 102,15,56,222,225
+db 102,15,56,222,233
+ movups xmm1,[ecx*1+edx]
+ add ecx,32
+db 102,15,56,222,208
+db 102,15,56,222,216
+db 102,15,56,222,224
+db 102,15,56,222,232
+ movups xmm0,[ecx*1+edx-16]
+ jnz NEAR L$008dec4_loop
+db 102,15,56,222,209
+db 102,15,56,222,217
+db 102,15,56,222,225
+db 102,15,56,222,233
+db 102,15,56,223,208
+db 102,15,56,223,216
+db 102,15,56,223,224
+db 102,15,56,223,232
+ ret
+align 16
+__aesni_encrypt6:
+ movups xmm0,[edx]
+ shl ecx,4
+ movups xmm1,[16+edx]
+ xorps xmm2,xmm0
+ pxor xmm3,xmm0
+ pxor xmm4,xmm0
+db 102,15,56,220,209
+ pxor xmm5,xmm0
+ pxor xmm6,xmm0
+db 102,15,56,220,217
+ lea edx,[32+ecx*1+edx]
+ neg ecx
+db 102,15,56,220,225
+ pxor xmm7,xmm0
+ movups xmm0,[ecx*1+edx]
+ add ecx,16
+ jmp NEAR L$009_aesni_encrypt6_inner
+align 16
+L$010enc6_loop:
+db 102,15,56,220,209
+db 102,15,56,220,217
+db 102,15,56,220,225
+L$009_aesni_encrypt6_inner:
+db 102,15,56,220,233
+db 102,15,56,220,241
+db 102,15,56,220,249
+L$_aesni_encrypt6_enter:
+ movups xmm1,[ecx*1+edx]
+ add ecx,32
+db 102,15,56,220,208
+db 102,15,56,220,216
+db 102,15,56,220,224
+db 102,15,56,220,232
+db 102,15,56,220,240
+db 102,15,56,220,248
+ movups xmm0,[ecx*1+edx-16]
+ jnz NEAR L$010enc6_loop
+db 102,15,56,220,209
+db 102,15,56,220,217
+db 102,15,56,220,225
+db 102,15,56,220,233
+db 102,15,56,220,241
+db 102,15,56,220,249
+db 102,15,56,221,208
+db 102,15,56,221,216
+db 102,15,56,221,224
+db 102,15,56,221,232
+db 102,15,56,221,240
+db 102,15,56,221,248
+ ret
+align 16
+__aesni_decrypt6:
+ movups xmm0,[edx]
+ shl ecx,4
+ movups xmm1,[16+edx]
+ xorps xmm2,xmm0
+ pxor xmm3,xmm0
+ pxor xmm4,xmm0
+db 102,15,56,222,209
+ pxor xmm5,xmm0
+ pxor xmm6,xmm0
+db 102,15,56,222,217
+ lea edx,[32+ecx*1+edx]
+ neg ecx
+db 102,15,56,222,225
+ pxor xmm7,xmm0
+ movups xmm0,[ecx*1+edx]
+ add ecx,16
+ jmp NEAR L$011_aesni_decrypt6_inner
+align 16
+L$012dec6_loop:
+db 102,15,56,222,209
+db 102,15,56,222,217
+db 102,15,56,222,225
+L$011_aesni_decrypt6_inner:
+db 102,15,56,222,233
+db 102,15,56,222,241
+db 102,15,56,222,249
+L$_aesni_decrypt6_enter:
+ movups xmm1,[ecx*1+edx]
+ add ecx,32
+db 102,15,56,222,208
+db 102,15,56,222,216
+db 102,15,56,222,224
+db 102,15,56,222,232
+db 102,15,56,222,240
+db 102,15,56,222,248
+ movups xmm0,[ecx*1+edx-16]
+ jnz NEAR L$012dec6_loop
+db 102,15,56,222,209
+db 102,15,56,222,217
+db 102,15,56,222,225
+db 102,15,56,222,233
+db 102,15,56,222,241
+db 102,15,56,222,249
+db 102,15,56,223,208
+db 102,15,56,223,216
+db 102,15,56,223,224
+db 102,15,56,223,232
+db 102,15,56,223,240
+db 102,15,56,223,248
+ ret
+global _aes_hw_ecb_encrypt
+align 16
+_aes_hw_ecb_encrypt:
+L$_aes_hw_ecb_encrypt_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov esi,DWORD [20+esp]
+ mov edi,DWORD [24+esp]
+ mov eax,DWORD [28+esp]
+ mov edx,DWORD [32+esp]
+ mov ebx,DWORD [36+esp]
+ and eax,-16
+ jz NEAR L$013ecb_ret
+ mov ecx,DWORD [240+edx]
+ test ebx,ebx
+ jz NEAR L$014ecb_decrypt
+ mov ebp,edx
+ mov ebx,ecx
+ cmp eax,96
+ jb NEAR L$015ecb_enc_tail
+ movdqu xmm2,[esi]
+ movdqu xmm3,[16+esi]
+ movdqu xmm4,[32+esi]
+ movdqu xmm5,[48+esi]
+ movdqu xmm6,[64+esi]
+ movdqu xmm7,[80+esi]
+ lea esi,[96+esi]
+ sub eax,96
+ jmp NEAR L$016ecb_enc_loop6_enter
+align 16
+L$017ecb_enc_loop6:
+ movups [edi],xmm2
+ movdqu xmm2,[esi]
+ movups [16+edi],xmm3
+ movdqu xmm3,[16+esi]
+ movups [32+edi],xmm4
+ movdqu xmm4,[32+esi]
+ movups [48+edi],xmm5
+ movdqu xmm5,[48+esi]
+ movups [64+edi],xmm6
+ movdqu xmm6,[64+esi]
+ movups [80+edi],xmm7
+ lea edi,[96+edi]
+ movdqu xmm7,[80+esi]
+ lea esi,[96+esi]
+L$016ecb_enc_loop6_enter:
+ call __aesni_encrypt6
+ mov edx,ebp
+ mov ecx,ebx
+ sub eax,96
+ jnc NEAR L$017ecb_enc_loop6
+ movups [edi],xmm2
+ movups [16+edi],xmm3
+ movups [32+edi],xmm4
+ movups [48+edi],xmm5
+ movups [64+edi],xmm6
+ movups [80+edi],xmm7
+ lea edi,[96+edi]
+ add eax,96
+ jz NEAR L$013ecb_ret
+L$015ecb_enc_tail:
+ movups xmm2,[esi]
+ cmp eax,32
+ jb NEAR L$018ecb_enc_one
+ movups xmm3,[16+esi]
+ je NEAR L$019ecb_enc_two
+ movups xmm4,[32+esi]
+ cmp eax,64
+ jb NEAR L$020ecb_enc_three
+ movups xmm5,[48+esi]
+ je NEAR L$021ecb_enc_four
+ movups xmm6,[64+esi]
+ xorps xmm7,xmm7
+ call __aesni_encrypt6
+ movups [edi],xmm2
+ movups [16+edi],xmm3
+ movups [32+edi],xmm4
+ movups [48+edi],xmm5
+ movups [64+edi],xmm6
+ jmp NEAR L$013ecb_ret
+align 16
+L$018ecb_enc_one:
+ movups xmm0,[edx]
+ movups xmm1,[16+edx]
+ lea edx,[32+edx]
+ xorps xmm2,xmm0
+L$022enc1_loop_3:
+db 102,15,56,220,209
+ dec ecx
+ movups xmm1,[edx]
+ lea edx,[16+edx]
+ jnz NEAR L$022enc1_loop_3
+db 102,15,56,221,209
+ movups [edi],xmm2
+ jmp NEAR L$013ecb_ret
+align 16
+L$019ecb_enc_two:
+ call __aesni_encrypt2
+ movups [edi],xmm2
+ movups [16+edi],xmm3
+ jmp NEAR L$013ecb_ret
+align 16
+L$020ecb_enc_three:
+ call __aesni_encrypt3
+ movups [edi],xmm2
+ movups [16+edi],xmm3
+ movups [32+edi],xmm4
+ jmp NEAR L$013ecb_ret
+align 16
+L$021ecb_enc_four:
+ call __aesni_encrypt4
+ movups [edi],xmm2
+ movups [16+edi],xmm3
+ movups [32+edi],xmm4
+ movups [48+edi],xmm5
+ jmp NEAR L$013ecb_ret
+align 16
+L$014ecb_decrypt:
+ mov ebp,edx
+ mov ebx,ecx
+ cmp eax,96
+ jb NEAR L$023ecb_dec_tail
+ movdqu xmm2,[esi]
+ movdqu xmm3,[16+esi]
+ movdqu xmm4,[32+esi]
+ movdqu xmm5,[48+esi]
+ movdqu xmm6,[64+esi]
+ movdqu xmm7,[80+esi]
+ lea esi,[96+esi]
+ sub eax,96
+ jmp NEAR L$024ecb_dec_loop6_enter
+align 16
+L$025ecb_dec_loop6:
+ movups [edi],xmm2
+ movdqu xmm2,[esi]
+ movups [16+edi],xmm3
+ movdqu xmm3,[16+esi]
+ movups [32+edi],xmm4
+ movdqu xmm4,[32+esi]
+ movups [48+edi],xmm5
+ movdqu xmm5,[48+esi]
+ movups [64+edi],xmm6
+ movdqu xmm6,[64+esi]
+ movups [80+edi],xmm7
+ lea edi,[96+edi]
+ movdqu xmm7,[80+esi]
+ lea esi,[96+esi]
+L$024ecb_dec_loop6_enter:
+ call __aesni_decrypt6
+ mov edx,ebp
+ mov ecx,ebx
+ sub eax,96
+ jnc NEAR L$025ecb_dec_loop6
+ movups [edi],xmm2
+ movups [16+edi],xmm3
+ movups [32+edi],xmm4
+ movups [48+edi],xmm5
+ movups [64+edi],xmm6
+ movups [80+edi],xmm7
+ lea edi,[96+edi]
+ add eax,96
+ jz NEAR L$013ecb_ret
+L$023ecb_dec_tail:
+ movups xmm2,[esi]
+ cmp eax,32
+ jb NEAR L$026ecb_dec_one
+ movups xmm3,[16+esi]
+ je NEAR L$027ecb_dec_two
+ movups xmm4,[32+esi]
+ cmp eax,64
+ jb NEAR L$028ecb_dec_three
+ movups xmm5,[48+esi]
+ je NEAR L$029ecb_dec_four
+ movups xmm6,[64+esi]
+ xorps xmm7,xmm7
+ call __aesni_decrypt6
+ movups [edi],xmm2
+ movups [16+edi],xmm3
+ movups [32+edi],xmm4
+ movups [48+edi],xmm5
+ movups [64+edi],xmm6
+ jmp NEAR L$013ecb_ret
+align 16
+L$026ecb_dec_one:
+ movups xmm0,[edx]
+ movups xmm1,[16+edx]
+ lea edx,[32+edx]
+ xorps xmm2,xmm0
+L$030dec1_loop_4:
+db 102,15,56,222,209
+ dec ecx
+ movups xmm1,[edx]
+ lea edx,[16+edx]
+ jnz NEAR L$030dec1_loop_4
+db 102,15,56,223,209
+ movups [edi],xmm2
+ jmp NEAR L$013ecb_ret
+align 16
+L$027ecb_dec_two:
+ call __aesni_decrypt2
+ movups [edi],xmm2
+ movups [16+edi],xmm3
+ jmp NEAR L$013ecb_ret
+align 16
+L$028ecb_dec_three:
+ call __aesni_decrypt3
+ movups [edi],xmm2
+ movups [16+edi],xmm3
+ movups [32+edi],xmm4
+ jmp NEAR L$013ecb_ret
+align 16
+L$029ecb_dec_four:
+ call __aesni_decrypt4
+ movups [edi],xmm2
+ movups [16+edi],xmm3
+ movups [32+edi],xmm4
+ movups [48+edi],xmm5
+L$013ecb_ret:
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ pxor xmm6,xmm6
+ pxor xmm7,xmm7
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _aes_hw_ccm64_encrypt_blocks
+align 16
+_aes_hw_ccm64_encrypt_blocks:
+L$_aes_hw_ccm64_encrypt_blocks_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov esi,DWORD [20+esp]
+ mov edi,DWORD [24+esp]
+ mov eax,DWORD [28+esp]
+ mov edx,DWORD [32+esp]
+ mov ebx,DWORD [36+esp]
+ mov ecx,DWORD [40+esp]
+ mov ebp,esp
+ sub esp,60
+ and esp,-16
+ mov DWORD [48+esp],ebp
+ movdqu xmm7,[ebx]
+ movdqu xmm3,[ecx]
+ mov ecx,DWORD [240+edx]
+ mov DWORD [esp],202182159
+ mov DWORD [4+esp],134810123
+ mov DWORD [8+esp],67438087
+ mov DWORD [12+esp],66051
+ mov ebx,1
+ xor ebp,ebp
+ mov DWORD [16+esp],ebx
+ mov DWORD [20+esp],ebp
+ mov DWORD [24+esp],ebp
+ mov DWORD [28+esp],ebp
+ shl ecx,4
+ mov ebx,16
+ lea ebp,[edx]
+ movdqa xmm5,[esp]
+ movdqa xmm2,xmm7
+ lea edx,[32+ecx*1+edx]
+ sub ebx,ecx
+db 102,15,56,0,253
+L$031ccm64_enc_outer:
+ movups xmm0,[ebp]
+ mov ecx,ebx
+ movups xmm6,[esi]
+ xorps xmm2,xmm0
+ movups xmm1,[16+ebp]
+ xorps xmm0,xmm6
+ xorps xmm3,xmm0
+ movups xmm0,[32+ebp]
+L$032ccm64_enc2_loop:
+db 102,15,56,220,209
+db 102,15,56,220,217
+ movups xmm1,[ecx*1+edx]
+ add ecx,32
+db 102,15,56,220,208
+db 102,15,56,220,216
+ movups xmm0,[ecx*1+edx-16]
+ jnz NEAR L$032ccm64_enc2_loop
+db 102,15,56,220,209
+db 102,15,56,220,217
+ paddq xmm7,[16+esp]
+ dec eax
+db 102,15,56,221,208
+db 102,15,56,221,216
+ lea esi,[16+esi]
+ xorps xmm6,xmm2
+ movdqa xmm2,xmm7
+ movups [edi],xmm6
+db 102,15,56,0,213
+ lea edi,[16+edi]
+ jnz NEAR L$031ccm64_enc_outer
+ mov esp,DWORD [48+esp]
+ mov edi,DWORD [40+esp]
+ movups [edi],xmm3
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ pxor xmm6,xmm6
+ pxor xmm7,xmm7
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _aes_hw_ccm64_decrypt_blocks
+align 16
+_aes_hw_ccm64_decrypt_blocks:
+L$_aes_hw_ccm64_decrypt_blocks_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov esi,DWORD [20+esp]
+ mov edi,DWORD [24+esp]
+ mov eax,DWORD [28+esp]
+ mov edx,DWORD [32+esp]
+ mov ebx,DWORD [36+esp]
+ mov ecx,DWORD [40+esp]
+ mov ebp,esp
+ sub esp,60
+ and esp,-16
+ mov DWORD [48+esp],ebp
+ movdqu xmm7,[ebx]
+ movdqu xmm3,[ecx]
+ mov ecx,DWORD [240+edx]
+ mov DWORD [esp],202182159
+ mov DWORD [4+esp],134810123
+ mov DWORD [8+esp],67438087
+ mov DWORD [12+esp],66051
+ mov ebx,1
+ xor ebp,ebp
+ mov DWORD [16+esp],ebx
+ mov DWORD [20+esp],ebp
+ mov DWORD [24+esp],ebp
+ mov DWORD [28+esp],ebp
+ movdqa xmm5,[esp]
+ movdqa xmm2,xmm7
+ mov ebp,edx
+ mov ebx,ecx
+db 102,15,56,0,253
+ movups xmm0,[edx]
+ movups xmm1,[16+edx]
+ lea edx,[32+edx]
+ xorps xmm2,xmm0
+L$033enc1_loop_5:
+db 102,15,56,220,209
+ dec ecx
+ movups xmm1,[edx]
+ lea edx,[16+edx]
+ jnz NEAR L$033enc1_loop_5
+db 102,15,56,221,209
+ shl ebx,4
+ mov ecx,16
+ movups xmm6,[esi]
+ paddq xmm7,[16+esp]
+ lea esi,[16+esi]
+ sub ecx,ebx
+ lea edx,[32+ebx*1+ebp]
+ mov ebx,ecx
+ jmp NEAR L$034ccm64_dec_outer
+align 16
+L$034ccm64_dec_outer:
+ xorps xmm6,xmm2
+ movdqa xmm2,xmm7
+ movups [edi],xmm6
+ lea edi,[16+edi]
+db 102,15,56,0,213
+ sub eax,1
+ jz NEAR L$035ccm64_dec_break
+ movups xmm0,[ebp]
+ mov ecx,ebx
+ movups xmm1,[16+ebp]
+ xorps xmm6,xmm0
+ xorps xmm2,xmm0
+ xorps xmm3,xmm6
+ movups xmm0,[32+ebp]
+L$036ccm64_dec2_loop:
+db 102,15,56,220,209
+db 102,15,56,220,217
+ movups xmm1,[ecx*1+edx]
+ add ecx,32
+db 102,15,56,220,208
+db 102,15,56,220,216
+ movups xmm0,[ecx*1+edx-16]
+ jnz NEAR L$036ccm64_dec2_loop
+ movups xmm6,[esi]
+ paddq xmm7,[16+esp]
+db 102,15,56,220,209
+db 102,15,56,220,217
+db 102,15,56,221,208
+db 102,15,56,221,216
+ lea esi,[16+esi]
+ jmp NEAR L$034ccm64_dec_outer
+align 16
+L$035ccm64_dec_break:
+ mov ecx,DWORD [240+ebp]
+ mov edx,ebp
+ movups xmm0,[edx]
+ movups xmm1,[16+edx]
+ xorps xmm6,xmm0
+ lea edx,[32+edx]
+ xorps xmm3,xmm6
+L$037enc1_loop_6:
+db 102,15,56,220,217
+ dec ecx
+ movups xmm1,[edx]
+ lea edx,[16+edx]
+ jnz NEAR L$037enc1_loop_6
+db 102,15,56,221,217
+ mov esp,DWORD [48+esp]
+ mov edi,DWORD [40+esp]
+ movups [edi],xmm3
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ pxor xmm6,xmm6
+ pxor xmm7,xmm7
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _aes_hw_ctr32_encrypt_blocks
+align 16
+_aes_hw_ctr32_encrypt_blocks:
+L$_aes_hw_ctr32_encrypt_blocks_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+%ifdef BORINGSSL_DISPATCH_TEST
+ push ebx
+ push edx
+ call L$038pic
+L$038pic:
+ pop ebx
+ lea ebx,[(_BORINGSSL_function_hit+0-L$038pic)+ebx]
+ mov edx,1
+ mov BYTE [ebx],dl
+ pop edx
+ pop ebx
+%endif
+ mov esi,DWORD [20+esp]
+ mov edi,DWORD [24+esp]
+ mov eax,DWORD [28+esp]
+ mov edx,DWORD [32+esp]
+ mov ebx,DWORD [36+esp]
+ mov ebp,esp
+ sub esp,88
+ and esp,-16
+ mov DWORD [80+esp],ebp
+ cmp eax,1
+ je NEAR L$039ctr32_one_shortcut
+ movdqu xmm7,[ebx]
+ mov DWORD [esp],202182159
+ mov DWORD [4+esp],134810123
+ mov DWORD [8+esp],67438087
+ mov DWORD [12+esp],66051
+ mov ecx,6
+ xor ebp,ebp
+ mov DWORD [16+esp],ecx
+ mov DWORD [20+esp],ecx
+ mov DWORD [24+esp],ecx
+ mov DWORD [28+esp],ebp
+db 102,15,58,22,251,3
+db 102,15,58,34,253,3
+ mov ecx,DWORD [240+edx]
+ bswap ebx
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ movdqa xmm2,[esp]
+db 102,15,58,34,195,0
+ lea ebp,[3+ebx]
+db 102,15,58,34,205,0
+ inc ebx
+db 102,15,58,34,195,1
+ inc ebp
+db 102,15,58,34,205,1
+ inc ebx
+db 102,15,58,34,195,2
+ inc ebp
+db 102,15,58,34,205,2
+ movdqa [48+esp],xmm0
+db 102,15,56,0,194
+ movdqu xmm6,[edx]
+ movdqa [64+esp],xmm1
+db 102,15,56,0,202
+ pshufd xmm2,xmm0,192
+ pshufd xmm3,xmm0,128
+ cmp eax,6
+ jb NEAR L$040ctr32_tail
+ pxor xmm7,xmm6
+ shl ecx,4
+ mov ebx,16
+ movdqa [32+esp],xmm7
+ mov ebp,edx
+ sub ebx,ecx
+ lea edx,[32+ecx*1+edx]
+ sub eax,6
+ jmp NEAR L$041ctr32_loop6
+align 16
+L$041ctr32_loop6:
+ pshufd xmm4,xmm0,64
+ movdqa xmm0,[32+esp]
+ pshufd xmm5,xmm1,192
+ pxor xmm2,xmm0
+ pshufd xmm6,xmm1,128
+ pxor xmm3,xmm0
+ pshufd xmm7,xmm1,64
+ movups xmm1,[16+ebp]
+ pxor xmm4,xmm0
+ pxor xmm5,xmm0
+db 102,15,56,220,209
+ pxor xmm6,xmm0
+ pxor xmm7,xmm0
+db 102,15,56,220,217
+ movups xmm0,[32+ebp]
+ mov ecx,ebx
+db 102,15,56,220,225
+db 102,15,56,220,233
+db 102,15,56,220,241
+db 102,15,56,220,249
+ call L$_aesni_encrypt6_enter
+ movups xmm1,[esi]
+ movups xmm0,[16+esi]
+ xorps xmm2,xmm1
+ movups xmm1,[32+esi]
+ xorps xmm3,xmm0
+ movups [edi],xmm2
+ movdqa xmm0,[16+esp]
+ xorps xmm4,xmm1
+ movdqa xmm1,[64+esp]
+ movups [16+edi],xmm3
+ movups [32+edi],xmm4
+ paddd xmm1,xmm0
+ paddd xmm0,[48+esp]
+ movdqa xmm2,[esp]
+ movups xmm3,[48+esi]
+ movups xmm4,[64+esi]
+ xorps xmm5,xmm3
+ movups xmm3,[80+esi]
+ lea esi,[96+esi]
+ movdqa [48+esp],xmm0
+db 102,15,56,0,194
+ xorps xmm6,xmm4
+ movups [48+edi],xmm5
+ xorps xmm7,xmm3
+ movdqa [64+esp],xmm1
+db 102,15,56,0,202
+ movups [64+edi],xmm6
+ pshufd xmm2,xmm0,192
+ movups [80+edi],xmm7
+ lea edi,[96+edi]
+ pshufd xmm3,xmm0,128
+ sub eax,6
+ jnc NEAR L$041ctr32_loop6
+ add eax,6
+ jz NEAR L$042ctr32_ret
+ movdqu xmm7,[ebp]
+ mov edx,ebp
+ pxor xmm7,[32+esp]
+ mov ecx,DWORD [240+ebp]
+L$040ctr32_tail:
+ por xmm2,xmm7
+ cmp eax,2
+ jb NEAR L$043ctr32_one
+ pshufd xmm4,xmm0,64
+ por xmm3,xmm7
+ je NEAR L$044ctr32_two
+ pshufd xmm5,xmm1,192
+ por xmm4,xmm7
+ cmp eax,4
+ jb NEAR L$045ctr32_three
+ pshufd xmm6,xmm1,128
+ por xmm5,xmm7
+ je NEAR L$046ctr32_four
+ por xmm6,xmm7
+ call __aesni_encrypt6
+ movups xmm1,[esi]
+ movups xmm0,[16+esi]
+ xorps xmm2,xmm1
+ movups xmm1,[32+esi]
+ xorps xmm3,xmm0
+ movups xmm0,[48+esi]
+ xorps xmm4,xmm1
+ movups xmm1,[64+esi]
+ xorps xmm5,xmm0
+ movups [edi],xmm2
+ xorps xmm6,xmm1
+ movups [16+edi],xmm3
+ movups [32+edi],xmm4
+ movups [48+edi],xmm5
+ movups [64+edi],xmm6
+ jmp NEAR L$042ctr32_ret
+align 16
+L$039ctr32_one_shortcut:
+ movups xmm2,[ebx]
+ mov ecx,DWORD [240+edx]
+L$043ctr32_one:
+ movups xmm0,[edx]
+ movups xmm1,[16+edx]
+ lea edx,[32+edx]
+ xorps xmm2,xmm0
+L$047enc1_loop_7:
+db 102,15,56,220,209
+ dec ecx
+ movups xmm1,[edx]
+ lea edx,[16+edx]
+ jnz NEAR L$047enc1_loop_7
+db 102,15,56,221,209
+ movups xmm6,[esi]
+ xorps xmm6,xmm2
+ movups [edi],xmm6
+ jmp NEAR L$042ctr32_ret
+align 16
+L$044ctr32_two:
+ call __aesni_encrypt2
+ movups xmm5,[esi]
+ movups xmm6,[16+esi]
+ xorps xmm2,xmm5
+ xorps xmm3,xmm6
+ movups [edi],xmm2
+ movups [16+edi],xmm3
+ jmp NEAR L$042ctr32_ret
+align 16
+L$045ctr32_three:
+ call __aesni_encrypt3
+ movups xmm5,[esi]
+ movups xmm6,[16+esi]
+ xorps xmm2,xmm5
+ movups xmm7,[32+esi]
+ xorps xmm3,xmm6
+ movups [edi],xmm2
+ xorps xmm4,xmm7
+ movups [16+edi],xmm3
+ movups [32+edi],xmm4
+ jmp NEAR L$042ctr32_ret
+align 16
+L$046ctr32_four:
+ call __aesni_encrypt4
+ movups xmm6,[esi]
+ movups xmm7,[16+esi]
+ movups xmm1,[32+esi]
+ xorps xmm2,xmm6
+ movups xmm0,[48+esi]
+ xorps xmm3,xmm7
+ movups [edi],xmm2
+ xorps xmm4,xmm1
+ movups [16+edi],xmm3
+ xorps xmm5,xmm0
+ movups [32+edi],xmm4
+ movups [48+edi],xmm5
+L$042ctr32_ret:
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ movdqa [32+esp],xmm0
+ pxor xmm5,xmm5
+ movdqa [48+esp],xmm0
+ pxor xmm6,xmm6
+ movdqa [64+esp],xmm0
+ pxor xmm7,xmm7
+ mov esp,DWORD [80+esp]
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _aes_hw_xts_encrypt
+align 16
+_aes_hw_xts_encrypt:
+L$_aes_hw_xts_encrypt_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov edx,DWORD [36+esp]
+ mov esi,DWORD [40+esp]
+ mov ecx,DWORD [240+edx]
+ movups xmm2,[esi]
+ movups xmm0,[edx]
+ movups xmm1,[16+edx]
+ lea edx,[32+edx]
+ xorps xmm2,xmm0
+L$048enc1_loop_8:
+db 102,15,56,220,209
+ dec ecx
+ movups xmm1,[edx]
+ lea edx,[16+edx]
+ jnz NEAR L$048enc1_loop_8
+db 102,15,56,221,209
+ mov esi,DWORD [20+esp]
+ mov edi,DWORD [24+esp]
+ mov eax,DWORD [28+esp]
+ mov edx,DWORD [32+esp]
+ mov ebp,esp
+ sub esp,120
+ mov ecx,DWORD [240+edx]
+ and esp,-16
+ mov DWORD [96+esp],135
+ mov DWORD [100+esp],0
+ mov DWORD [104+esp],1
+ mov DWORD [108+esp],0
+ mov DWORD [112+esp],eax
+ mov DWORD [116+esp],ebp
+ movdqa xmm1,xmm2
+ pxor xmm0,xmm0
+ movdqa xmm3,[96+esp]
+ pcmpgtd xmm0,xmm1
+ and eax,-16
+ mov ebp,edx
+ mov ebx,ecx
+ sub eax,96
+ jc NEAR L$049xts_enc_short
+ shl ecx,4
+ mov ebx,16
+ sub ebx,ecx
+ lea edx,[32+ecx*1+edx]
+ jmp NEAR L$050xts_enc_loop6
+align 16
+L$050xts_enc_loop6:
+ pshufd xmm2,xmm0,19
+ pxor xmm0,xmm0
+ movdqa [esp],xmm1
+ paddq xmm1,xmm1
+ pand xmm2,xmm3
+ pcmpgtd xmm0,xmm1
+ pxor xmm1,xmm2
+ pshufd xmm2,xmm0,19
+ pxor xmm0,xmm0
+ movdqa [16+esp],xmm1
+ paddq xmm1,xmm1
+ pand xmm2,xmm3
+ pcmpgtd xmm0,xmm1
+ pxor xmm1,xmm2
+ pshufd xmm2,xmm0,19
+ pxor xmm0,xmm0
+ movdqa [32+esp],xmm1
+ paddq xmm1,xmm1
+ pand xmm2,xmm3
+ pcmpgtd xmm0,xmm1
+ pxor xmm1,xmm2
+ pshufd xmm2,xmm0,19
+ pxor xmm0,xmm0
+ movdqa [48+esp],xmm1
+ paddq xmm1,xmm1
+ pand xmm2,xmm3
+ pcmpgtd xmm0,xmm1
+ pxor xmm1,xmm2
+ pshufd xmm7,xmm0,19
+ movdqa [64+esp],xmm1
+ paddq xmm1,xmm1
+ movups xmm0,[ebp]
+ pand xmm7,xmm3
+ movups xmm2,[esi]
+ pxor xmm7,xmm1
+ mov ecx,ebx
+ movdqu xmm3,[16+esi]
+ xorps xmm2,xmm0
+ movdqu xmm4,[32+esi]
+ pxor xmm3,xmm0
+ movdqu xmm5,[48+esi]
+ pxor xmm4,xmm0
+ movdqu xmm6,[64+esi]
+ pxor xmm5,xmm0
+ movdqu xmm1,[80+esi]
+ pxor xmm6,xmm0
+ lea esi,[96+esi]
+ pxor xmm2,[esp]
+ movdqa [80+esp],xmm7
+ pxor xmm7,xmm1
+ movups xmm1,[16+ebp]
+ pxor xmm3,[16+esp]
+ pxor xmm4,[32+esp]
+db 102,15,56,220,209
+ pxor xmm5,[48+esp]
+ pxor xmm6,[64+esp]
+db 102,15,56,220,217
+ pxor xmm7,xmm0
+ movups xmm0,[32+ebp]
+db 102,15,56,220,225
+db 102,15,56,220,233
+db 102,15,56,220,241
+db 102,15,56,220,249
+ call L$_aesni_encrypt6_enter
+ movdqa xmm1,[80+esp]
+ pxor xmm0,xmm0
+ xorps xmm2,[esp]
+ pcmpgtd xmm0,xmm1
+ xorps xmm3,[16+esp]
+ movups [edi],xmm2
+ xorps xmm4,[32+esp]
+ movups [16+edi],xmm3
+ xorps xmm5,[48+esp]
+ movups [32+edi],xmm4
+ xorps xmm6,[64+esp]
+ movups [48+edi],xmm5
+ xorps xmm7,xmm1
+ movups [64+edi],xmm6
+ pshufd xmm2,xmm0,19
+ movups [80+edi],xmm7
+ lea edi,[96+edi]
+ movdqa xmm3,[96+esp]
+ pxor xmm0,xmm0
+ paddq xmm1,xmm1
+ pand xmm2,xmm3
+ pcmpgtd xmm0,xmm1
+ pxor xmm1,xmm2
+ sub eax,96
+ jnc NEAR L$050xts_enc_loop6
+ mov ecx,DWORD [240+ebp]
+ mov edx,ebp
+ mov ebx,ecx
+L$049xts_enc_short:
+ add eax,96
+ jz NEAR L$051xts_enc_done6x
+ movdqa xmm5,xmm1
+ cmp eax,32
+ jb NEAR L$052xts_enc_one
+ pshufd xmm2,xmm0,19
+ pxor xmm0,xmm0
+ paddq xmm1,xmm1
+ pand xmm2,xmm3
+ pcmpgtd xmm0,xmm1
+ pxor xmm1,xmm2
+ je NEAR L$053xts_enc_two
+ pshufd xmm2,xmm0,19
+ pxor xmm0,xmm0
+ movdqa xmm6,xmm1
+ paddq xmm1,xmm1
+ pand xmm2,xmm3
+ pcmpgtd xmm0,xmm1
+ pxor xmm1,xmm2
+ cmp eax,64
+ jb NEAR L$054xts_enc_three
+ pshufd xmm2,xmm0,19
+ pxor xmm0,xmm0
+ movdqa xmm7,xmm1
+ paddq xmm1,xmm1
+ pand xmm2,xmm3
+ pcmpgtd xmm0,xmm1
+ pxor xmm1,xmm2
+ movdqa [esp],xmm5
+ movdqa [16+esp],xmm6
+ je NEAR L$055xts_enc_four
+ movdqa [32+esp],xmm7
+ pshufd xmm7,xmm0,19
+ movdqa [48+esp],xmm1
+ paddq xmm1,xmm1
+ pand xmm7,xmm3
+ pxor xmm7,xmm1
+ movdqu xmm2,[esi]
+ movdqu xmm3,[16+esi]
+ movdqu xmm4,[32+esi]
+ pxor xmm2,[esp]
+ movdqu xmm5,[48+esi]
+ pxor xmm3,[16+esp]
+ movdqu xmm6,[64+esi]
+ pxor xmm4,[32+esp]
+ lea esi,[80+esi]
+ pxor xmm5,[48+esp]
+ movdqa [64+esp],xmm7
+ pxor xmm6,xmm7
+ call __aesni_encrypt6
+ movaps xmm1,[64+esp]
+ xorps xmm2,[esp]
+ xorps xmm3,[16+esp]
+ xorps xmm4,[32+esp]
+ movups [edi],xmm2
+ xorps xmm5,[48+esp]
+ movups [16+edi],xmm3
+ xorps xmm6,xmm1
+ movups [32+edi],xmm4
+ movups [48+edi],xmm5
+ movups [64+edi],xmm6
+ lea edi,[80+edi]
+ jmp NEAR L$056xts_enc_done
+align 16
+L$052xts_enc_one:
+ movups xmm2,[esi]
+ lea esi,[16+esi]
+ xorps xmm2,xmm5
+ movups xmm0,[edx]
+ movups xmm1,[16+edx]
+ lea edx,[32+edx]
+ xorps xmm2,xmm0
+L$057enc1_loop_9:
+db 102,15,56,220,209
+ dec ecx
+ movups xmm1,[edx]
+ lea edx,[16+edx]
+ jnz NEAR L$057enc1_loop_9
+db 102,15,56,221,209
+ xorps xmm2,xmm5
+ movups [edi],xmm2
+ lea edi,[16+edi]
+ movdqa xmm1,xmm5
+ jmp NEAR L$056xts_enc_done
+align 16
+L$053xts_enc_two:
+ movaps xmm6,xmm1
+ movups xmm2,[esi]
+ movups xmm3,[16+esi]
+ lea esi,[32+esi]
+ xorps xmm2,xmm5
+ xorps xmm3,xmm6
+ call __aesni_encrypt2
+ xorps xmm2,xmm5
+ xorps xmm3,xmm6
+ movups [edi],xmm2
+ movups [16+edi],xmm3
+ lea edi,[32+edi]
+ movdqa xmm1,xmm6
+ jmp NEAR L$056xts_enc_done
+align 16
+L$054xts_enc_three:
+ movaps xmm7,xmm1
+ movups xmm2,[esi]
+ movups xmm3,[16+esi]
+ movups xmm4,[32+esi]
+ lea esi,[48+esi]
+ xorps xmm2,xmm5
+ xorps xmm3,xmm6
+ xorps xmm4,xmm7
+ call __aesni_encrypt3
+ xorps xmm2,xmm5
+ xorps xmm3,xmm6
+ xorps xmm4,xmm7
+ movups [edi],xmm2
+ movups [16+edi],xmm3
+ movups [32+edi],xmm4
+ lea edi,[48+edi]
+ movdqa xmm1,xmm7
+ jmp NEAR L$056xts_enc_done
+align 16
+L$055xts_enc_four:
+ movaps xmm6,xmm1
+ movups xmm2,[esi]
+ movups xmm3,[16+esi]
+ movups xmm4,[32+esi]
+ xorps xmm2,[esp]
+ movups xmm5,[48+esi]
+ lea esi,[64+esi]
+ xorps xmm3,[16+esp]
+ xorps xmm4,xmm7
+ xorps xmm5,xmm6
+ call __aesni_encrypt4
+ xorps xmm2,[esp]
+ xorps xmm3,[16+esp]
+ xorps xmm4,xmm7
+ movups [edi],xmm2
+ xorps xmm5,xmm6
+ movups [16+edi],xmm3
+ movups [32+edi],xmm4
+ movups [48+edi],xmm5
+ lea edi,[64+edi]
+ movdqa xmm1,xmm6
+ jmp NEAR L$056xts_enc_done
+align 16
+L$051xts_enc_done6x:
+ mov eax,DWORD [112+esp]
+ and eax,15
+ jz NEAR L$058xts_enc_ret
+ movdqa xmm5,xmm1
+ mov DWORD [112+esp],eax
+ jmp NEAR L$059xts_enc_steal
+align 16
+L$056xts_enc_done:
+ mov eax,DWORD [112+esp]
+ pxor xmm0,xmm0
+ and eax,15
+ jz NEAR L$058xts_enc_ret
+ pcmpgtd xmm0,xmm1
+ mov DWORD [112+esp],eax
+ pshufd xmm5,xmm0,19
+ paddq xmm1,xmm1
+ pand xmm5,[96+esp]
+ pxor xmm5,xmm1
+L$059xts_enc_steal:
+ movzx ecx,BYTE [esi]
+ movzx edx,BYTE [edi-16]
+ lea esi,[1+esi]
+ mov BYTE [edi-16],cl
+ mov BYTE [edi],dl
+ lea edi,[1+edi]
+ sub eax,1
+ jnz NEAR L$059xts_enc_steal
+ sub edi,DWORD [112+esp]
+ mov edx,ebp
+ mov ecx,ebx
+ movups xmm2,[edi-16]
+ xorps xmm2,xmm5
+ movups xmm0,[edx]
+ movups xmm1,[16+edx]
+ lea edx,[32+edx]
+ xorps xmm2,xmm0
+L$060enc1_loop_10:
+db 102,15,56,220,209
+ dec ecx
+ movups xmm1,[edx]
+ lea edx,[16+edx]
+ jnz NEAR L$060enc1_loop_10
+db 102,15,56,221,209
+ xorps xmm2,xmm5
+ movups [edi-16],xmm2
+L$058xts_enc_ret:
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ movdqa [esp],xmm0
+ pxor xmm3,xmm3
+ movdqa [16+esp],xmm0
+ pxor xmm4,xmm4
+ movdqa [32+esp],xmm0
+ pxor xmm5,xmm5
+ movdqa [48+esp],xmm0
+ pxor xmm6,xmm6
+ movdqa [64+esp],xmm0
+ pxor xmm7,xmm7
+ movdqa [80+esp],xmm0
+ mov esp,DWORD [116+esp]
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _aes_hw_xts_decrypt
+align 16
+_aes_hw_xts_decrypt:
+L$_aes_hw_xts_decrypt_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov edx,DWORD [36+esp]
+ mov esi,DWORD [40+esp]
+ mov ecx,DWORD [240+edx]
+ movups xmm2,[esi]
+ movups xmm0,[edx]
+ movups xmm1,[16+edx]
+ lea edx,[32+edx]
+ xorps xmm2,xmm0
+L$061enc1_loop_11:
+db 102,15,56,220,209
+ dec ecx
+ movups xmm1,[edx]
+ lea edx,[16+edx]
+ jnz NEAR L$061enc1_loop_11
+db 102,15,56,221,209
+ mov esi,DWORD [20+esp]
+ mov edi,DWORD [24+esp]
+ mov eax,DWORD [28+esp]
+ mov edx,DWORD [32+esp]
+ mov ebp,esp
+ sub esp,120
+ and esp,-16
+ xor ebx,ebx
+ test eax,15
+ setnz bl
+ shl ebx,4
+ sub eax,ebx
+ mov DWORD [96+esp],135
+ mov DWORD [100+esp],0
+ mov DWORD [104+esp],1
+ mov DWORD [108+esp],0
+ mov DWORD [112+esp],eax
+ mov DWORD [116+esp],ebp
+ mov ecx,DWORD [240+edx]
+ mov ebp,edx
+ mov ebx,ecx
+ movdqa xmm1,xmm2
+ pxor xmm0,xmm0
+ movdqa xmm3,[96+esp]
+ pcmpgtd xmm0,xmm1
+ and eax,-16
+ sub eax,96
+ jc NEAR L$062xts_dec_short
+ shl ecx,4
+ mov ebx,16
+ sub ebx,ecx
+ lea edx,[32+ecx*1+edx]
+ jmp NEAR L$063xts_dec_loop6
+align 16
+L$063xts_dec_loop6:
+ pshufd xmm2,xmm0,19
+ pxor xmm0,xmm0
+ movdqa [esp],xmm1
+ paddq xmm1,xmm1
+ pand xmm2,xmm3
+ pcmpgtd xmm0,xmm1
+ pxor xmm1,xmm2
+ pshufd xmm2,xmm0,19
+ pxor xmm0,xmm0
+ movdqa [16+esp],xmm1
+ paddq xmm1,xmm1
+ pand xmm2,xmm3
+ pcmpgtd xmm0,xmm1
+ pxor xmm1,xmm2
+ pshufd xmm2,xmm0,19
+ pxor xmm0,xmm0
+ movdqa [32+esp],xmm1
+ paddq xmm1,xmm1
+ pand xmm2,xmm3
+ pcmpgtd xmm0,xmm1
+ pxor xmm1,xmm2
+ pshufd xmm2,xmm0,19
+ pxor xmm0,xmm0
+ movdqa [48+esp],xmm1
+ paddq xmm1,xmm1
+ pand xmm2,xmm3
+ pcmpgtd xmm0,xmm1
+ pxor xmm1,xmm2
+ pshufd xmm7,xmm0,19
+ movdqa [64+esp],xmm1
+ paddq xmm1,xmm1
+ movups xmm0,[ebp]
+ pand xmm7,xmm3
+ movups xmm2,[esi]
+ pxor xmm7,xmm1
+ mov ecx,ebx
+ movdqu xmm3,[16+esi]
+ xorps xmm2,xmm0
+ movdqu xmm4,[32+esi]
+ pxor xmm3,xmm0
+ movdqu xmm5,[48+esi]
+ pxor xmm4,xmm0
+ movdqu xmm6,[64+esi]
+ pxor xmm5,xmm0
+ movdqu xmm1,[80+esi]
+ pxor xmm6,xmm0
+ lea esi,[96+esi]
+ pxor xmm2,[esp]
+ movdqa [80+esp],xmm7
+ pxor xmm7,xmm1
+ movups xmm1,[16+ebp]
+ pxor xmm3,[16+esp]
+ pxor xmm4,[32+esp]
+db 102,15,56,222,209
+ pxor xmm5,[48+esp]
+ pxor xmm6,[64+esp]
+db 102,15,56,222,217
+ pxor xmm7,xmm0
+ movups xmm0,[32+ebp]
+db 102,15,56,222,225
+db 102,15,56,222,233
+db 102,15,56,222,241
+db 102,15,56,222,249
+ call L$_aesni_decrypt6_enter
+ movdqa xmm1,[80+esp]
+ pxor xmm0,xmm0
+ xorps xmm2,[esp]
+ pcmpgtd xmm0,xmm1
+ xorps xmm3,[16+esp]
+ movups [edi],xmm2
+ xorps xmm4,[32+esp]
+ movups [16+edi],xmm3
+ xorps xmm5,[48+esp]
+ movups [32+edi],xmm4
+ xorps xmm6,[64+esp]
+ movups [48+edi],xmm5
+ xorps xmm7,xmm1
+ movups [64+edi],xmm6
+ pshufd xmm2,xmm0,19
+ movups [80+edi],xmm7
+ lea edi,[96+edi]
+ movdqa xmm3,[96+esp]
+ pxor xmm0,xmm0
+ paddq xmm1,xmm1
+ pand xmm2,xmm3
+ pcmpgtd xmm0,xmm1
+ pxor xmm1,xmm2
+ sub eax,96
+ jnc NEAR L$063xts_dec_loop6
+ mov ecx,DWORD [240+ebp]
+ mov edx,ebp
+ mov ebx,ecx
+L$062xts_dec_short:
+ add eax,96
+ jz NEAR L$064xts_dec_done6x
+ movdqa xmm5,xmm1
+ cmp eax,32
+ jb NEAR L$065xts_dec_one
+ pshufd xmm2,xmm0,19
+ pxor xmm0,xmm0
+ paddq xmm1,xmm1
+ pand xmm2,xmm3
+ pcmpgtd xmm0,xmm1
+ pxor xmm1,xmm2
+ je NEAR L$066xts_dec_two
+ pshufd xmm2,xmm0,19
+ pxor xmm0,xmm0
+ movdqa xmm6,xmm1
+ paddq xmm1,xmm1
+ pand xmm2,xmm3
+ pcmpgtd xmm0,xmm1
+ pxor xmm1,xmm2
+ cmp eax,64
+ jb NEAR L$067xts_dec_three
+ pshufd xmm2,xmm0,19
+ pxor xmm0,xmm0
+ movdqa xmm7,xmm1
+ paddq xmm1,xmm1
+ pand xmm2,xmm3
+ pcmpgtd xmm0,xmm1
+ pxor xmm1,xmm2
+ movdqa [esp],xmm5
+ movdqa [16+esp],xmm6
+ je NEAR L$068xts_dec_four
+ movdqa [32+esp],xmm7
+ pshufd xmm7,xmm0,19
+ movdqa [48+esp],xmm1
+ paddq xmm1,xmm1
+ pand xmm7,xmm3
+ pxor xmm7,xmm1
+ movdqu xmm2,[esi]
+ movdqu xmm3,[16+esi]
+ movdqu xmm4,[32+esi]
+ pxor xmm2,[esp]
+ movdqu xmm5,[48+esi]
+ pxor xmm3,[16+esp]
+ movdqu xmm6,[64+esi]
+ pxor xmm4,[32+esp]
+ lea esi,[80+esi]
+ pxor xmm5,[48+esp]
+ movdqa [64+esp],xmm7
+ pxor xmm6,xmm7
+ call __aesni_decrypt6
+ movaps xmm1,[64+esp]
+ xorps xmm2,[esp]
+ xorps xmm3,[16+esp]
+ xorps xmm4,[32+esp]
+ movups [edi],xmm2
+ xorps xmm5,[48+esp]
+ movups [16+edi],xmm3
+ xorps xmm6,xmm1
+ movups [32+edi],xmm4
+ movups [48+edi],xmm5
+ movups [64+edi],xmm6
+ lea edi,[80+edi]
+ jmp NEAR L$069xts_dec_done
+align 16
+L$065xts_dec_one:
+ movups xmm2,[esi]
+ lea esi,[16+esi]
+ xorps xmm2,xmm5
+ movups xmm0,[edx]
+ movups xmm1,[16+edx]
+ lea edx,[32+edx]
+ xorps xmm2,xmm0
+L$070dec1_loop_12:
+db 102,15,56,222,209
+ dec ecx
+ movups xmm1,[edx]
+ lea edx,[16+edx]
+ jnz NEAR L$070dec1_loop_12
+db 102,15,56,223,209
+ xorps xmm2,xmm5
+ movups [edi],xmm2
+ lea edi,[16+edi]
+ movdqa xmm1,xmm5
+ jmp NEAR L$069xts_dec_done
+align 16
+L$066xts_dec_two:
+ movaps xmm6,xmm1
+ movups xmm2,[esi]
+ movups xmm3,[16+esi]
+ lea esi,[32+esi]
+ xorps xmm2,xmm5
+ xorps xmm3,xmm6
+ call __aesni_decrypt2
+ xorps xmm2,xmm5
+ xorps xmm3,xmm6
+ movups [edi],xmm2
+ movups [16+edi],xmm3
+ lea edi,[32+edi]
+ movdqa xmm1,xmm6
+ jmp NEAR L$069xts_dec_done
+align 16
+L$067xts_dec_three:
+ movaps xmm7,xmm1
+ movups xmm2,[esi]
+ movups xmm3,[16+esi]
+ movups xmm4,[32+esi]
+ lea esi,[48+esi]
+ xorps xmm2,xmm5
+ xorps xmm3,xmm6
+ xorps xmm4,xmm7
+ call __aesni_decrypt3
+ xorps xmm2,xmm5
+ xorps xmm3,xmm6
+ xorps xmm4,xmm7
+ movups [edi],xmm2
+ movups [16+edi],xmm3
+ movups [32+edi],xmm4
+ lea edi,[48+edi]
+ movdqa xmm1,xmm7
+ jmp NEAR L$069xts_dec_done
+align 16
+L$068xts_dec_four:
+ movaps xmm6,xmm1
+ movups xmm2,[esi]
+ movups xmm3,[16+esi]
+ movups xmm4,[32+esi]
+ xorps xmm2,[esp]
+ movups xmm5,[48+esi]
+ lea esi,[64+esi]
+ xorps xmm3,[16+esp]
+ xorps xmm4,xmm7
+ xorps xmm5,xmm6
+ call __aesni_decrypt4
+ xorps xmm2,[esp]
+ xorps xmm3,[16+esp]
+ xorps xmm4,xmm7
+ movups [edi],xmm2
+ xorps xmm5,xmm6
+ movups [16+edi],xmm3
+ movups [32+edi],xmm4
+ movups [48+edi],xmm5
+ lea edi,[64+edi]
+ movdqa xmm1,xmm6
+ jmp NEAR L$069xts_dec_done
+align 16
+L$064xts_dec_done6x:
+ mov eax,DWORD [112+esp]
+ and eax,15
+ jz NEAR L$071xts_dec_ret
+ mov DWORD [112+esp],eax
+ jmp NEAR L$072xts_dec_only_one_more
+align 16
+L$069xts_dec_done:
+ mov eax,DWORD [112+esp]
+ pxor xmm0,xmm0
+ and eax,15
+ jz NEAR L$071xts_dec_ret
+ pcmpgtd xmm0,xmm1
+ mov DWORD [112+esp],eax
+ pshufd xmm2,xmm0,19
+ pxor xmm0,xmm0
+ movdqa xmm3,[96+esp]
+ paddq xmm1,xmm1
+ pand xmm2,xmm3
+ pcmpgtd xmm0,xmm1
+ pxor xmm1,xmm2
+L$072xts_dec_only_one_more:
+ pshufd xmm5,xmm0,19
+ movdqa xmm6,xmm1
+ paddq xmm1,xmm1
+ pand xmm5,xmm3
+ pxor xmm5,xmm1
+ mov edx,ebp
+ mov ecx,ebx
+ movups xmm2,[esi]
+ xorps xmm2,xmm5
+ movups xmm0,[edx]
+ movups xmm1,[16+edx]
+ lea edx,[32+edx]
+ xorps xmm2,xmm0
+L$073dec1_loop_13:
+db 102,15,56,222,209
+ dec ecx
+ movups xmm1,[edx]
+ lea edx,[16+edx]
+ jnz NEAR L$073dec1_loop_13
+db 102,15,56,223,209
+ xorps xmm2,xmm5
+ movups [edi],xmm2
+L$074xts_dec_steal:
+ movzx ecx,BYTE [16+esi]
+ movzx edx,BYTE [edi]
+ lea esi,[1+esi]
+ mov BYTE [edi],cl
+ mov BYTE [16+edi],dl
+ lea edi,[1+edi]
+ sub eax,1
+ jnz NEAR L$074xts_dec_steal
+ sub edi,DWORD [112+esp]
+ mov edx,ebp
+ mov ecx,ebx
+ movups xmm2,[edi]
+ xorps xmm2,xmm6
+ movups xmm0,[edx]
+ movups xmm1,[16+edx]
+ lea edx,[32+edx]
+ xorps xmm2,xmm0
+L$075dec1_loop_14:
+db 102,15,56,222,209
+ dec ecx
+ movups xmm1,[edx]
+ lea edx,[16+edx]
+ jnz NEAR L$075dec1_loop_14
+db 102,15,56,223,209
+ xorps xmm2,xmm6
+ movups [edi],xmm2
+L$071xts_dec_ret:
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ movdqa [esp],xmm0
+ pxor xmm3,xmm3
+ movdqa [16+esp],xmm0
+ pxor xmm4,xmm4
+ movdqa [32+esp],xmm0
+ pxor xmm5,xmm5
+ movdqa [48+esp],xmm0
+ pxor xmm6,xmm6
+ movdqa [64+esp],xmm0
+ pxor xmm7,xmm7
+ movdqa [80+esp],xmm0
+ mov esp,DWORD [116+esp]
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _aes_hw_cbc_encrypt
+align 16
+_aes_hw_cbc_encrypt:
+L$_aes_hw_cbc_encrypt_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov esi,DWORD [20+esp]
+ mov ebx,esp
+ mov edi,DWORD [24+esp]
+ sub ebx,24
+ mov eax,DWORD [28+esp]
+ and ebx,-16
+ mov edx,DWORD [32+esp]
+ mov ebp,DWORD [36+esp]
+ test eax,eax
+ jz NEAR L$076cbc_abort
+ cmp DWORD [40+esp],0
+ xchg ebx,esp
+ movups xmm7,[ebp]
+ mov ecx,DWORD [240+edx]
+ mov ebp,edx
+ mov DWORD [16+esp],ebx
+ mov ebx,ecx
+ je NEAR L$077cbc_decrypt
+ movaps xmm2,xmm7
+ cmp eax,16
+ jb NEAR L$078cbc_enc_tail
+ sub eax,16
+ jmp NEAR L$079cbc_enc_loop
+align 16
+L$079cbc_enc_loop:
+ movups xmm7,[esi]
+ lea esi,[16+esi]
+ movups xmm0,[edx]
+ movups xmm1,[16+edx]
+ xorps xmm7,xmm0
+ lea edx,[32+edx]
+ xorps xmm2,xmm7
+L$080enc1_loop_15:
+db 102,15,56,220,209
+ dec ecx
+ movups xmm1,[edx]
+ lea edx,[16+edx]
+ jnz NEAR L$080enc1_loop_15
+db 102,15,56,221,209
+ mov ecx,ebx
+ mov edx,ebp
+ movups [edi],xmm2
+ lea edi,[16+edi]
+ sub eax,16
+ jnc NEAR L$079cbc_enc_loop
+ add eax,16
+ jnz NEAR L$078cbc_enc_tail
+ movaps xmm7,xmm2
+ pxor xmm2,xmm2
+ jmp NEAR L$081cbc_ret
+L$078cbc_enc_tail:
+ mov ecx,eax
+dd 2767451785
+ mov ecx,16
+ sub ecx,eax
+ xor eax,eax
+dd 2868115081
+ lea edi,[edi-16]
+ mov ecx,ebx
+ mov esi,edi
+ mov edx,ebp
+ jmp NEAR L$079cbc_enc_loop
+align 16
+L$077cbc_decrypt:
+ cmp eax,80
+ jbe NEAR L$082cbc_dec_tail
+ movaps [esp],xmm7
+ sub eax,80
+ jmp NEAR L$083cbc_dec_loop6_enter
+align 16
+L$084cbc_dec_loop6:
+ movaps [esp],xmm0
+ movups [edi],xmm7
+ lea edi,[16+edi]
+L$083cbc_dec_loop6_enter:
+ movdqu xmm2,[esi]
+ movdqu xmm3,[16+esi]
+ movdqu xmm4,[32+esi]
+ movdqu xmm5,[48+esi]
+ movdqu xmm6,[64+esi]
+ movdqu xmm7,[80+esi]
+ call __aesni_decrypt6
+ movups xmm1,[esi]
+ movups xmm0,[16+esi]
+ xorps xmm2,[esp]
+ xorps xmm3,xmm1
+ movups xmm1,[32+esi]
+ xorps xmm4,xmm0
+ movups xmm0,[48+esi]
+ xorps xmm5,xmm1
+ movups xmm1,[64+esi]
+ xorps xmm6,xmm0
+ movups xmm0,[80+esi]
+ xorps xmm7,xmm1
+ movups [edi],xmm2
+ movups [16+edi],xmm3
+ lea esi,[96+esi]
+ movups [32+edi],xmm4
+ mov ecx,ebx
+ movups [48+edi],xmm5
+ mov edx,ebp
+ movups [64+edi],xmm6
+ lea edi,[80+edi]
+ sub eax,96
+ ja NEAR L$084cbc_dec_loop6
+ movaps xmm2,xmm7
+ movaps xmm7,xmm0
+ add eax,80
+ jle NEAR L$085cbc_dec_clear_tail_collected
+ movups [edi],xmm2
+ lea edi,[16+edi]
+L$082cbc_dec_tail:
+ movups xmm2,[esi]
+ movaps xmm6,xmm2
+ cmp eax,16
+ jbe NEAR L$086cbc_dec_one
+ movups xmm3,[16+esi]
+ movaps xmm5,xmm3
+ cmp eax,32
+ jbe NEAR L$087cbc_dec_two
+ movups xmm4,[32+esi]
+ cmp eax,48
+ jbe NEAR L$088cbc_dec_three
+ movups xmm5,[48+esi]
+ cmp eax,64
+ jbe NEAR L$089cbc_dec_four
+ movups xmm6,[64+esi]
+ movaps [esp],xmm7
+ movups xmm2,[esi]
+ xorps xmm7,xmm7
+ call __aesni_decrypt6
+ movups xmm1,[esi]
+ movups xmm0,[16+esi]
+ xorps xmm2,[esp]
+ xorps xmm3,xmm1
+ movups xmm1,[32+esi]
+ xorps xmm4,xmm0
+ movups xmm0,[48+esi]
+ xorps xmm5,xmm1
+ movups xmm7,[64+esi]
+ xorps xmm6,xmm0
+ movups [edi],xmm2
+ movups [16+edi],xmm3
+ pxor xmm3,xmm3
+ movups [32+edi],xmm4
+ pxor xmm4,xmm4
+ movups [48+edi],xmm5
+ pxor xmm5,xmm5
+ lea edi,[64+edi]
+ movaps xmm2,xmm6
+ pxor xmm6,xmm6
+ sub eax,80
+ jmp NEAR L$090cbc_dec_tail_collected
+align 16
+L$086cbc_dec_one:
+ movups xmm0,[edx]
+ movups xmm1,[16+edx]
+ lea edx,[32+edx]
+ xorps xmm2,xmm0
+L$091dec1_loop_16:
+db 102,15,56,222,209
+ dec ecx
+ movups xmm1,[edx]
+ lea edx,[16+edx]
+ jnz NEAR L$091dec1_loop_16
+db 102,15,56,223,209
+ xorps xmm2,xmm7
+ movaps xmm7,xmm6
+ sub eax,16
+ jmp NEAR L$090cbc_dec_tail_collected
+align 16
+L$087cbc_dec_two:
+ call __aesni_decrypt2
+ xorps xmm2,xmm7
+ xorps xmm3,xmm6
+ movups [edi],xmm2
+ movaps xmm2,xmm3
+ pxor xmm3,xmm3
+ lea edi,[16+edi]
+ movaps xmm7,xmm5
+ sub eax,32
+ jmp NEAR L$090cbc_dec_tail_collected
+align 16
+L$088cbc_dec_three:
+ call __aesni_decrypt3
+ xorps xmm2,xmm7
+ xorps xmm3,xmm6
+ xorps xmm4,xmm5
+ movups [edi],xmm2
+ movaps xmm2,xmm4
+ pxor xmm4,xmm4
+ movups [16+edi],xmm3
+ pxor xmm3,xmm3
+ lea edi,[32+edi]
+ movups xmm7,[32+esi]
+ sub eax,48
+ jmp NEAR L$090cbc_dec_tail_collected
+align 16
+L$089cbc_dec_four:
+ call __aesni_decrypt4
+ movups xmm1,[16+esi]
+ movups xmm0,[32+esi]
+ xorps xmm2,xmm7
+ movups xmm7,[48+esi]
+ xorps xmm3,xmm6
+ movups [edi],xmm2
+ xorps xmm4,xmm1
+ movups [16+edi],xmm3
+ pxor xmm3,xmm3
+ xorps xmm5,xmm0
+ movups [32+edi],xmm4
+ pxor xmm4,xmm4
+ lea edi,[48+edi]
+ movaps xmm2,xmm5
+ pxor xmm5,xmm5
+ sub eax,64
+ jmp NEAR L$090cbc_dec_tail_collected
+align 16
+L$085cbc_dec_clear_tail_collected:
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ pxor xmm6,xmm6
+L$090cbc_dec_tail_collected:
+ and eax,15
+ jnz NEAR L$092cbc_dec_tail_partial
+ movups [edi],xmm2
+ pxor xmm0,xmm0
+ jmp NEAR L$081cbc_ret
+align 16
+L$092cbc_dec_tail_partial:
+ movaps [esp],xmm2
+ pxor xmm0,xmm0
+ mov ecx,16
+ mov esi,esp
+ sub ecx,eax
+dd 2767451785
+ movdqa [esp],xmm2
+L$081cbc_ret:
+ mov esp,DWORD [16+esp]
+ mov ebp,DWORD [36+esp]
+ pxor xmm2,xmm2
+ pxor xmm1,xmm1
+ movups [ebp],xmm7
+ pxor xmm7,xmm7
+L$076cbc_abort:
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+align 16
+__aesni_set_encrypt_key:
+ push ebp
+ push ebx
+ test eax,eax
+ jz NEAR L$093bad_pointer
+ test edx,edx
+ jz NEAR L$093bad_pointer
+ call L$094pic
+L$094pic:
+ pop ebx
+ lea ebx,[(L$key_const-L$094pic)+ebx]
+ lea ebp,[_OPENSSL_ia32cap_P]
+ movups xmm0,[eax]
+ xorps xmm4,xmm4
+ mov ebp,DWORD [4+ebp]
+ lea edx,[16+edx]
+ and ebp,268437504
+ cmp ecx,256
+ je NEAR L$09514rounds
+ cmp ecx,192
+ je NEAR L$09612rounds
+ cmp ecx,128
+ jne NEAR L$097bad_keybits
+align 16
+L$09810rounds:
+ cmp ebp,268435456
+ je NEAR L$09910rounds_alt
+ mov ecx,9
+ movups [edx-16],xmm0
+db 102,15,58,223,200,1
+ call L$100key_128_cold
+db 102,15,58,223,200,2
+ call L$101key_128
+db 102,15,58,223,200,4
+ call L$101key_128
+db 102,15,58,223,200,8
+ call L$101key_128
+db 102,15,58,223,200,16
+ call L$101key_128
+db 102,15,58,223,200,32
+ call L$101key_128
+db 102,15,58,223,200,64
+ call L$101key_128
+db 102,15,58,223,200,128
+ call L$101key_128
+db 102,15,58,223,200,27
+ call L$101key_128
+db 102,15,58,223,200,54
+ call L$101key_128
+ movups [edx],xmm0
+ mov DWORD [80+edx],ecx
+ jmp NEAR L$102good_key
+align 16
+L$101key_128:
+ movups [edx],xmm0
+ lea edx,[16+edx]
+L$100key_128_cold:
+ shufps xmm4,xmm0,16
+ xorps xmm0,xmm4
+ shufps xmm4,xmm0,140
+ xorps xmm0,xmm4
+ shufps xmm1,xmm1,255
+ xorps xmm0,xmm1
+ ret
+align 16
+L$09910rounds_alt:
+ movdqa xmm5,[ebx]
+ mov ecx,8
+ movdqa xmm4,[32+ebx]
+ movdqa xmm2,xmm0
+ movdqu [edx-16],xmm0
+L$103loop_key128:
+db 102,15,56,0,197
+db 102,15,56,221,196
+ pslld xmm4,1
+ lea edx,[16+edx]
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+ pxor xmm0,xmm2
+ movdqu [edx-16],xmm0
+ movdqa xmm2,xmm0
+ dec ecx
+ jnz NEAR L$103loop_key128
+ movdqa xmm4,[48+ebx]
+db 102,15,56,0,197
+db 102,15,56,221,196
+ pslld xmm4,1
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+ pxor xmm0,xmm2
+ movdqu [edx],xmm0
+ movdqa xmm2,xmm0
+db 102,15,56,0,197
+db 102,15,56,221,196
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+ pxor xmm0,xmm2
+ movdqu [16+edx],xmm0
+ mov ecx,9
+ mov DWORD [96+edx],ecx
+ jmp NEAR L$102good_key
+align 16
+L$09612rounds:
+ movq xmm2,[16+eax]
+ cmp ebp,268435456
+ je NEAR L$10412rounds_alt
+ mov ecx,11
+ movups [edx-16],xmm0
+db 102,15,58,223,202,1
+ call L$105key_192a_cold
+db 102,15,58,223,202,2
+ call L$106key_192b
+db 102,15,58,223,202,4
+ call L$107key_192a
+db 102,15,58,223,202,8
+ call L$106key_192b
+db 102,15,58,223,202,16
+ call L$107key_192a
+db 102,15,58,223,202,32
+ call L$106key_192b
+db 102,15,58,223,202,64
+ call L$107key_192a
+db 102,15,58,223,202,128
+ call L$106key_192b
+ movups [edx],xmm0
+ mov DWORD [48+edx],ecx
+ jmp NEAR L$102good_key
+align 16
+L$107key_192a:
+ movups [edx],xmm0
+ lea edx,[16+edx]
+align 16
+L$105key_192a_cold:
+ movaps xmm5,xmm2
+L$108key_192b_warm:
+ shufps xmm4,xmm0,16
+ movdqa xmm3,xmm2
+ xorps xmm0,xmm4
+ shufps xmm4,xmm0,140
+ pslldq xmm3,4
+ xorps xmm0,xmm4
+ pshufd xmm1,xmm1,85
+ pxor xmm2,xmm3
+ pxor xmm0,xmm1
+ pshufd xmm3,xmm0,255
+ pxor xmm2,xmm3
+ ret
+align 16
+L$106key_192b:
+ movaps xmm3,xmm0
+ shufps xmm5,xmm0,68
+ movups [edx],xmm5
+ shufps xmm3,xmm2,78
+ movups [16+edx],xmm3
+ lea edx,[32+edx]
+ jmp NEAR L$108key_192b_warm
+align 16
+L$10412rounds_alt:
+ movdqa xmm5,[16+ebx]
+ movdqa xmm4,[32+ebx]
+ mov ecx,8
+ movdqu [edx-16],xmm0
+L$109loop_key192:
+ movq [edx],xmm2
+ movdqa xmm1,xmm2
+db 102,15,56,0,213
+db 102,15,56,221,212
+ pslld xmm4,1
+ lea edx,[24+edx]
+ movdqa xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm0,xmm3
+ pshufd xmm3,xmm0,255
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+ pxor xmm0,xmm2
+ pxor xmm2,xmm3
+ movdqu [edx-16],xmm0
+ dec ecx
+ jnz NEAR L$109loop_key192
+ mov ecx,11
+ mov DWORD [32+edx],ecx
+ jmp NEAR L$102good_key
+align 16
+L$09514rounds:
+ movups xmm2,[16+eax]
+ lea edx,[16+edx]
+ cmp ebp,268435456
+ je NEAR L$11014rounds_alt
+ mov ecx,13
+ movups [edx-32],xmm0
+ movups [edx-16],xmm2
+db 102,15,58,223,202,1
+ call L$111key_256a_cold
+db 102,15,58,223,200,1
+ call L$112key_256b
+db 102,15,58,223,202,2
+ call L$113key_256a
+db 102,15,58,223,200,2
+ call L$112key_256b
+db 102,15,58,223,202,4
+ call L$113key_256a
+db 102,15,58,223,200,4
+ call L$112key_256b
+db 102,15,58,223,202,8
+ call L$113key_256a
+db 102,15,58,223,200,8
+ call L$112key_256b
+db 102,15,58,223,202,16
+ call L$113key_256a
+db 102,15,58,223,200,16
+ call L$112key_256b
+db 102,15,58,223,202,32
+ call L$113key_256a
+db 102,15,58,223,200,32
+ call L$112key_256b
+db 102,15,58,223,202,64
+ call L$113key_256a
+ movups [edx],xmm0
+ mov DWORD [16+edx],ecx
+ xor eax,eax
+ jmp NEAR L$102good_key
+align 16
+L$113key_256a:
+ movups [edx],xmm2
+ lea edx,[16+edx]
+L$111key_256a_cold:
+ shufps xmm4,xmm0,16
+ xorps xmm0,xmm4
+ shufps xmm4,xmm0,140
+ xorps xmm0,xmm4
+ shufps xmm1,xmm1,255
+ xorps xmm0,xmm1
+ ret
+align 16
+L$112key_256b:
+ movups [edx],xmm0
+ lea edx,[16+edx]
+ shufps xmm4,xmm2,16
+ xorps xmm2,xmm4
+ shufps xmm4,xmm2,140
+ xorps xmm2,xmm4
+ shufps xmm1,xmm1,170
+ xorps xmm2,xmm1
+ ret
+align 16
+L$11014rounds_alt:
+ movdqa xmm5,[ebx]
+ movdqa xmm4,[32+ebx]
+ mov ecx,7
+ movdqu [edx-32],xmm0
+ movdqa xmm1,xmm2
+ movdqu [edx-16],xmm2
+L$114loop_key256:
+db 102,15,56,0,213
+db 102,15,56,221,212
+ movdqa xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm0,xmm3
+ pslld xmm4,1
+ pxor xmm0,xmm2
+ movdqu [edx],xmm0
+ dec ecx
+ jz NEAR L$115done_key256
+ pshufd xmm2,xmm0,255
+ pxor xmm3,xmm3
+db 102,15,56,221,211
+ movdqa xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm1,xmm3
+ pxor xmm2,xmm1
+ movdqu [16+edx],xmm2
+ lea edx,[32+edx]
+ movdqa xmm1,xmm2
+ jmp NEAR L$114loop_key256
+L$115done_key256:
+ mov ecx,13
+ mov DWORD [16+edx],ecx
+L$102good_key:
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ xor eax,eax
+ pop ebx
+ pop ebp
+ ret
+align 4
+L$093bad_pointer:
+ mov eax,-1
+ pop ebx
+ pop ebp
+ ret
+align 4
+L$097bad_keybits:
+ pxor xmm0,xmm0
+ mov eax,-2
+ pop ebx
+ pop ebp
+ ret
+global _aes_hw_set_encrypt_key
+align 16
+_aes_hw_set_encrypt_key:
+L$_aes_hw_set_encrypt_key_begin:
+%ifdef BORINGSSL_DISPATCH_TEST
+ push ebx
+ push edx
+ call L$116pic
+L$116pic:
+ pop ebx
+ lea ebx,[(_BORINGSSL_function_hit+3-L$116pic)+ebx]
+ mov edx,1
+ mov BYTE [ebx],dl
+ pop edx
+ pop ebx
+%endif
+ mov eax,DWORD [4+esp]
+ mov ecx,DWORD [8+esp]
+ mov edx,DWORD [12+esp]
+ call __aesni_set_encrypt_key
+ ret
+global _aes_hw_set_decrypt_key
+align 16
+_aes_hw_set_decrypt_key:
+L$_aes_hw_set_decrypt_key_begin:
+ mov eax,DWORD [4+esp]
+ mov ecx,DWORD [8+esp]
+ mov edx,DWORD [12+esp]
+ call __aesni_set_encrypt_key
+ mov edx,DWORD [12+esp]
+ shl ecx,4
+ test eax,eax
+ jnz NEAR L$117dec_key_ret
+ lea eax,[16+ecx*1+edx]
+ movups xmm0,[edx]
+ movups xmm1,[eax]
+ movups [eax],xmm0
+ movups [edx],xmm1
+ lea edx,[16+edx]
+ lea eax,[eax-16]
+L$118dec_key_inverse:
+ movups xmm0,[edx]
+ movups xmm1,[eax]
+db 102,15,56,219,192
+db 102,15,56,219,201
+ lea edx,[16+edx]
+ lea eax,[eax-16]
+ movups [16+eax],xmm0
+ movups [edx-16],xmm1
+ cmp eax,edx
+ ja NEAR L$118dec_key_inverse
+ movups xmm0,[edx]
+db 102,15,56,219,192
+ movups [edx],xmm0
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ xor eax,eax
+L$117dec_key_ret:
+ ret
+align 64
+L$key_const:
+dd 202313229,202313229,202313229,202313229
+dd 67569157,67569157,67569157,67569157
+dd 1,1,1,1
+dd 27,27,27,27
+db 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
+db 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
+db 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
+db 115,108,46,111,114,103,62,0
+segment .bss
+common _OPENSSL_ia32cap_P 16
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/aesni-x86_64-apple.S b/gen/bcm/aesni-x86_64-apple.S
new file mode 100644
index 0000000..f3505b9
--- /dev/null
+++ b/gen/bcm/aesni-x86_64-apple.S
@@ -0,0 +1,2359 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text
+
+.globl _aes_hw_encrypt
+.private_extern _aes_hw_encrypt
+
+.p2align 4
+_aes_hw_encrypt:
+
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+
+ movb $1,_BORINGSSL_function_hit+1(%rip)
+#endif
+ movups (%rdi),%xmm2
+ movl 240(%rdx),%eax
+ movups (%rdx),%xmm0
+ movups 16(%rdx),%xmm1
+ leaq 32(%rdx),%rdx
+ xorps %xmm0,%xmm2
+L$oop_enc1_1:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rdx),%xmm1
+ leaq 16(%rdx),%rdx
+ jnz L$oop_enc1_1
+.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ ret
+
+
+
+.globl _aes_hw_decrypt
+.private_extern _aes_hw_decrypt
+
+.p2align 4
+_aes_hw_decrypt:
+
+_CET_ENDBR
+ movups (%rdi),%xmm2
+ movl 240(%rdx),%eax
+ movups (%rdx),%xmm0
+ movups 16(%rdx),%xmm1
+ leaq 32(%rdx),%rdx
+ xorps %xmm0,%xmm2
+L$oop_dec1_2:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rdx),%xmm1
+ leaq 16(%rdx),%rdx
+ jnz L$oop_dec1_2
+.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ ret
+
+
+
+.p2align 4
+_aesni_encrypt2:
+
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+L$enc_loop2:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$enc_loop2
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ ret
+
+
+
+.p2align 4
+_aesni_decrypt2:
+
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+L$dec_loop2:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$dec_loop2
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+ ret
+
+
+
+.p2align 4
+_aesni_encrypt3:
+
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+L$enc_loop3:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$enc_loop3
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+ ret
+
+
+
+.p2align 4
+_aesni_decrypt3:
+
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+L$dec_loop3:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$dec_loop3
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+ ret
+
+
+
+.p2align 4
+_aesni_encrypt4:
+
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ xorps %xmm0,%xmm5
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
+
+L$enc_loop4:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$enc_loop4
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+ ret
+
+
+
+.p2align 4
+_aesni_decrypt4:
+
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ xorps %xmm0,%xmm5
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
+
+L$dec_loop4:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$dec_loop4
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+ ret
+
+
+
+.p2align 4
+_aesni_encrypt6:
+
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm7
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp L$enc_loop6_enter
+.p2align 4
+L$enc_loop6:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+L$enc_loop6_enter:
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$enc_loop6
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+ ret
+
+
+
+.p2align 4
+_aesni_decrypt6:
+
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+.byte 102,15,56,222,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm7
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp L$dec_loop6_enter
+.p2align 4
+L$dec_loop6:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+L$dec_loop6_enter:
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$dec_loop6
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+ ret
+
+
+
+.p2align 4
+_aesni_encrypt8:
+
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm7
+ pxor %xmm0,%xmm8
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm9
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp L$enc_loop8_inner
+.p2align 4
+L$enc_loop8:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+L$enc_loop8_inner:
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+L$enc_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$enc_loop8
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+.byte 102,68,15,56,221,192
+.byte 102,68,15,56,221,200
+ ret
+
+
+
+.p2align 4
+_aesni_decrypt8:
+
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm7
+ pxor %xmm0,%xmm8
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm9
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp L$dec_loop8_inner
+.p2align 4
+L$dec_loop8:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+L$dec_loop8_inner:
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+L$dec_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$dec_loop8
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+.byte 102,68,15,56,223,192
+.byte 102,68,15,56,223,200
+ ret
+
+
+.globl _aes_hw_ecb_encrypt
+.private_extern _aes_hw_ecb_encrypt
+
+.p2align 4
+_aes_hw_ecb_encrypt:
+
+_CET_ENDBR
+ andq $-16,%rdx
+ jz L$ecb_ret
+
+ movl 240(%rcx),%eax
+ movups (%rcx),%xmm0
+ movq %rcx,%r11
+ movl %eax,%r10d
+ testl %r8d,%r8d
+ jz L$ecb_decrypt
+
+ cmpq $0x80,%rdx
+ jb L$ecb_enc_tail
+
+ movdqu (%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ movdqu 48(%rdi),%xmm5
+ movdqu 64(%rdi),%xmm6
+ movdqu 80(%rdi),%xmm7
+ movdqu 96(%rdi),%xmm8
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+ subq $0x80,%rdx
+ jmp L$ecb_enc_loop8_enter
+.p2align 4
+L$ecb_enc_loop8:
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movdqu (%rdi),%xmm2
+ movl %r10d,%eax
+ movups %xmm3,16(%rsi)
+ movdqu 16(%rdi),%xmm3
+ movups %xmm4,32(%rsi)
+ movdqu 32(%rdi),%xmm4
+ movups %xmm5,48(%rsi)
+ movdqu 48(%rdi),%xmm5
+ movups %xmm6,64(%rsi)
+ movdqu 64(%rdi),%xmm6
+ movups %xmm7,80(%rsi)
+ movdqu 80(%rdi),%xmm7
+ movups %xmm8,96(%rsi)
+ movdqu 96(%rdi),%xmm8
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+L$ecb_enc_loop8_enter:
+
+ call _aesni_encrypt8
+
+ subq $0x80,%rdx
+ jnc L$ecb_enc_loop8
+
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movups %xmm3,16(%rsi)
+ movl %r10d,%eax
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ addq $0x80,%rdx
+ jz L$ecb_ret
+
+L$ecb_enc_tail:
+ movups (%rdi),%xmm2
+ cmpq $0x20,%rdx
+ jb L$ecb_enc_one
+ movups 16(%rdi),%xmm3
+ je L$ecb_enc_two
+ movups 32(%rdi),%xmm4
+ cmpq $0x40,%rdx
+ jb L$ecb_enc_three
+ movups 48(%rdi),%xmm5
+ je L$ecb_enc_four
+ movups 64(%rdi),%xmm6
+ cmpq $0x60,%rdx
+ jb L$ecb_enc_five
+ movups 80(%rdi),%xmm7
+ je L$ecb_enc_six
+ movdqu 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
+ call _aesni_encrypt8
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_enc_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_enc1_3:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_enc1_3
+.byte 102,15,56,221,209
+ movups %xmm2,(%rsi)
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_enc_two:
+ call _aesni_encrypt2
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_enc_three:
+ call _aesni_encrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_enc_four:
+ call _aesni_encrypt4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_enc_five:
+ xorps %xmm7,%xmm7
+ call _aesni_encrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_enc_six:
+ call _aesni_encrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ jmp L$ecb_ret
+
+.p2align 4
+L$ecb_decrypt:
+ cmpq $0x80,%rdx
+ jb L$ecb_dec_tail
+
+ movdqu (%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ movdqu 48(%rdi),%xmm5
+ movdqu 64(%rdi),%xmm6
+ movdqu 80(%rdi),%xmm7
+ movdqu 96(%rdi),%xmm8
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+ subq $0x80,%rdx
+ jmp L$ecb_dec_loop8_enter
+.p2align 4
+L$ecb_dec_loop8:
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movdqu (%rdi),%xmm2
+ movl %r10d,%eax
+ movups %xmm3,16(%rsi)
+ movdqu 16(%rdi),%xmm3
+ movups %xmm4,32(%rsi)
+ movdqu 32(%rdi),%xmm4
+ movups %xmm5,48(%rsi)
+ movdqu 48(%rdi),%xmm5
+ movups %xmm6,64(%rsi)
+ movdqu 64(%rdi),%xmm6
+ movups %xmm7,80(%rsi)
+ movdqu 80(%rdi),%xmm7
+ movups %xmm8,96(%rsi)
+ movdqu 96(%rdi),%xmm8
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+L$ecb_dec_loop8_enter:
+
+ call _aesni_decrypt8
+
+ movups (%r11),%xmm0
+ subq $0x80,%rdx
+ jnc L$ecb_dec_loop8
+
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movq %r11,%rcx
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movl %r10d,%eax
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
+ movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
+ movups %xmm9,112(%rsi)
+ pxor %xmm9,%xmm9
+ leaq 128(%rsi),%rsi
+ addq $0x80,%rdx
+ jz L$ecb_ret
+
+L$ecb_dec_tail:
+ movups (%rdi),%xmm2
+ cmpq $0x20,%rdx
+ jb L$ecb_dec_one
+ movups 16(%rdi),%xmm3
+ je L$ecb_dec_two
+ movups 32(%rdi),%xmm4
+ cmpq $0x40,%rdx
+ jb L$ecb_dec_three
+ movups 48(%rdi),%xmm5
+ je L$ecb_dec_four
+ movups 64(%rdi),%xmm6
+ cmpq $0x60,%rdx
+ jb L$ecb_dec_five
+ movups 80(%rdi),%xmm7
+ je L$ecb_dec_six
+ movups 96(%rdi),%xmm8
+ movups (%rcx),%xmm0
+ xorps %xmm9,%xmm9
+ call _aesni_decrypt8
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
+ movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_dec_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_dec1_4:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_dec1_4
+.byte 102,15,56,223,209
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_dec_two:
+ call _aesni_decrypt2
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_dec_three:
+ call _aesni_decrypt3
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_dec_four:
+ call _aesni_decrypt4
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_dec_five:
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_dec_six:
+ call _aesni_decrypt6
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
+
+L$ecb_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ ret
+
+
+.globl _aes_hw_ctr32_encrypt_blocks
+.private_extern _aes_hw_ctr32_encrypt_blocks
+
+.p2align 4
+_aes_hw_ctr32_encrypt_blocks:
+
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+ movb $1,_BORINGSSL_function_hit(%rip)
+#endif
+ cmpq $1,%rdx
+ jne L$ctr32_bulk
+
+
+
+ movups (%r8),%xmm2
+ movups (%rdi),%xmm3
+ movl 240(%rcx),%edx
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_enc1_5:
+.byte 102,15,56,220,209
+ decl %edx
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_enc1_5
+.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm2,%xmm2
+ jmp L$ctr32_epilogue
+
+.p2align 4
+L$ctr32_bulk:
+ leaq (%rsp),%r11
+
+ pushq %rbp
+
+ subq $128,%rsp
+ andq $-16,%rsp
+
+
+
+
+ movdqu (%r8),%xmm2
+ movdqu (%rcx),%xmm0
+ movl 12(%r8),%r8d
+ pxor %xmm0,%xmm2
+ movl 12(%rcx),%ebp
+ movdqa %xmm2,0(%rsp)
+ bswapl %r8d
+ movdqa %xmm2,%xmm3
+ movdqa %xmm2,%xmm4
+ movdqa %xmm2,%xmm5
+ movdqa %xmm2,64(%rsp)
+ movdqa %xmm2,80(%rsp)
+ movdqa %xmm2,96(%rsp)
+ movq %rdx,%r10
+ movdqa %xmm2,112(%rsp)
+
+ leaq 1(%r8),%rax
+ leaq 2(%r8),%rdx
+ bswapl %eax
+ bswapl %edx
+ xorl %ebp,%eax
+ xorl %ebp,%edx
+.byte 102,15,58,34,216,3
+ leaq 3(%r8),%rax
+ movdqa %xmm3,16(%rsp)
+.byte 102,15,58,34,226,3
+ bswapl %eax
+ movq %r10,%rdx
+ leaq 4(%r8),%r10
+ movdqa %xmm4,32(%rsp)
+ xorl %ebp,%eax
+ bswapl %r10d
+.byte 102,15,58,34,232,3
+ xorl %ebp,%r10d
+ movdqa %xmm5,48(%rsp)
+ leaq 5(%r8),%r9
+ movl %r10d,64+12(%rsp)
+ bswapl %r9d
+ leaq 6(%r8),%r10
+ movl 240(%rcx),%eax
+ xorl %ebp,%r9d
+ bswapl %r10d
+ movl %r9d,80+12(%rsp)
+ xorl %ebp,%r10d
+ leaq 7(%r8),%r9
+ movl %r10d,96+12(%rsp)
+ bswapl %r9d
+ xorl %ebp,%r9d
+ movl %r9d,112+12(%rsp)
+
+ movups 16(%rcx),%xmm1
+
+ movdqa 64(%rsp),%xmm6
+ movdqa 80(%rsp),%xmm7
+
+ cmpq $8,%rdx
+ jb L$ctr32_tail
+
+ leaq 128(%rcx),%rcx
+ subq $8,%rdx
+ jmp L$ctr32_loop8
+
+.p2align 5
+L$ctr32_loop8:
+ addl $8,%r8d
+ movdqa 96(%rsp),%xmm8
+.byte 102,15,56,220,209
+ movl %r8d,%r9d
+ movdqa 112(%rsp),%xmm9
+.byte 102,15,56,220,217
+ bswapl %r9d
+ movups 32-128(%rcx),%xmm0
+.byte 102,15,56,220,225
+ xorl %ebp,%r9d
+ nop
+.byte 102,15,56,220,233
+ movl %r9d,0+12(%rsp)
+ leaq 1(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 48-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ xorl %ebp,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movl %r9d,16+12(%rsp)
+ leaq 2(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 64-128(%rcx),%xmm0
+ bswapl %r9d
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ xorl %ebp,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movl %r9d,32+12(%rsp)
+ leaq 3(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 80-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ xorl %ebp,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movl %r9d,48+12(%rsp)
+ leaq 4(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 96-128(%rcx),%xmm0
+ bswapl %r9d
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ xorl %ebp,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movl %r9d,64+12(%rsp)
+ leaq 5(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 112-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ xorl %ebp,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movl %r9d,80+12(%rsp)
+ leaq 6(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 128-128(%rcx),%xmm0
+ bswapl %r9d
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ xorl %ebp,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movl %r9d,96+12(%rsp)
+ leaq 7(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 144-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+ xorl %ebp,%r9d
+ movdqu 0(%rdi),%xmm10
+.byte 102,15,56,220,232
+ movl %r9d,112+12(%rsp)
+ cmpl $11,%eax
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 160-128(%rcx),%xmm0
+
+ jb L$ctr32_enc_done
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 176-128(%rcx),%xmm1
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 192-128(%rcx),%xmm0
+ je L$ctr32_enc_done
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 208-128(%rcx),%xmm1
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 224-128(%rcx),%xmm0
+ jmp L$ctr32_enc_done
+
+.p2align 4
+L$ctr32_enc_done:
+ movdqu 16(%rdi),%xmm11
+ pxor %xmm0,%xmm10
+ movdqu 32(%rdi),%xmm12
+ pxor %xmm0,%xmm11
+ movdqu 48(%rdi),%xmm13
+ pxor %xmm0,%xmm12
+ movdqu 64(%rdi),%xmm14
+ pxor %xmm0,%xmm13
+ movdqu 80(%rdi),%xmm15
+ pxor %xmm0,%xmm14
+ prefetcht0 448(%rdi)
+ prefetcht0 512(%rdi)
+ pxor %xmm0,%xmm15
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movdqu 96(%rdi),%xmm1
+ leaq 128(%rdi),%rdi
+
+.byte 102,65,15,56,221,210
+ pxor %xmm0,%xmm1
+ movdqu 112-128(%rdi),%xmm10
+.byte 102,65,15,56,221,219
+ pxor %xmm0,%xmm10
+ movdqa 0(%rsp),%xmm11
+.byte 102,65,15,56,221,228
+.byte 102,65,15,56,221,237
+ movdqa 16(%rsp),%xmm12
+ movdqa 32(%rsp),%xmm13
+.byte 102,65,15,56,221,246
+.byte 102,65,15,56,221,255
+ movdqa 48(%rsp),%xmm14
+ movdqa 64(%rsp),%xmm15
+.byte 102,68,15,56,221,193
+ movdqa 80(%rsp),%xmm0
+ movups 16-128(%rcx),%xmm1
+.byte 102,69,15,56,221,202
+
+ movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
+ movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
+ movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
+ movups %xmm5,48(%rsi)
+ movdqa %xmm14,%xmm5
+ movups %xmm6,64(%rsi)
+ movdqa %xmm15,%xmm6
+ movups %xmm7,80(%rsi)
+ movdqa %xmm0,%xmm7
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+
+ subq $8,%rdx
+ jnc L$ctr32_loop8
+
+ addq $8,%rdx
+ jz L$ctr32_done
+ leaq -128(%rcx),%rcx
+
+L$ctr32_tail:
+
+
+ leaq 16(%rcx),%rcx
+ cmpq $4,%rdx
+ jb L$ctr32_loop3
+ je L$ctr32_loop4
+
+
+ shll $4,%eax
+ movdqa 96(%rsp),%xmm8
+ pxor %xmm9,%xmm9
+
+ movups 16(%rcx),%xmm0
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ leaq 32-16(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,225
+ addq $16,%rax
+ movups (%rdi),%xmm10
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+ movups 16(%rdi),%xmm11
+ movups 32(%rdi),%xmm12
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+
+ call L$enc_loop8_enter
+
+ movdqu 48(%rdi),%xmm13
+ pxor %xmm10,%xmm2
+ movdqu 64(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm10,%xmm6
+ movdqu %xmm5,48(%rsi)
+ movdqu %xmm6,64(%rsi)
+ cmpq $6,%rdx
+ jb L$ctr32_done
+
+ movups 80(%rdi),%xmm11
+ xorps %xmm11,%xmm7
+ movups %xmm7,80(%rsi)
+ je L$ctr32_done
+
+ movups 96(%rdi),%xmm12
+ xorps %xmm12,%xmm8
+ movups %xmm8,96(%rsi)
+ jmp L$ctr32_done
+
+.p2align 5
+L$ctr32_loop4:
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+ decl %eax
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups (%rcx),%xmm1
+ jnz L$ctr32_loop4
+.byte 102,15,56,221,209
+.byte 102,15,56,221,217
+ movups (%rdi),%xmm10
+ movups 16(%rdi),%xmm11
+.byte 102,15,56,221,225
+.byte 102,15,56,221,233
+ movups 32(%rdi),%xmm12
+ movups 48(%rdi),%xmm13
+
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ xorps %xmm11,%xmm3
+ movups %xmm3,16(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm5,48(%rsi)
+ jmp L$ctr32_done
+
+.p2align 5
+L$ctr32_loop3:
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+ decl %eax
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+ movups (%rcx),%xmm1
+ jnz L$ctr32_loop3
+.byte 102,15,56,221,209
+.byte 102,15,56,221,217
+.byte 102,15,56,221,225
+
+ movups (%rdi),%xmm10
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ cmpq $2,%rdx
+ jb L$ctr32_done
+
+ movups 16(%rdi),%xmm11
+ xorps %xmm11,%xmm3
+ movups %xmm3,16(%rsi)
+ je L$ctr32_done
+
+ movups 32(%rdi),%xmm12
+ xorps %xmm12,%xmm4
+ movups %xmm4,32(%rsi)
+
+L$ctr32_done:
+ xorps %xmm0,%xmm0
+ xorl %ebp,%ebp
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ movaps %xmm0,112(%rsp)
+ pxor %xmm15,%xmm15
+ movq -8(%r11),%rbp
+
+ leaq (%r11),%rsp
+
+L$ctr32_epilogue:
+ ret
+
+
+.globl _aes_hw_cbc_encrypt
+.private_extern _aes_hw_cbc_encrypt
+
+.p2align 4
+_aes_hw_cbc_encrypt:
+
+_CET_ENDBR
+ testq %rdx,%rdx
+ jz L$cbc_ret
+
+ movl 240(%rcx),%r10d
+ movq %rcx,%r11
+ testl %r9d,%r9d
+ jz L$cbc_decrypt
+
+ movups (%r8),%xmm2
+ movl %r10d,%eax
+ cmpq $16,%rdx
+ jb L$cbc_enc_tail
+ subq $16,%rdx
+ jmp L$cbc_enc_loop
+.p2align 4
+L$cbc_enc_loop:
+ movups (%rdi),%xmm3
+ leaq 16(%rdi),%rdi
+
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm3
+ leaq 32(%rcx),%rcx
+ xorps %xmm3,%xmm2
+L$oop_enc1_6:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_enc1_6
+.byte 102,15,56,221,209
+ movl %r10d,%eax
+ movq %r11,%rcx
+ movups %xmm2,0(%rsi)
+ leaq 16(%rsi),%rsi
+ subq $16,%rdx
+ jnc L$cbc_enc_loop
+ addq $16,%rdx
+ jnz L$cbc_enc_tail
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movups %xmm2,(%r8)
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ jmp L$cbc_ret
+
+L$cbc_enc_tail:
+ movq %rdx,%rcx
+ xchgq %rdi,%rsi
+.long 0x9066A4F3
+ movl $16,%ecx
+ subq %rdx,%rcx
+ xorl %eax,%eax
+.long 0x9066AAF3
+ leaq -16(%rdi),%rdi
+ movl %r10d,%eax
+ movq %rdi,%rsi
+ movq %r11,%rcx
+ xorq %rdx,%rdx
+ jmp L$cbc_enc_loop
+
+.p2align 4
+L$cbc_decrypt:
+ cmpq $16,%rdx
+ jne L$cbc_decrypt_bulk
+
+
+
+ movdqu (%rdi),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa %xmm2,%xmm4
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_dec1_7:
+.byte 102,15,56,222,209
+ decl %r10d
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_dec1_7
+.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movdqu %xmm4,(%r8)
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ jmp L$cbc_ret
+.p2align 4
+L$cbc_decrypt_bulk:
+ leaq (%rsp),%r11
+
+ pushq %rbp
+
+ subq $16,%rsp
+ andq $-16,%rsp
+ movq %rcx,%rbp
+ movups (%r8),%xmm10
+ movl %r10d,%eax
+ cmpq $0x50,%rdx
+ jbe L$cbc_dec_tail
+
+ movups (%rcx),%xmm0
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+ cmpq $0x70,%rdx
+ jbe L$cbc_dec_six_or_seven
+
+ subq $0x70,%rdx
+ leaq 112(%rcx),%rcx
+ jmp L$cbc_dec_loop8_enter
+.p2align 4
+L$cbc_dec_loop8:
+ movups %xmm9,(%rsi)
+ leaq 16(%rsi),%rsi
+L$cbc_dec_loop8_enter:
+ movdqu 96(%rdi),%xmm8
+ pxor %xmm0,%xmm2
+ movdqu 112(%rdi),%xmm9
+ pxor %xmm0,%xmm3
+ movups 16-112(%rcx),%xmm1
+ pxor %xmm0,%xmm4
+ movq $-1,%rbp
+ cmpq $0x70,%rdx
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+ pxor %xmm0,%xmm7
+ pxor %xmm0,%xmm8
+
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm9
+ movups 32-112(%rcx),%xmm0
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+ adcq $0,%rbp
+ andq $128,%rbp
+.byte 102,68,15,56,222,201
+ addq %rdi,%rbp
+ movups 48-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 64-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 80-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 96-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 112-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 128-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 144-112(%rcx),%xmm1
+ cmpl $11,%eax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 160-112(%rcx),%xmm0
+ jb L$cbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 176-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 192-112(%rcx),%xmm0
+ je L$cbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 208-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 224-112(%rcx),%xmm0
+ jmp L$cbc_dec_done
+.p2align 4
+L$cbc_dec_done:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm10
+ pxor %xmm0,%xmm11
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm12
+ pxor %xmm0,%xmm13
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ pxor %xmm0,%xmm14
+ pxor %xmm0,%xmm15
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movdqu 80(%rdi),%xmm1
+
+.byte 102,65,15,56,223,210
+ movdqu 96(%rdi),%xmm10
+ pxor %xmm0,%xmm1
+.byte 102,65,15,56,223,219
+ pxor %xmm0,%xmm10
+ movdqu 112(%rdi),%xmm0
+.byte 102,65,15,56,223,228
+ leaq 128(%rdi),%rdi
+ movdqu 0(%rbp),%xmm11
+.byte 102,65,15,56,223,237
+.byte 102,65,15,56,223,246
+ movdqu 16(%rbp),%xmm12
+ movdqu 32(%rbp),%xmm13
+.byte 102,65,15,56,223,255
+.byte 102,68,15,56,223,193
+ movdqu 48(%rbp),%xmm14
+ movdqu 64(%rbp),%xmm15
+.byte 102,69,15,56,223,202
+ movdqa %xmm0,%xmm10
+ movdqu 80(%rbp),%xmm1
+ movups -112(%rcx),%xmm0
+
+ movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
+ movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
+ movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
+ movups %xmm5,48(%rsi)
+ movdqa %xmm14,%xmm5
+ movups %xmm6,64(%rsi)
+ movdqa %xmm15,%xmm6
+ movups %xmm7,80(%rsi)
+ movdqa %xmm1,%xmm7
+ movups %xmm8,96(%rsi)
+ leaq 112(%rsi),%rsi
+
+ subq $0x80,%rdx
+ ja L$cbc_dec_loop8
+
+ movaps %xmm9,%xmm2
+ leaq -112(%rcx),%rcx
+ addq $0x70,%rdx
+ jle L$cbc_dec_clear_tail_collected
+ movups %xmm9,(%rsi)
+ leaq 16(%rsi),%rsi
+ cmpq $0x50,%rdx
+ jbe L$cbc_dec_tail
+
+ movaps %xmm11,%xmm2
+L$cbc_dec_six_or_seven:
+ cmpq $0x60,%rdx
+ ja L$cbc_dec_seven
+
+ movaps %xmm7,%xmm8
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ leaq 80(%rsi),%rsi
+ movdqa %xmm7,%xmm2
+ pxor %xmm7,%xmm7
+ jmp L$cbc_dec_tail_collected
+
+.p2align 4
+L$cbc_dec_seven:
+ movups 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
+ call _aesni_decrypt8
+ movups 80(%rdi),%xmm9
+ pxor %xmm10,%xmm2
+ movups 96(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ pxor %xmm9,%xmm8
+ movdqu %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
+ leaq 96(%rsi),%rsi
+ movdqa %xmm8,%xmm2
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ jmp L$cbc_dec_tail_collected
+
+L$cbc_dec_tail:
+ movups (%rdi),%xmm2
+ subq $0x10,%rdx
+ jbe L$cbc_dec_one
+
+ movups 16(%rdi),%xmm3
+ movaps %xmm2,%xmm11
+ subq $0x10,%rdx
+ jbe L$cbc_dec_two
+
+ movups 32(%rdi),%xmm4
+ movaps %xmm3,%xmm12
+ subq $0x10,%rdx
+ jbe L$cbc_dec_three
+
+ movups 48(%rdi),%xmm5
+ movaps %xmm4,%xmm13
+ subq $0x10,%rdx
+ jbe L$cbc_dec_four
+
+ movups 64(%rdi),%xmm6
+ movaps %xmm5,%xmm14
+ movaps %xmm6,%xmm15
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm15,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ leaq 64(%rsi),%rsi
+ movdqa %xmm6,%xmm2
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ subq $0x10,%rdx
+ jmp L$cbc_dec_tail_collected
+
+.p2align 4
+L$cbc_dec_one:
+ movaps %xmm2,%xmm11
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_dec1_8:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_dec1_8
+.byte 102,15,56,223,209
+ xorps %xmm10,%xmm2
+ movaps %xmm11,%xmm10
+ jmp L$cbc_dec_tail_collected
+.p2align 4
+L$cbc_dec_two:
+ movaps %xmm3,%xmm12
+ call _aesni_decrypt2
+ pxor %xmm10,%xmm2
+ movaps %xmm12,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ movdqa %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ leaq 16(%rsi),%rsi
+ jmp L$cbc_dec_tail_collected
+.p2align 4
+L$cbc_dec_three:
+ movaps %xmm4,%xmm13
+ call _aesni_decrypt3
+ pxor %xmm10,%xmm2
+ movaps %xmm13,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movdqa %xmm4,%xmm2
+ pxor %xmm4,%xmm4
+ leaq 32(%rsi),%rsi
+ jmp L$cbc_dec_tail_collected
+.p2align 4
+L$cbc_dec_four:
+ movaps %xmm5,%xmm14
+ call _aesni_decrypt4
+ pxor %xmm10,%xmm2
+ movaps %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ movdqa %xmm5,%xmm2
+ pxor %xmm5,%xmm5
+ leaq 48(%rsi),%rsi
+ jmp L$cbc_dec_tail_collected
+
+.p2align 4
+L$cbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+L$cbc_dec_tail_collected:
+ movups %xmm10,(%r8)
+ andq $15,%rdx
+ jnz L$cbc_dec_tail_partial
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ jmp L$cbc_dec_ret
+.p2align 4
+L$cbc_dec_tail_partial:
+ movaps %xmm2,(%rsp)
+ pxor %xmm2,%xmm2
+ movq $16,%rcx
+ movq %rsi,%rdi
+ subq %rdx,%rcx
+ leaq (%rsp),%rsi
+.long 0x9066A4F3
+ movdqa %xmm2,(%rsp)
+
+L$cbc_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movq -8(%r11),%rbp
+
+ leaq (%r11),%rsp
+
+L$cbc_ret:
+ ret
+
+
+.globl _aes_hw_set_decrypt_key
+.private_extern _aes_hw_set_decrypt_key
+
+.p2align 4
+_aes_hw_set_decrypt_key:
+
+_CET_ENDBR
+.byte 0x48,0x83,0xEC,0x08
+
+ call __aesni_set_encrypt_key
+ shll $4,%esi
+ testl %eax,%eax
+ jnz L$dec_key_ret
+ leaq 16(%rdx,%rsi,1),%rdi
+
+ movups (%rdx),%xmm0
+ movups (%rdi),%xmm1
+ movups %xmm0,(%rdi)
+ movups %xmm1,(%rdx)
+ leaq 16(%rdx),%rdx
+ leaq -16(%rdi),%rdi
+
+L$dec_key_inverse:
+ movups (%rdx),%xmm0
+ movups (%rdi),%xmm1
+.byte 102,15,56,219,192
+.byte 102,15,56,219,201
+ leaq 16(%rdx),%rdx
+ leaq -16(%rdi),%rdi
+ movups %xmm0,16(%rdi)
+ movups %xmm1,-16(%rdx)
+ cmpq %rdx,%rdi
+ ja L$dec_key_inverse
+
+ movups (%rdx),%xmm0
+.byte 102,15,56,219,192
+ pxor %xmm1,%xmm1
+ movups %xmm0,(%rdi)
+ pxor %xmm0,%xmm0
+L$dec_key_ret:
+ addq $8,%rsp
+
+ ret
+
+L$SEH_end_set_decrypt_key:
+
+.globl _aes_hw_set_encrypt_key
+.private_extern _aes_hw_set_encrypt_key
+
+.p2align 4
+_aes_hw_set_encrypt_key:
+__aesni_set_encrypt_key:
+
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+ movb $1,_BORINGSSL_function_hit+3(%rip)
+#endif
+.byte 0x48,0x83,0xEC,0x08
+
+ movq $-1,%rax
+ testq %rdi,%rdi
+ jz L$enc_key_ret
+ testq %rdx,%rdx
+ jz L$enc_key_ret
+
+ movups (%rdi),%xmm0
+ xorps %xmm4,%xmm4
+ leaq _OPENSSL_ia32cap_P(%rip),%r10
+ movl 4(%r10),%r10d
+ andl $268437504,%r10d
+ leaq 16(%rdx),%rax
+ cmpl $256,%esi
+ je L$14rounds
+ cmpl $192,%esi
+ je L$12rounds
+ cmpl $128,%esi
+ jne L$bad_keybits
+
+L$10rounds:
+ movl $9,%esi
+ cmpl $268435456,%r10d
+ je L$10rounds_alt
+
+ movups %xmm0,(%rdx)
+.byte 102,15,58,223,200,1
+ call L$key_expansion_128_cold
+.byte 102,15,58,223,200,2
+ call L$key_expansion_128
+.byte 102,15,58,223,200,4
+ call L$key_expansion_128
+.byte 102,15,58,223,200,8
+ call L$key_expansion_128
+.byte 102,15,58,223,200,16
+ call L$key_expansion_128
+.byte 102,15,58,223,200,32
+ call L$key_expansion_128
+.byte 102,15,58,223,200,64
+ call L$key_expansion_128
+.byte 102,15,58,223,200,128
+ call L$key_expansion_128
+.byte 102,15,58,223,200,27
+ call L$key_expansion_128
+.byte 102,15,58,223,200,54
+ call L$key_expansion_128
+ movups %xmm0,(%rax)
+ movl %esi,80(%rax)
+ xorl %eax,%eax
+ jmp L$enc_key_ret
+
+.p2align 4
+L$10rounds_alt:
+ movdqa L$key_rotate(%rip),%xmm5
+ movl $8,%r10d
+ movdqa L$key_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,(%rdx)
+ jmp L$oop_key128
+
+.p2align 4
+L$oop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leaq 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ decl %r10d
+ jnz L$oop_key128
+
+ movdqa L$key_rcon1b(%rip),%xmm4
+
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ movl %esi,96(%rax)
+ xorl %eax,%eax
+ jmp L$enc_key_ret
+
+.p2align 4
+L$12rounds:
+ movq 16(%rdi),%xmm2
+ movl $11,%esi
+ cmpl $268435456,%r10d
+ je L$12rounds_alt
+
+ movups %xmm0,(%rdx)
+.byte 102,15,58,223,202,1
+ call L$key_expansion_192a_cold
+.byte 102,15,58,223,202,2
+ call L$key_expansion_192b
+.byte 102,15,58,223,202,4
+ call L$key_expansion_192a
+.byte 102,15,58,223,202,8
+ call L$key_expansion_192b
+.byte 102,15,58,223,202,16
+ call L$key_expansion_192a
+.byte 102,15,58,223,202,32
+ call L$key_expansion_192b
+.byte 102,15,58,223,202,64
+ call L$key_expansion_192a
+.byte 102,15,58,223,202,128
+ call L$key_expansion_192b
+ movups %xmm0,(%rax)
+ movl %esi,48(%rax)
+ xorq %rax,%rax
+ jmp L$enc_key_ret
+
+.p2align 4
+L$12rounds_alt:
+ movdqa L$key_rotate192(%rip),%xmm5
+ movdqa L$key_rcon1(%rip),%xmm4
+ movl $8,%r10d
+ movdqu %xmm0,(%rdx)
+ jmp L$oop_key192
+
+.p2align 4
+L$oop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leaq 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd $0xff,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ decl %r10d
+ jnz L$oop_key192
+
+ movl %esi,32(%rax)
+ xorl %eax,%eax
+ jmp L$enc_key_ret
+
+.p2align 4
+L$14rounds:
+ movups 16(%rdi),%xmm2
+ movl $13,%esi
+ leaq 16(%rax),%rax
+ cmpl $268435456,%r10d
+ je L$14rounds_alt
+
+ movups %xmm0,(%rdx)
+ movups %xmm2,16(%rdx)
+.byte 102,15,58,223,202,1
+ call L$key_expansion_256a_cold
+.byte 102,15,58,223,200,1
+ call L$key_expansion_256b
+.byte 102,15,58,223,202,2
+ call L$key_expansion_256a
+.byte 102,15,58,223,200,2
+ call L$key_expansion_256b
+.byte 102,15,58,223,202,4
+ call L$key_expansion_256a
+.byte 102,15,58,223,200,4
+ call L$key_expansion_256b
+.byte 102,15,58,223,202,8
+ call L$key_expansion_256a
+.byte 102,15,58,223,200,8
+ call L$key_expansion_256b
+.byte 102,15,58,223,202,16
+ call L$key_expansion_256a
+.byte 102,15,58,223,200,16
+ call L$key_expansion_256b
+.byte 102,15,58,223,202,32
+ call L$key_expansion_256a
+.byte 102,15,58,223,200,32
+ call L$key_expansion_256b
+.byte 102,15,58,223,202,64
+ call L$key_expansion_256a
+ movups %xmm0,(%rax)
+ movl %esi,16(%rax)
+ xorq %rax,%rax
+ jmp L$enc_key_ret
+
+.p2align 4
+L$14rounds_alt:
+ movdqa L$key_rotate(%rip),%xmm5
+ movdqa L$key_rcon1(%rip),%xmm4
+ movl $7,%r10d
+ movdqu %xmm0,0(%rdx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16(%rdx)
+ jmp L$oop_key256
+
+.p2align 4
+L$oop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ decl %r10d
+ jz L$done_key256
+
+ pshufd $0xff,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ leaq 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp L$oop_key256
+
+L$done_key256:
+ movl %esi,16(%rax)
+ xorl %eax,%eax
+ jmp L$enc_key_ret
+
+.p2align 4
+L$bad_keybits:
+ movq $-2,%rax
+L$enc_key_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ addq $8,%rsp
+
+ ret
+
+L$SEH_end_set_encrypt_key:
+
+.p2align 4
+L$key_expansion_128:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+L$key_expansion_128_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ ret
+
+.p2align 4
+L$key_expansion_192a:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+L$key_expansion_192a_cold:
+ movaps %xmm2,%xmm5
+L$key_expansion_192b_warm:
+ shufps $16,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ pslldq $4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd $85,%xmm1,%xmm1
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ ret
+
+.p2align 4
+L$key_expansion_192b:
+ movaps %xmm0,%xmm3
+ shufps $68,%xmm0,%xmm5
+ movups %xmm5,(%rax)
+ shufps $78,%xmm2,%xmm3
+ movups %xmm3,16(%rax)
+ leaq 32(%rax),%rax
+ jmp L$key_expansion_192b_warm
+
+.p2align 4
+L$key_expansion_256a:
+ movups %xmm2,(%rax)
+ leaq 16(%rax),%rax
+L$key_expansion_256a_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ ret
+
+.p2align 4
+L$key_expansion_256b:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+
+ shufps $16,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $140,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $170,%xmm1,%xmm1
+ xorps %xmm1,%xmm2
+ ret
+
+
+.section __DATA,__const
+.p2align 6
+L$bswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+L$increment32:
+.long 6,6,6,0
+L$increment64:
+.long 1,0,0,0
+L$xts_magic:
+.long 0x87,0,1,0
+L$increment1:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+L$key_rotate:
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+L$key_rotate192:
+.long 0x04070605,0x04070605,0x04070605,0x04070605
+L$key_rcon1:
+.long 1,1,1,1
+L$key_rcon1b:
+.long 0x1b,0x1b,0x1b,0x1b
+
+.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align 6
+.text
+#endif
diff --git a/gen/bcm/aesni-x86_64-linux.S b/gen/bcm/aesni-x86_64-linux.S
new file mode 100644
index 0000000..68742fb
--- /dev/null
+++ b/gen/bcm/aesni-x86_64-linux.S
@@ -0,0 +1,2361 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text
+.extern OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+.globl aes_hw_encrypt
+.hidden aes_hw_encrypt
+.type aes_hw_encrypt,@function
+.align 16
+aes_hw_encrypt:
+.cfi_startproc
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+ movb $1,BORINGSSL_function_hit+1(%rip)
+#endif
+ movups (%rdi),%xmm2
+ movl 240(%rdx),%eax
+ movups (%rdx),%xmm0
+ movups 16(%rdx),%xmm1
+ leaq 32(%rdx),%rdx
+ xorps %xmm0,%xmm2
+.Loop_enc1_1:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rdx),%xmm1
+ leaq 16(%rdx),%rdx
+ jnz .Loop_enc1_1
+.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ ret
+.cfi_endproc
+.size aes_hw_encrypt,.-aes_hw_encrypt
+
+.globl aes_hw_decrypt
+.hidden aes_hw_decrypt
+.type aes_hw_decrypt,@function
+.align 16
+aes_hw_decrypt:
+.cfi_startproc
+_CET_ENDBR
+ movups (%rdi),%xmm2
+ movl 240(%rdx),%eax
+ movups (%rdx),%xmm0
+ movups 16(%rdx),%xmm1
+ leaq 32(%rdx),%rdx
+ xorps %xmm0,%xmm2
+.Loop_dec1_2:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rdx),%xmm1
+ leaq 16(%rdx),%rdx
+ jnz .Loop_dec1_2
+.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ ret
+.cfi_endproc
+.size aes_hw_decrypt, .-aes_hw_decrypt
+.type _aesni_encrypt2,@function
+.align 16
+_aesni_encrypt2:
+.cfi_startproc
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+.Lenc_loop2:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Lenc_loop2
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ ret
+.cfi_endproc
+.size _aesni_encrypt2,.-_aesni_encrypt2
+.type _aesni_decrypt2,@function
+.align 16
+_aesni_decrypt2:
+.cfi_startproc
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+.Ldec_loop2:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Ldec_loop2
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+ ret
+.cfi_endproc
+.size _aesni_decrypt2,.-_aesni_decrypt2
+.type _aesni_encrypt3,@function
+.align 16
+_aesni_encrypt3:
+.cfi_startproc
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+.Lenc_loop3:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Lenc_loop3
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+ ret
+.cfi_endproc
+.size _aesni_encrypt3,.-_aesni_encrypt3
+.type _aesni_decrypt3,@function
+.align 16
+_aesni_decrypt3:
+.cfi_startproc
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+.Ldec_loop3:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Ldec_loop3
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+ ret
+.cfi_endproc
+.size _aesni_decrypt3,.-_aesni_decrypt3
+.type _aesni_encrypt4,@function
+.align 16
+_aesni_encrypt4:
+.cfi_startproc
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ xorps %xmm0,%xmm5
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
+
+.Lenc_loop4:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Lenc_loop4
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+ ret
+.cfi_endproc
+.size _aesni_encrypt4,.-_aesni_encrypt4
+.type _aesni_decrypt4,@function
+.align 16
+_aesni_decrypt4:
+.cfi_startproc
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ xorps %xmm0,%xmm5
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
+
+.Ldec_loop4:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Ldec_loop4
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+ ret
+.cfi_endproc
+.size _aesni_decrypt4,.-_aesni_decrypt4
+.type _aesni_encrypt6,@function
+.align 16
+_aesni_encrypt6:
+.cfi_startproc
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm7
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp .Lenc_loop6_enter
+.align 16
+.Lenc_loop6:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.Lenc_loop6_enter:
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Lenc_loop6
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+ ret
+.cfi_endproc
+.size _aesni_encrypt6,.-_aesni_encrypt6
+.type _aesni_decrypt6,@function
+.align 16
+_aesni_decrypt6:
+.cfi_startproc
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+.byte 102,15,56,222,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm7
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp .Ldec_loop6_enter
+.align 16
+.Ldec_loop6:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.Ldec_loop6_enter:
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Ldec_loop6
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+ ret
+.cfi_endproc
+.size _aesni_decrypt6,.-_aesni_decrypt6
+.type _aesni_encrypt8,@function
+.align 16
+_aesni_encrypt8:
+.cfi_startproc
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm7
+ pxor %xmm0,%xmm8
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm9
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp .Lenc_loop8_inner
+.align 16
+.Lenc_loop8:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.Lenc_loop8_inner:
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+.Lenc_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Lenc_loop8
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+.byte 102,68,15,56,221,192
+.byte 102,68,15,56,221,200
+ ret
+.cfi_endproc
+.size _aesni_encrypt8,.-_aesni_encrypt8
+.type _aesni_decrypt8,@function
+.align 16
+_aesni_decrypt8:
+.cfi_startproc
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm7
+ pxor %xmm0,%xmm8
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm9
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp .Ldec_loop8_inner
+.align 16
+.Ldec_loop8:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.Ldec_loop8_inner:
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+.Ldec_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Ldec_loop8
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+.byte 102,68,15,56,223,192
+.byte 102,68,15,56,223,200
+ ret
+.cfi_endproc
+.size _aesni_decrypt8,.-_aesni_decrypt8
+.globl aes_hw_ecb_encrypt
+.hidden aes_hw_ecb_encrypt
+.type aes_hw_ecb_encrypt,@function
+.align 16
+aes_hw_ecb_encrypt:
+.cfi_startproc
+_CET_ENDBR
+ andq $-16,%rdx
+ jz .Lecb_ret
+
+ movl 240(%rcx),%eax
+ movups (%rcx),%xmm0
+ movq %rcx,%r11
+ movl %eax,%r10d
+ testl %r8d,%r8d
+ jz .Lecb_decrypt
+
+ cmpq $0x80,%rdx
+ jb .Lecb_enc_tail
+
+ movdqu (%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ movdqu 48(%rdi),%xmm5
+ movdqu 64(%rdi),%xmm6
+ movdqu 80(%rdi),%xmm7
+ movdqu 96(%rdi),%xmm8
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+ subq $0x80,%rdx
+ jmp .Lecb_enc_loop8_enter
+.align 16
+.Lecb_enc_loop8:
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movdqu (%rdi),%xmm2
+ movl %r10d,%eax
+ movups %xmm3,16(%rsi)
+ movdqu 16(%rdi),%xmm3
+ movups %xmm4,32(%rsi)
+ movdqu 32(%rdi),%xmm4
+ movups %xmm5,48(%rsi)
+ movdqu 48(%rdi),%xmm5
+ movups %xmm6,64(%rsi)
+ movdqu 64(%rdi),%xmm6
+ movups %xmm7,80(%rsi)
+ movdqu 80(%rdi),%xmm7
+ movups %xmm8,96(%rsi)
+ movdqu 96(%rdi),%xmm8
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+.Lecb_enc_loop8_enter:
+
+ call _aesni_encrypt8
+
+ subq $0x80,%rdx
+ jnc .Lecb_enc_loop8
+
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movups %xmm3,16(%rsi)
+ movl %r10d,%eax
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ addq $0x80,%rdx
+ jz .Lecb_ret
+
+.Lecb_enc_tail:
+ movups (%rdi),%xmm2
+ cmpq $0x20,%rdx
+ jb .Lecb_enc_one
+ movups 16(%rdi),%xmm3
+ je .Lecb_enc_two
+ movups 32(%rdi),%xmm4
+ cmpq $0x40,%rdx
+ jb .Lecb_enc_three
+ movups 48(%rdi),%xmm5
+ je .Lecb_enc_four
+ movups 64(%rdi),%xmm6
+ cmpq $0x60,%rdx
+ jb .Lecb_enc_five
+ movups 80(%rdi),%xmm7
+ je .Lecb_enc_six
+ movdqu 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
+ call _aesni_encrypt8
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_3:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_3
+.byte 102,15,56,221,209
+ movups %xmm2,(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_two:
+ call _aesni_encrypt2
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_three:
+ call _aesni_encrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_four:
+ call _aesni_encrypt4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_five:
+ xorps %xmm7,%xmm7
+ call _aesni_encrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_six:
+ call _aesni_encrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ jmp .Lecb_ret
+
+.align 16
+.Lecb_decrypt:
+ cmpq $0x80,%rdx
+ jb .Lecb_dec_tail
+
+ movdqu (%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ movdqu 48(%rdi),%xmm5
+ movdqu 64(%rdi),%xmm6
+ movdqu 80(%rdi),%xmm7
+ movdqu 96(%rdi),%xmm8
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+ subq $0x80,%rdx
+ jmp .Lecb_dec_loop8_enter
+.align 16
+.Lecb_dec_loop8:
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movdqu (%rdi),%xmm2
+ movl %r10d,%eax
+ movups %xmm3,16(%rsi)
+ movdqu 16(%rdi),%xmm3
+ movups %xmm4,32(%rsi)
+ movdqu 32(%rdi),%xmm4
+ movups %xmm5,48(%rsi)
+ movdqu 48(%rdi),%xmm5
+ movups %xmm6,64(%rsi)
+ movdqu 64(%rdi),%xmm6
+ movups %xmm7,80(%rsi)
+ movdqu 80(%rdi),%xmm7
+ movups %xmm8,96(%rsi)
+ movdqu 96(%rdi),%xmm8
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+.Lecb_dec_loop8_enter:
+
+ call _aesni_decrypt8
+
+ movups (%r11),%xmm0
+ subq $0x80,%rdx
+ jnc .Lecb_dec_loop8
+
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movq %r11,%rcx
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movl %r10d,%eax
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
+ movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
+ movups %xmm9,112(%rsi)
+ pxor %xmm9,%xmm9
+ leaq 128(%rsi),%rsi
+ addq $0x80,%rdx
+ jz .Lecb_ret
+
+.Lecb_dec_tail:
+ movups (%rdi),%xmm2
+ cmpq $0x20,%rdx
+ jb .Lecb_dec_one
+ movups 16(%rdi),%xmm3
+ je .Lecb_dec_two
+ movups 32(%rdi),%xmm4
+ cmpq $0x40,%rdx
+ jb .Lecb_dec_three
+ movups 48(%rdi),%xmm5
+ je .Lecb_dec_four
+ movups 64(%rdi),%xmm6
+ cmpq $0x60,%rdx
+ jb .Lecb_dec_five
+ movups 80(%rdi),%xmm7
+ je .Lecb_dec_six
+ movups 96(%rdi),%xmm8
+ movups (%rcx),%xmm0
+ xorps %xmm9,%xmm9
+ call _aesni_decrypt8
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
+ movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_4:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_4
+.byte 102,15,56,223,209
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_two:
+ call _aesni_decrypt2
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_three:
+ call _aesni_decrypt3
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_four:
+ call _aesni_decrypt4
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_five:
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_six:
+ call _aesni_decrypt6
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
+
+.Lecb_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ ret
+.cfi_endproc
+.size aes_hw_ecb_encrypt,.-aes_hw_ecb_encrypt
+.globl aes_hw_ctr32_encrypt_blocks
+.hidden aes_hw_ctr32_encrypt_blocks
+.type aes_hw_ctr32_encrypt_blocks,@function
+.align 16
+aes_hw_ctr32_encrypt_blocks:
+.cfi_startproc
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+ movb $1,BORINGSSL_function_hit(%rip)
+#endif
+ cmpq $1,%rdx
+ jne .Lctr32_bulk
+
+
+
+ movups (%r8),%xmm2
+ movups (%rdi),%xmm3
+ movl 240(%rcx),%edx
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_5:
+.byte 102,15,56,220,209
+ decl %edx
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_5
+.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm2,%xmm2
+ jmp .Lctr32_epilogue
+
+.align 16
+.Lctr32_bulk:
+ leaq (%rsp),%r11
+.cfi_def_cfa_register %r11
+ pushq %rbp
+.cfi_offset %rbp,-16
+ subq $128,%rsp
+ andq $-16,%rsp
+
+
+
+
+ movdqu (%r8),%xmm2
+ movdqu (%rcx),%xmm0
+ movl 12(%r8),%r8d
+ pxor %xmm0,%xmm2
+ movl 12(%rcx),%ebp
+ movdqa %xmm2,0(%rsp)
+ bswapl %r8d
+ movdqa %xmm2,%xmm3
+ movdqa %xmm2,%xmm4
+ movdqa %xmm2,%xmm5
+ movdqa %xmm2,64(%rsp)
+ movdqa %xmm2,80(%rsp)
+ movdqa %xmm2,96(%rsp)
+ movq %rdx,%r10
+ movdqa %xmm2,112(%rsp)
+
+ leaq 1(%r8),%rax
+ leaq 2(%r8),%rdx
+ bswapl %eax
+ bswapl %edx
+ xorl %ebp,%eax
+ xorl %ebp,%edx
+.byte 102,15,58,34,216,3
+ leaq 3(%r8),%rax
+ movdqa %xmm3,16(%rsp)
+.byte 102,15,58,34,226,3
+ bswapl %eax
+ movq %r10,%rdx
+ leaq 4(%r8),%r10
+ movdqa %xmm4,32(%rsp)
+ xorl %ebp,%eax
+ bswapl %r10d
+.byte 102,15,58,34,232,3
+ xorl %ebp,%r10d
+ movdqa %xmm5,48(%rsp)
+ leaq 5(%r8),%r9
+ movl %r10d,64+12(%rsp)
+ bswapl %r9d
+ leaq 6(%r8),%r10
+ movl 240(%rcx),%eax
+ xorl %ebp,%r9d
+ bswapl %r10d
+ movl %r9d,80+12(%rsp)
+ xorl %ebp,%r10d
+ leaq 7(%r8),%r9
+ movl %r10d,96+12(%rsp)
+ bswapl %r9d
+ xorl %ebp,%r9d
+ movl %r9d,112+12(%rsp)
+
+ movups 16(%rcx),%xmm1
+
+ movdqa 64(%rsp),%xmm6
+ movdqa 80(%rsp),%xmm7
+
+ cmpq $8,%rdx
+ jb .Lctr32_tail
+
+ leaq 128(%rcx),%rcx
+ subq $8,%rdx
+ jmp .Lctr32_loop8
+
+.align 32
+.Lctr32_loop8:
+ addl $8,%r8d
+ movdqa 96(%rsp),%xmm8
+.byte 102,15,56,220,209
+ movl %r8d,%r9d
+ movdqa 112(%rsp),%xmm9
+.byte 102,15,56,220,217
+ bswapl %r9d
+ movups 32-128(%rcx),%xmm0
+.byte 102,15,56,220,225
+ xorl %ebp,%r9d
+ nop
+.byte 102,15,56,220,233
+ movl %r9d,0+12(%rsp)
+ leaq 1(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 48-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ xorl %ebp,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movl %r9d,16+12(%rsp)
+ leaq 2(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 64-128(%rcx),%xmm0
+ bswapl %r9d
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ xorl %ebp,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movl %r9d,32+12(%rsp)
+ leaq 3(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 80-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ xorl %ebp,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movl %r9d,48+12(%rsp)
+ leaq 4(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 96-128(%rcx),%xmm0
+ bswapl %r9d
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ xorl %ebp,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movl %r9d,64+12(%rsp)
+ leaq 5(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 112-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ xorl %ebp,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movl %r9d,80+12(%rsp)
+ leaq 6(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 128-128(%rcx),%xmm0
+ bswapl %r9d
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ xorl %ebp,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movl %r9d,96+12(%rsp)
+ leaq 7(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 144-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+ xorl %ebp,%r9d
+ movdqu 0(%rdi),%xmm10
+.byte 102,15,56,220,232
+ movl %r9d,112+12(%rsp)
+ cmpl $11,%eax
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 160-128(%rcx),%xmm0
+
+ jb .Lctr32_enc_done
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 176-128(%rcx),%xmm1
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 192-128(%rcx),%xmm0
+ je .Lctr32_enc_done
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 208-128(%rcx),%xmm1
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 224-128(%rcx),%xmm0
+ jmp .Lctr32_enc_done
+
+.align 16
+.Lctr32_enc_done:
+ movdqu 16(%rdi),%xmm11
+ pxor %xmm0,%xmm10
+ movdqu 32(%rdi),%xmm12
+ pxor %xmm0,%xmm11
+ movdqu 48(%rdi),%xmm13
+ pxor %xmm0,%xmm12
+ movdqu 64(%rdi),%xmm14
+ pxor %xmm0,%xmm13
+ movdqu 80(%rdi),%xmm15
+ pxor %xmm0,%xmm14
+ prefetcht0 448(%rdi)
+ prefetcht0 512(%rdi)
+ pxor %xmm0,%xmm15
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movdqu 96(%rdi),%xmm1
+ leaq 128(%rdi),%rdi
+
+.byte 102,65,15,56,221,210
+ pxor %xmm0,%xmm1
+ movdqu 112-128(%rdi),%xmm10
+.byte 102,65,15,56,221,219
+ pxor %xmm0,%xmm10
+ movdqa 0(%rsp),%xmm11
+.byte 102,65,15,56,221,228
+.byte 102,65,15,56,221,237
+ movdqa 16(%rsp),%xmm12
+ movdqa 32(%rsp),%xmm13
+.byte 102,65,15,56,221,246
+.byte 102,65,15,56,221,255
+ movdqa 48(%rsp),%xmm14
+ movdqa 64(%rsp),%xmm15
+.byte 102,68,15,56,221,193
+ movdqa 80(%rsp),%xmm0
+ movups 16-128(%rcx),%xmm1
+.byte 102,69,15,56,221,202
+
+ movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
+ movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
+ movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
+ movups %xmm5,48(%rsi)
+ movdqa %xmm14,%xmm5
+ movups %xmm6,64(%rsi)
+ movdqa %xmm15,%xmm6
+ movups %xmm7,80(%rsi)
+ movdqa %xmm0,%xmm7
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+
+ subq $8,%rdx
+ jnc .Lctr32_loop8
+
+ addq $8,%rdx
+ jz .Lctr32_done
+ leaq -128(%rcx),%rcx
+
+.Lctr32_tail:
+
+
+ leaq 16(%rcx),%rcx
+ cmpq $4,%rdx
+ jb .Lctr32_loop3
+ je .Lctr32_loop4
+
+
+ shll $4,%eax
+ movdqa 96(%rsp),%xmm8
+ pxor %xmm9,%xmm9
+
+ movups 16(%rcx),%xmm0
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ leaq 32-16(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,225
+ addq $16,%rax
+ movups (%rdi),%xmm10
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+ movups 16(%rdi),%xmm11
+ movups 32(%rdi),%xmm12
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+
+ call .Lenc_loop8_enter
+
+ movdqu 48(%rdi),%xmm13
+ pxor %xmm10,%xmm2
+ movdqu 64(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm10,%xmm6
+ movdqu %xmm5,48(%rsi)
+ movdqu %xmm6,64(%rsi)
+ cmpq $6,%rdx
+ jb .Lctr32_done
+
+ movups 80(%rdi),%xmm11
+ xorps %xmm11,%xmm7
+ movups %xmm7,80(%rsi)
+ je .Lctr32_done
+
+ movups 96(%rdi),%xmm12
+ xorps %xmm12,%xmm8
+ movups %xmm8,96(%rsi)
+ jmp .Lctr32_done
+
+.align 32
+.Lctr32_loop4:
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+ decl %eax
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups (%rcx),%xmm1
+ jnz .Lctr32_loop4
+.byte 102,15,56,221,209
+.byte 102,15,56,221,217
+ movups (%rdi),%xmm10
+ movups 16(%rdi),%xmm11
+.byte 102,15,56,221,225
+.byte 102,15,56,221,233
+ movups 32(%rdi),%xmm12
+ movups 48(%rdi),%xmm13
+
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ xorps %xmm11,%xmm3
+ movups %xmm3,16(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm5,48(%rsi)
+ jmp .Lctr32_done
+
+.align 32
+.Lctr32_loop3:
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+ decl %eax
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+ movups (%rcx),%xmm1
+ jnz .Lctr32_loop3
+.byte 102,15,56,221,209
+.byte 102,15,56,221,217
+.byte 102,15,56,221,225
+
+ movups (%rdi),%xmm10
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ cmpq $2,%rdx
+ jb .Lctr32_done
+
+ movups 16(%rdi),%xmm11
+ xorps %xmm11,%xmm3
+ movups %xmm3,16(%rsi)
+ je .Lctr32_done
+
+ movups 32(%rdi),%xmm12
+ xorps %xmm12,%xmm4
+ movups %xmm4,32(%rsi)
+
+.Lctr32_done:
+ xorps %xmm0,%xmm0
+ xorl %ebp,%ebp
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ movaps %xmm0,112(%rsp)
+ pxor %xmm15,%xmm15
+ movq -8(%r11),%rbp
+.cfi_restore %rbp
+ leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
+.Lctr32_epilogue:
+ ret
+.cfi_endproc
+.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
+.globl aes_hw_cbc_encrypt
+.hidden aes_hw_cbc_encrypt
+.type aes_hw_cbc_encrypt,@function
+.align 16
+aes_hw_cbc_encrypt:
+.cfi_startproc
+_CET_ENDBR
+ testq %rdx,%rdx
+ jz .Lcbc_ret
+
+ movl 240(%rcx),%r10d
+ movq %rcx,%r11
+ testl %r9d,%r9d
+ jz .Lcbc_decrypt
+
+ movups (%r8),%xmm2
+ movl %r10d,%eax
+ cmpq $16,%rdx
+ jb .Lcbc_enc_tail
+ subq $16,%rdx
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movups (%rdi),%xmm3
+ leaq 16(%rdi),%rdi
+
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm3
+ leaq 32(%rcx),%rcx
+ xorps %xmm3,%xmm2
+.Loop_enc1_6:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_6
+.byte 102,15,56,221,209
+ movl %r10d,%eax
+ movq %r11,%rcx
+ movups %xmm2,0(%rsi)
+ leaq 16(%rsi),%rsi
+ subq $16,%rdx
+ jnc .Lcbc_enc_loop
+ addq $16,%rdx
+ jnz .Lcbc_enc_tail
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movups %xmm2,(%r8)
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ jmp .Lcbc_ret
+
+.Lcbc_enc_tail:
+ movq %rdx,%rcx
+ xchgq %rdi,%rsi
+.long 0x9066A4F3
+ movl $16,%ecx
+ subq %rdx,%rcx
+ xorl %eax,%eax
+.long 0x9066AAF3
+ leaq -16(%rdi),%rdi
+ movl %r10d,%eax
+ movq %rdi,%rsi
+ movq %r11,%rcx
+ xorq %rdx,%rdx
+ jmp .Lcbc_enc_loop
+
+.align 16
+.Lcbc_decrypt:
+ cmpq $16,%rdx
+ jne .Lcbc_decrypt_bulk
+
+
+
+ movdqu (%rdi),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa %xmm2,%xmm4
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_7:
+.byte 102,15,56,222,209
+ decl %r10d
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_7
+.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movdqu %xmm4,(%r8)
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ jmp .Lcbc_ret
+.align 16
+.Lcbc_decrypt_bulk:
+ leaq (%rsp),%r11
+.cfi_def_cfa_register %r11
+ pushq %rbp
+.cfi_offset %rbp,-16
+ subq $16,%rsp
+ andq $-16,%rsp
+ movq %rcx,%rbp
+ movups (%r8),%xmm10
+ movl %r10d,%eax
+ cmpq $0x50,%rdx
+ jbe .Lcbc_dec_tail
+
+ movups (%rcx),%xmm0
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+ cmpq $0x70,%rdx
+ jbe .Lcbc_dec_six_or_seven
+
+ subq $0x70,%rdx
+ leaq 112(%rcx),%rcx
+ jmp .Lcbc_dec_loop8_enter
+.align 16
+.Lcbc_dec_loop8:
+ movups %xmm9,(%rsi)
+ leaq 16(%rsi),%rsi
+.Lcbc_dec_loop8_enter:
+ movdqu 96(%rdi),%xmm8
+ pxor %xmm0,%xmm2
+ movdqu 112(%rdi),%xmm9
+ pxor %xmm0,%xmm3
+ movups 16-112(%rcx),%xmm1
+ pxor %xmm0,%xmm4
+ movq $-1,%rbp
+ cmpq $0x70,%rdx
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+ pxor %xmm0,%xmm7
+ pxor %xmm0,%xmm8
+
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm9
+ movups 32-112(%rcx),%xmm0
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+ adcq $0,%rbp
+ andq $128,%rbp
+.byte 102,68,15,56,222,201
+ addq %rdi,%rbp
+ movups 48-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 64-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 80-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 96-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 112-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 128-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 144-112(%rcx),%xmm1
+ cmpl $11,%eax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 160-112(%rcx),%xmm0
+ jb .Lcbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 176-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 192-112(%rcx),%xmm0
+ je .Lcbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 208-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 224-112(%rcx),%xmm0
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_done:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm10
+ pxor %xmm0,%xmm11
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm12
+ pxor %xmm0,%xmm13
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ pxor %xmm0,%xmm14
+ pxor %xmm0,%xmm15
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movdqu 80(%rdi),%xmm1
+
+.byte 102,65,15,56,223,210
+ movdqu 96(%rdi),%xmm10
+ pxor %xmm0,%xmm1
+.byte 102,65,15,56,223,219
+ pxor %xmm0,%xmm10
+ movdqu 112(%rdi),%xmm0
+.byte 102,65,15,56,223,228
+ leaq 128(%rdi),%rdi
+ movdqu 0(%rbp),%xmm11
+.byte 102,65,15,56,223,237
+.byte 102,65,15,56,223,246
+ movdqu 16(%rbp),%xmm12
+ movdqu 32(%rbp),%xmm13
+.byte 102,65,15,56,223,255
+.byte 102,68,15,56,223,193
+ movdqu 48(%rbp),%xmm14
+ movdqu 64(%rbp),%xmm15
+.byte 102,69,15,56,223,202
+ movdqa %xmm0,%xmm10
+ movdqu 80(%rbp),%xmm1
+ movups -112(%rcx),%xmm0
+
+ movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
+ movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
+ movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
+ movups %xmm5,48(%rsi)
+ movdqa %xmm14,%xmm5
+ movups %xmm6,64(%rsi)
+ movdqa %xmm15,%xmm6
+ movups %xmm7,80(%rsi)
+ movdqa %xmm1,%xmm7
+ movups %xmm8,96(%rsi)
+ leaq 112(%rsi),%rsi
+
+ subq $0x80,%rdx
+ ja .Lcbc_dec_loop8
+
+ movaps %xmm9,%xmm2
+ leaq -112(%rcx),%rcx
+ addq $0x70,%rdx
+ jle .Lcbc_dec_clear_tail_collected
+ movups %xmm9,(%rsi)
+ leaq 16(%rsi),%rsi
+ cmpq $0x50,%rdx
+ jbe .Lcbc_dec_tail
+
+ movaps %xmm11,%xmm2
+.Lcbc_dec_six_or_seven:
+ cmpq $0x60,%rdx
+ ja .Lcbc_dec_seven
+
+ movaps %xmm7,%xmm8
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ leaq 80(%rsi),%rsi
+ movdqa %xmm7,%xmm2
+ pxor %xmm7,%xmm7
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_seven:
+ movups 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
+ call _aesni_decrypt8
+ movups 80(%rdi),%xmm9
+ pxor %xmm10,%xmm2
+ movups 96(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ pxor %xmm9,%xmm8
+ movdqu %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
+ leaq 96(%rsi),%rsi
+ movdqa %xmm8,%xmm2
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ jmp .Lcbc_dec_tail_collected
+
+.Lcbc_dec_tail:
+ movups (%rdi),%xmm2
+ subq $0x10,%rdx
+ jbe .Lcbc_dec_one
+
+ movups 16(%rdi),%xmm3
+ movaps %xmm2,%xmm11
+ subq $0x10,%rdx
+ jbe .Lcbc_dec_two
+
+ movups 32(%rdi),%xmm4
+ movaps %xmm3,%xmm12
+ subq $0x10,%rdx
+ jbe .Lcbc_dec_three
+
+ movups 48(%rdi),%xmm5
+ movaps %xmm4,%xmm13
+ subq $0x10,%rdx
+ jbe .Lcbc_dec_four
+
+ movups 64(%rdi),%xmm6
+ movaps %xmm5,%xmm14
+ movaps %xmm6,%xmm15
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm15,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ leaq 64(%rsi),%rsi
+ movdqa %xmm6,%xmm2
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ subq $0x10,%rdx
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_one:
+ movaps %xmm2,%xmm11
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_8:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_8
+.byte 102,15,56,223,209
+ xorps %xmm10,%xmm2
+ movaps %xmm11,%xmm10
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_two:
+ movaps %xmm3,%xmm12
+ call _aesni_decrypt2
+ pxor %xmm10,%xmm2
+ movaps %xmm12,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ movdqa %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ leaq 16(%rsi),%rsi
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_three:
+ movaps %xmm4,%xmm13
+ call _aesni_decrypt3
+ pxor %xmm10,%xmm2
+ movaps %xmm13,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movdqa %xmm4,%xmm2
+ pxor %xmm4,%xmm4
+ leaq 32(%rsi),%rsi
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_four:
+ movaps %xmm5,%xmm14
+ call _aesni_decrypt4
+ pxor %xmm10,%xmm2
+ movaps %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ movdqa %xmm5,%xmm2
+ pxor %xmm5,%xmm5
+ leaq 48(%rsi),%rsi
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+.Lcbc_dec_tail_collected:
+ movups %xmm10,(%r8)
+ andq $15,%rdx
+ jnz .Lcbc_dec_tail_partial
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ jmp .Lcbc_dec_ret
+.align 16
+.Lcbc_dec_tail_partial:
+ movaps %xmm2,(%rsp)
+ pxor %xmm2,%xmm2
+ movq $16,%rcx
+ movq %rsi,%rdi
+ subq %rdx,%rcx
+ leaq (%rsp),%rsi
+.long 0x9066A4F3
+ movdqa %xmm2,(%rsp)
+
+.Lcbc_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movq -8(%r11),%rbp
+.cfi_restore %rbp
+ leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
+.Lcbc_ret:
+ ret
+.cfi_endproc
+.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
+.globl aes_hw_set_decrypt_key
+.hidden aes_hw_set_decrypt_key
+.type aes_hw_set_decrypt_key,@function
+.align 16
+aes_hw_set_decrypt_key:
+.cfi_startproc
+_CET_ENDBR
+.byte 0x48,0x83,0xEC,0x08
+.cfi_adjust_cfa_offset 8
+ call __aesni_set_encrypt_key
+ shll $4,%esi
+ testl %eax,%eax
+ jnz .Ldec_key_ret
+ leaq 16(%rdx,%rsi,1),%rdi
+
+ movups (%rdx),%xmm0
+ movups (%rdi),%xmm1
+ movups %xmm0,(%rdi)
+ movups %xmm1,(%rdx)
+ leaq 16(%rdx),%rdx
+ leaq -16(%rdi),%rdi
+
+.Ldec_key_inverse:
+ movups (%rdx),%xmm0
+ movups (%rdi),%xmm1
+.byte 102,15,56,219,192
+.byte 102,15,56,219,201
+ leaq 16(%rdx),%rdx
+ leaq -16(%rdi),%rdi
+ movups %xmm0,16(%rdi)
+ movups %xmm1,-16(%rdx)
+ cmpq %rdx,%rdi
+ ja .Ldec_key_inverse
+
+ movups (%rdx),%xmm0
+.byte 102,15,56,219,192
+ pxor %xmm1,%xmm1
+ movups %xmm0,(%rdi)
+ pxor %xmm0,%xmm0
+.Ldec_key_ret:
+ addq $8,%rsp
+.cfi_adjust_cfa_offset -8
+ ret
+.cfi_endproc
+.LSEH_end_set_decrypt_key:
+.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
+.globl aes_hw_set_encrypt_key
+.hidden aes_hw_set_encrypt_key
+.type aes_hw_set_encrypt_key,@function
+.align 16
+aes_hw_set_encrypt_key:
+__aesni_set_encrypt_key:
+.cfi_startproc
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+ movb $1,BORINGSSL_function_hit+3(%rip)
+#endif
+.byte 0x48,0x83,0xEC,0x08
+.cfi_adjust_cfa_offset 8
+ movq $-1,%rax
+ testq %rdi,%rdi
+ jz .Lenc_key_ret
+ testq %rdx,%rdx
+ jz .Lenc_key_ret
+
+ movups (%rdi),%xmm0
+ xorps %xmm4,%xmm4
+ leaq OPENSSL_ia32cap_P(%rip),%r10
+ movl 4(%r10),%r10d
+ andl $268437504,%r10d
+ leaq 16(%rdx),%rax
+ cmpl $256,%esi
+ je .L14rounds
+ cmpl $192,%esi
+ je .L12rounds
+ cmpl $128,%esi
+ jne .Lbad_keybits
+
+.L10rounds:
+ movl $9,%esi
+ cmpl $268435456,%r10d
+ je .L10rounds_alt
+
+ movups %xmm0,(%rdx)
+.byte 102,15,58,223,200,1
+ call .Lkey_expansion_128_cold
+.byte 102,15,58,223,200,2
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,4
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,8
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,16
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,32
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,64
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,128
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,27
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,54
+ call .Lkey_expansion_128
+ movups %xmm0,(%rax)
+ movl %esi,80(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.L10rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movl $8,%r10d
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,(%rdx)
+ jmp .Loop_key128
+
+.align 16
+.Loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leaq 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ decl %r10d
+ jnz .Loop_key128
+
+ movdqa .Lkey_rcon1b(%rip),%xmm4
+
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ movl %esi,96(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.L12rounds:
+ movq 16(%rdi),%xmm2
+ movl $11,%esi
+ cmpl $268435456,%r10d
+ je .L12rounds_alt
+
+ movups %xmm0,(%rdx)
+.byte 102,15,58,223,202,1
+ call .Lkey_expansion_192a_cold
+.byte 102,15,58,223,202,2
+ call .Lkey_expansion_192b
+.byte 102,15,58,223,202,4
+ call .Lkey_expansion_192a
+.byte 102,15,58,223,202,8
+ call .Lkey_expansion_192b
+.byte 102,15,58,223,202,16
+ call .Lkey_expansion_192a
+.byte 102,15,58,223,202,32
+ call .Lkey_expansion_192b
+.byte 102,15,58,223,202,64
+ call .Lkey_expansion_192a
+.byte 102,15,58,223,202,128
+ call .Lkey_expansion_192b
+ movups %xmm0,(%rax)
+ movl %esi,48(%rax)
+ xorq %rax,%rax
+ jmp .Lenc_key_ret
+
+.align 16
+.L12rounds_alt:
+ movdqa .Lkey_rotate192(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movl $8,%r10d
+ movdqu %xmm0,(%rdx)
+ jmp .Loop_key192
+
+.align 16
+.Loop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leaq 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd $0xff,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ decl %r10d
+ jnz .Loop_key192
+
+ movl %esi,32(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.L14rounds:
+ movups 16(%rdi),%xmm2
+ movl $13,%esi
+ leaq 16(%rax),%rax
+ cmpl $268435456,%r10d
+ je .L14rounds_alt
+
+ movups %xmm0,(%rdx)
+ movups %xmm2,16(%rdx)
+.byte 102,15,58,223,202,1
+ call .Lkey_expansion_256a_cold
+.byte 102,15,58,223,200,1
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,2
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,2
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,4
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,4
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,8
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,8
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,16
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,16
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,32
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,32
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,64
+ call .Lkey_expansion_256a
+ movups %xmm0,(%rax)
+ movl %esi,16(%rax)
+ xorq %rax,%rax
+ jmp .Lenc_key_ret
+
+.align 16
+.L14rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movl $7,%r10d
+ movdqu %xmm0,0(%rdx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16(%rdx)
+ jmp .Loop_key256
+
+.align 16
+.Loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ decl %r10d
+ jz .Ldone_key256
+
+ pshufd $0xff,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ leaq 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp .Loop_key256
+
+.Ldone_key256:
+ movl %esi,16(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.Lbad_keybits:
+ movq $-2,%rax
+.Lenc_key_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ addq $8,%rsp
+.cfi_adjust_cfa_offset -8
+ ret
+.cfi_endproc
+.LSEH_end_set_encrypt_key:
+
+.align 16
+.Lkey_expansion_128:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+.Lkey_expansion_128_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ ret
+
+.align 16
+.Lkey_expansion_192a:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+.Lkey_expansion_192a_cold:
+ movaps %xmm2,%xmm5
+.Lkey_expansion_192b_warm:
+ shufps $16,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ pslldq $4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd $85,%xmm1,%xmm1
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ ret
+
+.align 16
+.Lkey_expansion_192b:
+ movaps %xmm0,%xmm3
+ shufps $68,%xmm0,%xmm5
+ movups %xmm5,(%rax)
+ shufps $78,%xmm2,%xmm3
+ movups %xmm3,16(%rax)
+ leaq 32(%rax),%rax
+ jmp .Lkey_expansion_192b_warm
+
+.align 16
+.Lkey_expansion_256a:
+ movups %xmm2,(%rax)
+ leaq 16(%rax),%rax
+.Lkey_expansion_256a_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ ret
+
+.align 16
+.Lkey_expansion_256b:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+
+ shufps $16,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $140,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $170,%xmm1,%xmm1
+ xorps %xmm1,%xmm2
+ ret
+.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
+.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+.section .rodata
+.align 64
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement32:
+.long 6,6,6,0
+.Lincrement64:
+.long 1,0,0,0
+.Lxts_magic:
+.long 0x87,0,1,0
+.Lincrement1:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Lkey_rotate:
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+.Lkey_rotate192:
+.long 0x04070605,0x04070605,0x04070605,0x04070605
+.Lkey_rcon1:
+.long 1,1,1,1
+.Lkey_rcon1b:
+.long 0x1b,0x1b,0x1b,0x1b
+
+.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+.text
+#endif
diff --git a/gen/bcm/aesni-x86_64-win.asm b/gen/bcm/aesni-x86_64-win.asm
new file mode 100644
index 0000000..6c5d9ad
--- /dev/null
+++ b/gen/bcm/aesni-x86_64-win.asm
@@ -0,0 +1,2676 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .text code align=64
+
+EXTERN OPENSSL_ia32cap_P
+global aes_hw_encrypt
+
+ALIGN 16
+aes_hw_encrypt:
+
+_CET_ENDBR
+%ifdef BORINGSSL_DISPATCH_TEST
+EXTERN BORINGSSL_function_hit
+ mov BYTE[((BORINGSSL_function_hit+1))],1
+%endif
+ movups xmm2,XMMWORD[rcx]
+ mov eax,DWORD[240+r8]
+ movups xmm0,XMMWORD[r8]
+ movups xmm1,XMMWORD[16+r8]
+ lea r8,[32+r8]
+ xorps xmm2,xmm0
+$L$oop_enc1_1:
+ DB 102,15,56,220,209
+ dec eax
+ movups xmm1,XMMWORD[r8]
+ lea r8,[16+r8]
+ jnz NEAR $L$oop_enc1_1
+ DB 102,15,56,221,209
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ movups XMMWORD[rdx],xmm2
+ pxor xmm2,xmm2
+ ret
+
+
+
+global aes_hw_decrypt
+
+ALIGN 16
+aes_hw_decrypt:
+
+_CET_ENDBR
+ movups xmm2,XMMWORD[rcx]
+ mov eax,DWORD[240+r8]
+ movups xmm0,XMMWORD[r8]
+ movups xmm1,XMMWORD[16+r8]
+ lea r8,[32+r8]
+ xorps xmm2,xmm0
+$L$oop_dec1_2:
+ DB 102,15,56,222,209
+ dec eax
+ movups xmm1,XMMWORD[r8]
+ lea r8,[16+r8]
+ jnz NEAR $L$oop_dec1_2
+ DB 102,15,56,223,209
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ movups XMMWORD[rdx],xmm2
+ pxor xmm2,xmm2
+ ret
+
+
+
+ALIGN 16
+_aesni_encrypt2:
+
+ movups xmm0,XMMWORD[rcx]
+ shl eax,4
+ movups xmm1,XMMWORD[16+rcx]
+ xorps xmm2,xmm0
+ xorps xmm3,xmm0
+ movups xmm0,XMMWORD[32+rcx]
+ lea rcx,[32+rax*1+rcx]
+ neg rax
+ add rax,16
+
+$L$enc_loop2:
+ DB 102,15,56,220,209
+ DB 102,15,56,220,217
+ movups xmm1,XMMWORD[rax*1+rcx]
+ add rax,32
+ DB 102,15,56,220,208
+ DB 102,15,56,220,216
+ movups xmm0,XMMWORD[((-16))+rax*1+rcx]
+ jnz NEAR $L$enc_loop2
+
+ DB 102,15,56,220,209
+ DB 102,15,56,220,217
+ DB 102,15,56,221,208
+ DB 102,15,56,221,216
+ ret
+
+
+
+ALIGN 16
+_aesni_decrypt2:
+
+ movups xmm0,XMMWORD[rcx]
+ shl eax,4
+ movups xmm1,XMMWORD[16+rcx]
+ xorps xmm2,xmm0
+ xorps xmm3,xmm0
+ movups xmm0,XMMWORD[32+rcx]
+ lea rcx,[32+rax*1+rcx]
+ neg rax
+ add rax,16
+
+$L$dec_loop2:
+ DB 102,15,56,222,209
+ DB 102,15,56,222,217
+ movups xmm1,XMMWORD[rax*1+rcx]
+ add rax,32
+ DB 102,15,56,222,208
+ DB 102,15,56,222,216
+ movups xmm0,XMMWORD[((-16))+rax*1+rcx]
+ jnz NEAR $L$dec_loop2
+
+ DB 102,15,56,222,209
+ DB 102,15,56,222,217
+ DB 102,15,56,223,208
+ DB 102,15,56,223,216
+ ret
+
+
+
+ALIGN 16
+_aesni_encrypt3:
+
+ movups xmm0,XMMWORD[rcx]
+ shl eax,4
+ movups xmm1,XMMWORD[16+rcx]
+ xorps xmm2,xmm0
+ xorps xmm3,xmm0
+ xorps xmm4,xmm0
+ movups xmm0,XMMWORD[32+rcx]
+ lea rcx,[32+rax*1+rcx]
+ neg rax
+ add rax,16
+
+$L$enc_loop3:
+ DB 102,15,56,220,209
+ DB 102,15,56,220,217
+ DB 102,15,56,220,225
+ movups xmm1,XMMWORD[rax*1+rcx]
+ add rax,32
+ DB 102,15,56,220,208
+ DB 102,15,56,220,216
+ DB 102,15,56,220,224
+ movups xmm0,XMMWORD[((-16))+rax*1+rcx]
+ jnz NEAR $L$enc_loop3
+
+ DB 102,15,56,220,209
+ DB 102,15,56,220,217
+ DB 102,15,56,220,225
+ DB 102,15,56,221,208
+ DB 102,15,56,221,216
+ DB 102,15,56,221,224
+ ret
+
+
+
+ALIGN 16
+_aesni_decrypt3:
+
+ movups xmm0,XMMWORD[rcx]
+ shl eax,4
+ movups xmm1,XMMWORD[16+rcx]
+ xorps xmm2,xmm0
+ xorps xmm3,xmm0
+ xorps xmm4,xmm0
+ movups xmm0,XMMWORD[32+rcx]
+ lea rcx,[32+rax*1+rcx]
+ neg rax
+ add rax,16
+
+$L$dec_loop3:
+ DB 102,15,56,222,209
+ DB 102,15,56,222,217
+ DB 102,15,56,222,225
+ movups xmm1,XMMWORD[rax*1+rcx]
+ add rax,32
+ DB 102,15,56,222,208
+ DB 102,15,56,222,216
+ DB 102,15,56,222,224
+ movups xmm0,XMMWORD[((-16))+rax*1+rcx]
+ jnz NEAR $L$dec_loop3
+
+ DB 102,15,56,222,209
+ DB 102,15,56,222,217
+ DB 102,15,56,222,225
+ DB 102,15,56,223,208
+ DB 102,15,56,223,216
+ DB 102,15,56,223,224
+ ret
+
+
+
+ALIGN 16
+_aesni_encrypt4:
+
+ movups xmm0,XMMWORD[rcx]
+ shl eax,4
+ movups xmm1,XMMWORD[16+rcx]
+ xorps xmm2,xmm0
+ xorps xmm3,xmm0
+ xorps xmm4,xmm0
+ xorps xmm5,xmm0
+ movups xmm0,XMMWORD[32+rcx]
+ lea rcx,[32+rax*1+rcx]
+ neg rax
+ DB 0x0f,0x1f,0x00
+ add rax,16
+
+$L$enc_loop4:
+ DB 102,15,56,220,209
+ DB 102,15,56,220,217
+ DB 102,15,56,220,225
+ DB 102,15,56,220,233
+ movups xmm1,XMMWORD[rax*1+rcx]
+ add rax,32
+ DB 102,15,56,220,208
+ DB 102,15,56,220,216
+ DB 102,15,56,220,224
+ DB 102,15,56,220,232
+ movups xmm0,XMMWORD[((-16))+rax*1+rcx]
+ jnz NEAR $L$enc_loop4
+
+ DB 102,15,56,220,209
+ DB 102,15,56,220,217
+ DB 102,15,56,220,225
+ DB 102,15,56,220,233
+ DB 102,15,56,221,208
+ DB 102,15,56,221,216
+ DB 102,15,56,221,224
+ DB 102,15,56,221,232
+ ret
+
+
+
+ALIGN 16
+_aesni_decrypt4:
+
+ movups xmm0,XMMWORD[rcx]
+ shl eax,4
+ movups xmm1,XMMWORD[16+rcx]
+ xorps xmm2,xmm0
+ xorps xmm3,xmm0
+ xorps xmm4,xmm0
+ xorps xmm5,xmm0
+ movups xmm0,XMMWORD[32+rcx]
+ lea rcx,[32+rax*1+rcx]
+ neg rax
+ DB 0x0f,0x1f,0x00
+ add rax,16
+
+$L$dec_loop4:
+ DB 102,15,56,222,209
+ DB 102,15,56,222,217
+ DB 102,15,56,222,225
+ DB 102,15,56,222,233
+ movups xmm1,XMMWORD[rax*1+rcx]
+ add rax,32
+ DB 102,15,56,222,208
+ DB 102,15,56,222,216
+ DB 102,15,56,222,224
+ DB 102,15,56,222,232
+ movups xmm0,XMMWORD[((-16))+rax*1+rcx]
+ jnz NEAR $L$dec_loop4
+
+ DB 102,15,56,222,209
+ DB 102,15,56,222,217
+ DB 102,15,56,222,225
+ DB 102,15,56,222,233
+ DB 102,15,56,223,208
+ DB 102,15,56,223,216
+ DB 102,15,56,223,224
+ DB 102,15,56,223,232
+ ret
+
+
+
+ALIGN 16
+_aesni_encrypt6:
+
+ movups xmm0,XMMWORD[rcx]
+ shl eax,4
+ movups xmm1,XMMWORD[16+rcx]
+ xorps xmm2,xmm0
+ pxor xmm3,xmm0
+ pxor xmm4,xmm0
+ DB 102,15,56,220,209
+ lea rcx,[32+rax*1+rcx]
+ neg rax
+ DB 102,15,56,220,217
+ pxor xmm5,xmm0
+ pxor xmm6,xmm0
+ DB 102,15,56,220,225
+ pxor xmm7,xmm0
+ movups xmm0,XMMWORD[rax*1+rcx]
+ add rax,16
+ jmp NEAR $L$enc_loop6_enter
+ALIGN 16
+$L$enc_loop6:
+ DB 102,15,56,220,209
+ DB 102,15,56,220,217
+ DB 102,15,56,220,225
+$L$enc_loop6_enter:
+ DB 102,15,56,220,233
+ DB 102,15,56,220,241
+ DB 102,15,56,220,249
+ movups xmm1,XMMWORD[rax*1+rcx]
+ add rax,32
+ DB 102,15,56,220,208
+ DB 102,15,56,220,216
+ DB 102,15,56,220,224
+ DB 102,15,56,220,232
+ DB 102,15,56,220,240
+ DB 102,15,56,220,248
+ movups xmm0,XMMWORD[((-16))+rax*1+rcx]
+ jnz NEAR $L$enc_loop6
+
+ DB 102,15,56,220,209
+ DB 102,15,56,220,217
+ DB 102,15,56,220,225
+ DB 102,15,56,220,233
+ DB 102,15,56,220,241
+ DB 102,15,56,220,249
+ DB 102,15,56,221,208
+ DB 102,15,56,221,216
+ DB 102,15,56,221,224
+ DB 102,15,56,221,232
+ DB 102,15,56,221,240
+ DB 102,15,56,221,248
+ ret
+
+
+
+ALIGN 16
+_aesni_decrypt6:
+
+ movups xmm0,XMMWORD[rcx]
+ shl eax,4
+ movups xmm1,XMMWORD[16+rcx]
+ xorps xmm2,xmm0
+ pxor xmm3,xmm0
+ pxor xmm4,xmm0
+ DB 102,15,56,222,209
+ lea rcx,[32+rax*1+rcx]
+ neg rax
+ DB 102,15,56,222,217
+ pxor xmm5,xmm0
+ pxor xmm6,xmm0
+ DB 102,15,56,222,225
+ pxor xmm7,xmm0
+ movups xmm0,XMMWORD[rax*1+rcx]
+ add rax,16
+ jmp NEAR $L$dec_loop6_enter
+ALIGN 16
+$L$dec_loop6:
+ DB 102,15,56,222,209
+ DB 102,15,56,222,217
+ DB 102,15,56,222,225
+$L$dec_loop6_enter:
+ DB 102,15,56,222,233
+ DB 102,15,56,222,241
+ DB 102,15,56,222,249
+ movups xmm1,XMMWORD[rax*1+rcx]
+ add rax,32
+ DB 102,15,56,222,208
+ DB 102,15,56,222,216
+ DB 102,15,56,222,224
+ DB 102,15,56,222,232
+ DB 102,15,56,222,240
+ DB 102,15,56,222,248
+ movups xmm0,XMMWORD[((-16))+rax*1+rcx]
+ jnz NEAR $L$dec_loop6
+
+ DB 102,15,56,222,209
+ DB 102,15,56,222,217
+ DB 102,15,56,222,225
+ DB 102,15,56,222,233
+ DB 102,15,56,222,241
+ DB 102,15,56,222,249
+ DB 102,15,56,223,208
+ DB 102,15,56,223,216
+ DB 102,15,56,223,224
+ DB 102,15,56,223,232
+ DB 102,15,56,223,240
+ DB 102,15,56,223,248
+ ret
+
+
+
+ALIGN 16
+_aesni_encrypt8:
+
+ movups xmm0,XMMWORD[rcx]
+ shl eax,4
+ movups xmm1,XMMWORD[16+rcx]
+ xorps xmm2,xmm0
+ xorps xmm3,xmm0
+ pxor xmm4,xmm0
+ pxor xmm5,xmm0
+ pxor xmm6,xmm0
+ lea rcx,[32+rax*1+rcx]
+ neg rax
+ DB 102,15,56,220,209
+ pxor xmm7,xmm0
+ pxor xmm8,xmm0
+ DB 102,15,56,220,217
+ pxor xmm9,xmm0
+ movups xmm0,XMMWORD[rax*1+rcx]
+ add rax,16
+ jmp NEAR $L$enc_loop8_inner
+ALIGN 16
+$L$enc_loop8:
+ DB 102,15,56,220,209
+ DB 102,15,56,220,217
+$L$enc_loop8_inner:
+ DB 102,15,56,220,225
+ DB 102,15,56,220,233
+ DB 102,15,56,220,241
+ DB 102,15,56,220,249
+ DB 102,68,15,56,220,193
+ DB 102,68,15,56,220,201
+$L$enc_loop8_enter:
+ movups xmm1,XMMWORD[rax*1+rcx]
+ add rax,32
+ DB 102,15,56,220,208
+ DB 102,15,56,220,216
+ DB 102,15,56,220,224
+ DB 102,15,56,220,232
+ DB 102,15,56,220,240
+ DB 102,15,56,220,248
+ DB 102,68,15,56,220,192
+ DB 102,68,15,56,220,200
+ movups xmm0,XMMWORD[((-16))+rax*1+rcx]
+ jnz NEAR $L$enc_loop8
+
+ DB 102,15,56,220,209
+ DB 102,15,56,220,217
+ DB 102,15,56,220,225
+ DB 102,15,56,220,233
+ DB 102,15,56,220,241
+ DB 102,15,56,220,249
+ DB 102,68,15,56,220,193
+ DB 102,68,15,56,220,201
+ DB 102,15,56,221,208
+ DB 102,15,56,221,216
+ DB 102,15,56,221,224
+ DB 102,15,56,221,232
+ DB 102,15,56,221,240
+ DB 102,15,56,221,248
+ DB 102,68,15,56,221,192
+ DB 102,68,15,56,221,200
+ ret
+
+
+
+ALIGN 16
+_aesni_decrypt8:
+
+ movups xmm0,XMMWORD[rcx]
+ shl eax,4
+ movups xmm1,XMMWORD[16+rcx]
+ xorps xmm2,xmm0
+ xorps xmm3,xmm0
+ pxor xmm4,xmm0
+ pxor xmm5,xmm0
+ pxor xmm6,xmm0
+ lea rcx,[32+rax*1+rcx]
+ neg rax
+ DB 102,15,56,222,209
+ pxor xmm7,xmm0
+ pxor xmm8,xmm0
+ DB 102,15,56,222,217
+ pxor xmm9,xmm0
+ movups xmm0,XMMWORD[rax*1+rcx]
+ add rax,16
+ jmp NEAR $L$dec_loop8_inner
+ALIGN 16
+$L$dec_loop8:
+ DB 102,15,56,222,209
+ DB 102,15,56,222,217
+$L$dec_loop8_inner:
+ DB 102,15,56,222,225
+ DB 102,15,56,222,233
+ DB 102,15,56,222,241
+ DB 102,15,56,222,249
+ DB 102,68,15,56,222,193
+ DB 102,68,15,56,222,201
+$L$dec_loop8_enter:
+ movups xmm1,XMMWORD[rax*1+rcx]
+ add rax,32
+ DB 102,15,56,222,208
+ DB 102,15,56,222,216
+ DB 102,15,56,222,224
+ DB 102,15,56,222,232
+ DB 102,15,56,222,240
+ DB 102,15,56,222,248
+ DB 102,68,15,56,222,192
+ DB 102,68,15,56,222,200
+ movups xmm0,XMMWORD[((-16))+rax*1+rcx]
+ jnz NEAR $L$dec_loop8
+
+ DB 102,15,56,222,209
+ DB 102,15,56,222,217
+ DB 102,15,56,222,225
+ DB 102,15,56,222,233
+ DB 102,15,56,222,241
+ DB 102,15,56,222,249
+ DB 102,68,15,56,222,193
+ DB 102,68,15,56,222,201
+ DB 102,15,56,223,208
+ DB 102,15,56,223,216
+ DB 102,15,56,223,224
+ DB 102,15,56,223,232
+ DB 102,15,56,223,240
+ DB 102,15,56,223,248
+ DB 102,68,15,56,223,192
+ DB 102,68,15,56,223,200
+ ret
+
+
+global aes_hw_ecb_encrypt
+
+ALIGN 16
+aes_hw_ecb_encrypt:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes_hw_ecb_encrypt:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+ lea rsp,[((-88))+rsp]
+ movaps XMMWORD[rsp],xmm6
+ movaps XMMWORD[16+rsp],xmm7
+ movaps XMMWORD[32+rsp],xmm8
+ movaps XMMWORD[48+rsp],xmm9
+$L$ecb_enc_body:
+ and rdx,-16
+ jz NEAR $L$ecb_ret
+
+ mov eax,DWORD[240+rcx]
+ movups xmm0,XMMWORD[rcx]
+ mov r11,rcx
+ mov r10d,eax
+ test r8d,r8d
+ jz NEAR $L$ecb_decrypt
+
+ cmp rdx,0x80
+ jb NEAR $L$ecb_enc_tail
+
+ movdqu xmm2,XMMWORD[rdi]
+ movdqu xmm3,XMMWORD[16+rdi]
+ movdqu xmm4,XMMWORD[32+rdi]
+ movdqu xmm5,XMMWORD[48+rdi]
+ movdqu xmm6,XMMWORD[64+rdi]
+ movdqu xmm7,XMMWORD[80+rdi]
+ movdqu xmm8,XMMWORD[96+rdi]
+ movdqu xmm9,XMMWORD[112+rdi]
+ lea rdi,[128+rdi]
+ sub rdx,0x80
+ jmp NEAR $L$ecb_enc_loop8_enter
+ALIGN 16
+$L$ecb_enc_loop8:
+ movups XMMWORD[rsi],xmm2
+ mov rcx,r11
+ movdqu xmm2,XMMWORD[rdi]
+ mov eax,r10d
+ movups XMMWORD[16+rsi],xmm3
+ movdqu xmm3,XMMWORD[16+rdi]
+ movups XMMWORD[32+rsi],xmm4
+ movdqu xmm4,XMMWORD[32+rdi]
+ movups XMMWORD[48+rsi],xmm5
+ movdqu xmm5,XMMWORD[48+rdi]
+ movups XMMWORD[64+rsi],xmm6
+ movdqu xmm6,XMMWORD[64+rdi]
+ movups XMMWORD[80+rsi],xmm7
+ movdqu xmm7,XMMWORD[80+rdi]
+ movups XMMWORD[96+rsi],xmm8
+ movdqu xmm8,XMMWORD[96+rdi]
+ movups XMMWORD[112+rsi],xmm9
+ lea rsi,[128+rsi]
+ movdqu xmm9,XMMWORD[112+rdi]
+ lea rdi,[128+rdi]
+$L$ecb_enc_loop8_enter:
+
+ call _aesni_encrypt8
+
+ sub rdx,0x80
+ jnc NEAR $L$ecb_enc_loop8
+
+ movups XMMWORD[rsi],xmm2
+ mov rcx,r11
+ movups XMMWORD[16+rsi],xmm3
+ mov eax,r10d
+ movups XMMWORD[32+rsi],xmm4
+ movups XMMWORD[48+rsi],xmm5
+ movups XMMWORD[64+rsi],xmm6
+ movups XMMWORD[80+rsi],xmm7
+ movups XMMWORD[96+rsi],xmm8
+ movups XMMWORD[112+rsi],xmm9
+ lea rsi,[128+rsi]
+ add rdx,0x80
+ jz NEAR $L$ecb_ret
+
+$L$ecb_enc_tail:
+ movups xmm2,XMMWORD[rdi]
+ cmp rdx,0x20
+ jb NEAR $L$ecb_enc_one
+ movups xmm3,XMMWORD[16+rdi]
+ je NEAR $L$ecb_enc_two
+ movups xmm4,XMMWORD[32+rdi]
+ cmp rdx,0x40
+ jb NEAR $L$ecb_enc_three
+ movups xmm5,XMMWORD[48+rdi]
+ je NEAR $L$ecb_enc_four
+ movups xmm6,XMMWORD[64+rdi]
+ cmp rdx,0x60
+ jb NEAR $L$ecb_enc_five
+ movups xmm7,XMMWORD[80+rdi]
+ je NEAR $L$ecb_enc_six
+ movdqu xmm8,XMMWORD[96+rdi]
+ xorps xmm9,xmm9
+ call _aesni_encrypt8
+ movups XMMWORD[rsi],xmm2
+ movups XMMWORD[16+rsi],xmm3
+ movups XMMWORD[32+rsi],xmm4
+ movups XMMWORD[48+rsi],xmm5
+ movups XMMWORD[64+rsi],xmm6
+ movups XMMWORD[80+rsi],xmm7
+ movups XMMWORD[96+rsi],xmm8
+ jmp NEAR $L$ecb_ret
+ALIGN 16
+$L$ecb_enc_one:
+ movups xmm0,XMMWORD[rcx]
+ movups xmm1,XMMWORD[16+rcx]
+ lea rcx,[32+rcx]
+ xorps xmm2,xmm0
+$L$oop_enc1_3:
+ DB 102,15,56,220,209
+ dec eax
+ movups xmm1,XMMWORD[rcx]
+ lea rcx,[16+rcx]
+ jnz NEAR $L$oop_enc1_3
+ DB 102,15,56,221,209
+ movups XMMWORD[rsi],xmm2
+ jmp NEAR $L$ecb_ret
+ALIGN 16
+$L$ecb_enc_two:
+ call _aesni_encrypt2
+ movups XMMWORD[rsi],xmm2
+ movups XMMWORD[16+rsi],xmm3
+ jmp NEAR $L$ecb_ret
+ALIGN 16
+$L$ecb_enc_three:
+ call _aesni_encrypt3
+ movups XMMWORD[rsi],xmm2
+ movups XMMWORD[16+rsi],xmm3
+ movups XMMWORD[32+rsi],xmm4
+ jmp NEAR $L$ecb_ret
+ALIGN 16
+$L$ecb_enc_four:
+ call _aesni_encrypt4
+ movups XMMWORD[rsi],xmm2
+ movups XMMWORD[16+rsi],xmm3
+ movups XMMWORD[32+rsi],xmm4
+ movups XMMWORD[48+rsi],xmm5
+ jmp NEAR $L$ecb_ret
+ALIGN 16
+$L$ecb_enc_five:
+ xorps xmm7,xmm7
+ call _aesni_encrypt6
+ movups XMMWORD[rsi],xmm2
+ movups XMMWORD[16+rsi],xmm3
+ movups XMMWORD[32+rsi],xmm4
+ movups XMMWORD[48+rsi],xmm5
+ movups XMMWORD[64+rsi],xmm6
+ jmp NEAR $L$ecb_ret
+ALIGN 16
+$L$ecb_enc_six:
+ call _aesni_encrypt6
+ movups XMMWORD[rsi],xmm2
+ movups XMMWORD[16+rsi],xmm3
+ movups XMMWORD[32+rsi],xmm4
+ movups XMMWORD[48+rsi],xmm5
+ movups XMMWORD[64+rsi],xmm6
+ movups XMMWORD[80+rsi],xmm7
+ jmp NEAR $L$ecb_ret
+
+ALIGN 16
+$L$ecb_decrypt:
+ cmp rdx,0x80
+ jb NEAR $L$ecb_dec_tail
+
+ movdqu xmm2,XMMWORD[rdi]
+ movdqu xmm3,XMMWORD[16+rdi]
+ movdqu xmm4,XMMWORD[32+rdi]
+ movdqu xmm5,XMMWORD[48+rdi]
+ movdqu xmm6,XMMWORD[64+rdi]
+ movdqu xmm7,XMMWORD[80+rdi]
+ movdqu xmm8,XMMWORD[96+rdi]
+ movdqu xmm9,XMMWORD[112+rdi]
+ lea rdi,[128+rdi]
+ sub rdx,0x80
+ jmp NEAR $L$ecb_dec_loop8_enter
+ALIGN 16
+$L$ecb_dec_loop8:
+ movups XMMWORD[rsi],xmm2
+ mov rcx,r11
+ movdqu xmm2,XMMWORD[rdi]
+ mov eax,r10d
+ movups XMMWORD[16+rsi],xmm3
+ movdqu xmm3,XMMWORD[16+rdi]
+ movups XMMWORD[32+rsi],xmm4
+ movdqu xmm4,XMMWORD[32+rdi]
+ movups XMMWORD[48+rsi],xmm5
+ movdqu xmm5,XMMWORD[48+rdi]
+ movups XMMWORD[64+rsi],xmm6
+ movdqu xmm6,XMMWORD[64+rdi]
+ movups XMMWORD[80+rsi],xmm7
+ movdqu xmm7,XMMWORD[80+rdi]
+ movups XMMWORD[96+rsi],xmm8
+ movdqu xmm8,XMMWORD[96+rdi]
+ movups XMMWORD[112+rsi],xmm9
+ lea rsi,[128+rsi]
+ movdqu xmm9,XMMWORD[112+rdi]
+ lea rdi,[128+rdi]
+$L$ecb_dec_loop8_enter:
+
+ call _aesni_decrypt8
+
+ movups xmm0,XMMWORD[r11]
+ sub rdx,0x80
+ jnc NEAR $L$ecb_dec_loop8
+
+ movups XMMWORD[rsi],xmm2
+ pxor xmm2,xmm2
+ mov rcx,r11
+ movups XMMWORD[16+rsi],xmm3
+ pxor xmm3,xmm3
+ mov eax,r10d
+ movups XMMWORD[32+rsi],xmm4
+ pxor xmm4,xmm4
+ movups XMMWORD[48+rsi],xmm5
+ pxor xmm5,xmm5
+ movups XMMWORD[64+rsi],xmm6
+ pxor xmm6,xmm6
+ movups XMMWORD[80+rsi],xmm7
+ pxor xmm7,xmm7
+ movups XMMWORD[96+rsi],xmm8
+ pxor xmm8,xmm8
+ movups XMMWORD[112+rsi],xmm9
+ pxor xmm9,xmm9
+ lea rsi,[128+rsi]
+ add rdx,0x80
+ jz NEAR $L$ecb_ret
+
+$L$ecb_dec_tail:
+ movups xmm2,XMMWORD[rdi]
+ cmp rdx,0x20
+ jb NEAR $L$ecb_dec_one
+ movups xmm3,XMMWORD[16+rdi]
+ je NEAR $L$ecb_dec_two
+ movups xmm4,XMMWORD[32+rdi]
+ cmp rdx,0x40
+ jb NEAR $L$ecb_dec_three
+ movups xmm5,XMMWORD[48+rdi]
+ je NEAR $L$ecb_dec_four
+ movups xmm6,XMMWORD[64+rdi]
+ cmp rdx,0x60
+ jb NEAR $L$ecb_dec_five
+ movups xmm7,XMMWORD[80+rdi]
+ je NEAR $L$ecb_dec_six
+ movups xmm8,XMMWORD[96+rdi]
+ movups xmm0,XMMWORD[rcx]
+ xorps xmm9,xmm9
+ call _aesni_decrypt8
+ movups XMMWORD[rsi],xmm2
+ pxor xmm2,xmm2
+ movups XMMWORD[16+rsi],xmm3
+ pxor xmm3,xmm3
+ movups XMMWORD[32+rsi],xmm4
+ pxor xmm4,xmm4
+ movups XMMWORD[48+rsi],xmm5
+ pxor xmm5,xmm5
+ movups XMMWORD[64+rsi],xmm6
+ pxor xmm6,xmm6
+ movups XMMWORD[80+rsi],xmm7
+ pxor xmm7,xmm7
+ movups XMMWORD[96+rsi],xmm8
+ pxor xmm8,xmm8
+ pxor xmm9,xmm9
+ jmp NEAR $L$ecb_ret
+ALIGN 16
+$L$ecb_dec_one:
+ movups xmm0,XMMWORD[rcx]
+ movups xmm1,XMMWORD[16+rcx]
+ lea rcx,[32+rcx]
+ xorps xmm2,xmm0
+$L$oop_dec1_4:
+ DB 102,15,56,222,209
+ dec eax
+ movups xmm1,XMMWORD[rcx]
+ lea rcx,[16+rcx]
+ jnz NEAR $L$oop_dec1_4
+ DB 102,15,56,223,209
+ movups XMMWORD[rsi],xmm2
+ pxor xmm2,xmm2
+ jmp NEAR $L$ecb_ret
+ALIGN 16
+$L$ecb_dec_two:
+ call _aesni_decrypt2
+ movups XMMWORD[rsi],xmm2
+ pxor xmm2,xmm2
+ movups XMMWORD[16+rsi],xmm3
+ pxor xmm3,xmm3
+ jmp NEAR $L$ecb_ret
+ALIGN 16
+$L$ecb_dec_three:
+ call _aesni_decrypt3
+ movups XMMWORD[rsi],xmm2
+ pxor xmm2,xmm2
+ movups XMMWORD[16+rsi],xmm3
+ pxor xmm3,xmm3
+ movups XMMWORD[32+rsi],xmm4
+ pxor xmm4,xmm4
+ jmp NEAR $L$ecb_ret
+ALIGN 16
+$L$ecb_dec_four:
+ call _aesni_decrypt4
+ movups XMMWORD[rsi],xmm2
+ pxor xmm2,xmm2
+ movups XMMWORD[16+rsi],xmm3
+ pxor xmm3,xmm3
+ movups XMMWORD[32+rsi],xmm4
+ pxor xmm4,xmm4
+ movups XMMWORD[48+rsi],xmm5
+ pxor xmm5,xmm5
+ jmp NEAR $L$ecb_ret
+ALIGN 16
+$L$ecb_dec_five:
+ xorps xmm7,xmm7
+ call _aesni_decrypt6
+ movups XMMWORD[rsi],xmm2
+ pxor xmm2,xmm2
+ movups XMMWORD[16+rsi],xmm3
+ pxor xmm3,xmm3
+ movups XMMWORD[32+rsi],xmm4
+ pxor xmm4,xmm4
+ movups XMMWORD[48+rsi],xmm5
+ pxor xmm5,xmm5
+ movups XMMWORD[64+rsi],xmm6
+ pxor xmm6,xmm6
+ pxor xmm7,xmm7
+ jmp NEAR $L$ecb_ret
+ALIGN 16
+$L$ecb_dec_six:
+ call _aesni_decrypt6
+ movups XMMWORD[rsi],xmm2
+ pxor xmm2,xmm2
+ movups XMMWORD[16+rsi],xmm3
+ pxor xmm3,xmm3
+ movups XMMWORD[32+rsi],xmm4
+ pxor xmm4,xmm4
+ movups XMMWORD[48+rsi],xmm5
+ pxor xmm5,xmm5
+ movups XMMWORD[64+rsi],xmm6
+ pxor xmm6,xmm6
+ movups XMMWORD[80+rsi],xmm7
+ pxor xmm7,xmm7
+
+$L$ecb_ret:
+ xorps xmm0,xmm0
+ pxor xmm1,xmm1
+ movaps xmm6,XMMWORD[rsp]
+ movaps XMMWORD[rsp],xmm0
+ movaps xmm7,XMMWORD[16+rsp]
+ movaps XMMWORD[16+rsp],xmm0
+ movaps xmm8,XMMWORD[32+rsp]
+ movaps XMMWORD[32+rsp],xmm0
+ movaps xmm9,XMMWORD[48+rsp]
+ movaps XMMWORD[48+rsp],xmm0
+ lea rsp,[88+rsp]
+$L$ecb_enc_ret:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_aes_hw_ecb_encrypt:
+global aes_hw_ctr32_encrypt_blocks
+
+ALIGN 16
+aes_hw_ctr32_encrypt_blocks:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes_hw_ctr32_encrypt_blocks:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+%ifdef BORINGSSL_DISPATCH_TEST
+ mov BYTE[BORINGSSL_function_hit],1
+%endif
+ cmp rdx,1
+ jne NEAR $L$ctr32_bulk
+
+
+
+ movups xmm2,XMMWORD[r8]
+ movups xmm3,XMMWORD[rdi]
+ mov edx,DWORD[240+rcx]
+ movups xmm0,XMMWORD[rcx]
+ movups xmm1,XMMWORD[16+rcx]
+ lea rcx,[32+rcx]
+ xorps xmm2,xmm0
+$L$oop_enc1_5:
+ DB 102,15,56,220,209
+ dec edx
+ movups xmm1,XMMWORD[rcx]
+ lea rcx,[16+rcx]
+ jnz NEAR $L$oop_enc1_5
+ DB 102,15,56,221,209
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ xorps xmm2,xmm3
+ pxor xmm3,xmm3
+ movups XMMWORD[rsi],xmm2
+ xorps xmm2,xmm2
+ jmp NEAR $L$ctr32_epilogue
+
+ALIGN 16
+$L$ctr32_bulk:
+ lea r11,[rsp]
+
+ push rbp
+
+ sub rsp,288
+ and rsp,-16
+ movaps XMMWORD[(-168)+r11],xmm6
+ movaps XMMWORD[(-152)+r11],xmm7
+ movaps XMMWORD[(-136)+r11],xmm8
+ movaps XMMWORD[(-120)+r11],xmm9
+ movaps XMMWORD[(-104)+r11],xmm10
+ movaps XMMWORD[(-88)+r11],xmm11
+ movaps XMMWORD[(-72)+r11],xmm12
+ movaps XMMWORD[(-56)+r11],xmm13
+ movaps XMMWORD[(-40)+r11],xmm14
+ movaps XMMWORD[(-24)+r11],xmm15
+$L$ctr32_body:
+
+
+
+
+ movdqu xmm2,XMMWORD[r8]
+ movdqu xmm0,XMMWORD[rcx]
+ mov r8d,DWORD[12+r8]
+ pxor xmm2,xmm0
+ mov ebp,DWORD[12+rcx]
+ movdqa XMMWORD[rsp],xmm2
+ bswap r8d
+ movdqa xmm3,xmm2
+ movdqa xmm4,xmm2
+ movdqa xmm5,xmm2
+ movdqa XMMWORD[64+rsp],xmm2
+ movdqa XMMWORD[80+rsp],xmm2
+ movdqa XMMWORD[96+rsp],xmm2
+ mov r10,rdx
+ movdqa XMMWORD[112+rsp],xmm2
+
+ lea rax,[1+r8]
+ lea rdx,[2+r8]
+ bswap eax
+ bswap edx
+ xor eax,ebp
+ xor edx,ebp
+DB 102,15,58,34,216,3
+ lea rax,[3+r8]
+ movdqa XMMWORD[16+rsp],xmm3
+DB 102,15,58,34,226,3
+ bswap eax
+ mov rdx,r10
+ lea r10,[4+r8]
+ movdqa XMMWORD[32+rsp],xmm4
+ xor eax,ebp
+ bswap r10d
+DB 102,15,58,34,232,3
+ xor r10d,ebp
+ movdqa XMMWORD[48+rsp],xmm5
+ lea r9,[5+r8]
+ mov DWORD[((64+12))+rsp],r10d
+ bswap r9d
+ lea r10,[6+r8]
+ mov eax,DWORD[240+rcx]
+ xor r9d,ebp
+ bswap r10d
+ mov DWORD[((80+12))+rsp],r9d
+ xor r10d,ebp
+ lea r9,[7+r8]
+ mov DWORD[((96+12))+rsp],r10d
+ bswap r9d
+ xor r9d,ebp
+ mov DWORD[((112+12))+rsp],r9d
+
+ movups xmm1,XMMWORD[16+rcx]
+
+ movdqa xmm6,XMMWORD[64+rsp]
+ movdqa xmm7,XMMWORD[80+rsp]
+
+ cmp rdx,8
+ jb NEAR $L$ctr32_tail
+
+ lea rcx,[128+rcx]
+ sub rdx,8
+ jmp NEAR $L$ctr32_loop8
+
+ALIGN 32
+$L$ctr32_loop8:
+ add r8d,8
+ movdqa xmm8,XMMWORD[96+rsp]
+ DB 102,15,56,220,209
+ mov r9d,r8d
+ movdqa xmm9,XMMWORD[112+rsp]
+ DB 102,15,56,220,217
+ bswap r9d
+ movups xmm0,XMMWORD[((32-128))+rcx]
+ DB 102,15,56,220,225
+ xor r9d,ebp
+ nop
+ DB 102,15,56,220,233
+ mov DWORD[((0+12))+rsp],r9d
+ lea r9,[1+r8]
+ DB 102,15,56,220,241
+ DB 102,15,56,220,249
+ DB 102,68,15,56,220,193
+ DB 102,68,15,56,220,201
+ movups xmm1,XMMWORD[((48-128))+rcx]
+ bswap r9d
+ DB 102,15,56,220,208
+ DB 102,15,56,220,216
+ xor r9d,ebp
+ DB 0x66,0x90
+ DB 102,15,56,220,224
+ DB 102,15,56,220,232
+ mov DWORD[((16+12))+rsp],r9d
+ lea r9,[2+r8]
+ DB 102,15,56,220,240
+ DB 102,15,56,220,248
+ DB 102,68,15,56,220,192
+ DB 102,68,15,56,220,200
+ movups xmm0,XMMWORD[((64-128))+rcx]
+ bswap r9d
+ DB 102,15,56,220,209
+ DB 102,15,56,220,217
+ xor r9d,ebp
+ DB 0x66,0x90
+ DB 102,15,56,220,225
+ DB 102,15,56,220,233
+ mov DWORD[((32+12))+rsp],r9d
+ lea r9,[3+r8]
+ DB 102,15,56,220,241
+ DB 102,15,56,220,249
+ DB 102,68,15,56,220,193
+ DB 102,68,15,56,220,201
+ movups xmm1,XMMWORD[((80-128))+rcx]
+ bswap r9d
+ DB 102,15,56,220,208
+ DB 102,15,56,220,216
+ xor r9d,ebp
+ DB 0x66,0x90
+ DB 102,15,56,220,224
+ DB 102,15,56,220,232
+ mov DWORD[((48+12))+rsp],r9d
+ lea r9,[4+r8]
+ DB 102,15,56,220,240
+ DB 102,15,56,220,248
+ DB 102,68,15,56,220,192
+ DB 102,68,15,56,220,200
+ movups xmm0,XMMWORD[((96-128))+rcx]
+ bswap r9d
+ DB 102,15,56,220,209
+ DB 102,15,56,220,217
+ xor r9d,ebp
+ DB 0x66,0x90
+ DB 102,15,56,220,225
+ DB 102,15,56,220,233
+ mov DWORD[((64+12))+rsp],r9d
+ lea r9,[5+r8]
+ DB 102,15,56,220,241
+ DB 102,15,56,220,249
+ DB 102,68,15,56,220,193
+ DB 102,68,15,56,220,201
+ movups xmm1,XMMWORD[((112-128))+rcx]
+ bswap r9d
+ DB 102,15,56,220,208
+ DB 102,15,56,220,216
+ xor r9d,ebp
+ DB 0x66,0x90
+ DB 102,15,56,220,224
+ DB 102,15,56,220,232
+ mov DWORD[((80+12))+rsp],r9d
+ lea r9,[6+r8]
+ DB 102,15,56,220,240
+ DB 102,15,56,220,248
+ DB 102,68,15,56,220,192
+ DB 102,68,15,56,220,200
+ movups xmm0,XMMWORD[((128-128))+rcx]
+ bswap r9d
+ DB 102,15,56,220,209
+ DB 102,15,56,220,217
+ xor r9d,ebp
+ DB 0x66,0x90
+ DB 102,15,56,220,225
+ DB 102,15,56,220,233
+ mov DWORD[((96+12))+rsp],r9d
+ lea r9,[7+r8]
+ DB 102,15,56,220,241
+ DB 102,15,56,220,249
+ DB 102,68,15,56,220,193
+ DB 102,68,15,56,220,201
+ movups xmm1,XMMWORD[((144-128))+rcx]
+ bswap r9d
+ DB 102,15,56,220,208
+ DB 102,15,56,220,216
+ DB 102,15,56,220,224
+ xor r9d,ebp
+ movdqu xmm10,XMMWORD[rdi]
+ DB 102,15,56,220,232
+ mov DWORD[((112+12))+rsp],r9d
+ cmp eax,11
+ DB 102,15,56,220,240
+ DB 102,15,56,220,248
+ DB 102,68,15,56,220,192
+ DB 102,68,15,56,220,200
+ movups xmm0,XMMWORD[((160-128))+rcx]
+
+ jb NEAR $L$ctr32_enc_done
+
+ DB 102,15,56,220,209
+ DB 102,15,56,220,217
+ DB 102,15,56,220,225
+ DB 102,15,56,220,233
+ DB 102,15,56,220,241
+ DB 102,15,56,220,249
+ DB 102,68,15,56,220,193
+ DB 102,68,15,56,220,201
+ movups xmm1,XMMWORD[((176-128))+rcx]
+
+ DB 102,15,56,220,208
+ DB 102,15,56,220,216
+ DB 102,15,56,220,224
+ DB 102,15,56,220,232
+ DB 102,15,56,220,240
+ DB 102,15,56,220,248
+ DB 102,68,15,56,220,192
+ DB 102,68,15,56,220,200
+ movups xmm0,XMMWORD[((192-128))+rcx]
+ je NEAR $L$ctr32_enc_done
+
+ DB 102,15,56,220,209
+ DB 102,15,56,220,217
+ DB 102,15,56,220,225
+ DB 102,15,56,220,233
+ DB 102,15,56,220,241
+ DB 102,15,56,220,249
+ DB 102,68,15,56,220,193
+ DB 102,68,15,56,220,201
+ movups xmm1,XMMWORD[((208-128))+rcx]
+
+ DB 102,15,56,220,208
+ DB 102,15,56,220,216
+ DB 102,15,56,220,224
+ DB 102,15,56,220,232
+ DB 102,15,56,220,240
+ DB 102,15,56,220,248
+ DB 102,68,15,56,220,192
+ DB 102,68,15,56,220,200
+ movups xmm0,XMMWORD[((224-128))+rcx]
+ jmp NEAR $L$ctr32_enc_done
+
+ALIGN 16
+$L$ctr32_enc_done:
+ movdqu xmm11,XMMWORD[16+rdi]
+ pxor xmm10,xmm0
+ movdqu xmm12,XMMWORD[32+rdi]
+ pxor xmm11,xmm0
+ movdqu xmm13,XMMWORD[48+rdi]
+ pxor xmm12,xmm0
+ movdqu xmm14,XMMWORD[64+rdi]
+ pxor xmm13,xmm0
+ movdqu xmm15,XMMWORD[80+rdi]
+ pxor xmm14,xmm0
+ prefetcht0 [448+rdi]
+ prefetcht0 [512+rdi]
+ pxor xmm15,xmm0
+ DB 102,15,56,220,209
+ DB 102,15,56,220,217
+ DB 102,15,56,220,225
+ DB 102,15,56,220,233
+ DB 102,15,56,220,241
+ DB 102,15,56,220,249
+ DB 102,68,15,56,220,193
+ DB 102,68,15,56,220,201
+ movdqu xmm1,XMMWORD[96+rdi]
+ lea rdi,[128+rdi]
+
+ DB 102,65,15,56,221,210
+ pxor xmm1,xmm0
+ movdqu xmm10,XMMWORD[((112-128))+rdi]
+ DB 102,65,15,56,221,219
+ pxor xmm10,xmm0
+ movdqa xmm11,XMMWORD[rsp]
+ DB 102,65,15,56,221,228
+ DB 102,65,15,56,221,237
+ movdqa xmm12,XMMWORD[16+rsp]
+ movdqa xmm13,XMMWORD[32+rsp]
+ DB 102,65,15,56,221,246
+ DB 102,65,15,56,221,255
+ movdqa xmm14,XMMWORD[48+rsp]
+ movdqa xmm15,XMMWORD[64+rsp]
+ DB 102,68,15,56,221,193
+ movdqa xmm0,XMMWORD[80+rsp]
+ movups xmm1,XMMWORD[((16-128))+rcx]
+ DB 102,69,15,56,221,202
+
+ movups XMMWORD[rsi],xmm2
+ movdqa xmm2,xmm11
+ movups XMMWORD[16+rsi],xmm3
+ movdqa xmm3,xmm12
+ movups XMMWORD[32+rsi],xmm4
+ movdqa xmm4,xmm13
+ movups XMMWORD[48+rsi],xmm5
+ movdqa xmm5,xmm14
+ movups XMMWORD[64+rsi],xmm6
+ movdqa xmm6,xmm15
+ movups XMMWORD[80+rsi],xmm7
+ movdqa xmm7,xmm0
+ movups XMMWORD[96+rsi],xmm8
+ movups XMMWORD[112+rsi],xmm9
+ lea rsi,[128+rsi]
+
+ sub rdx,8
+ jnc NEAR $L$ctr32_loop8
+
+ add rdx,8
+ jz NEAR $L$ctr32_done
+ lea rcx,[((-128))+rcx]
+
+$L$ctr32_tail:
+
+
+ lea rcx,[16+rcx]
+ cmp rdx,4
+ jb NEAR $L$ctr32_loop3
+ je NEAR $L$ctr32_loop4
+
+
+ shl eax,4
+ movdqa xmm8,XMMWORD[96+rsp]
+ pxor xmm9,xmm9
+
+ movups xmm0,XMMWORD[16+rcx]
+ DB 102,15,56,220,209
+ DB 102,15,56,220,217
+ lea rcx,[((32-16))+rax*1+rcx]
+ neg rax
+ DB 102,15,56,220,225
+ add rax,16
+ movups xmm10,XMMWORD[rdi]
+ DB 102,15,56,220,233
+ DB 102,15,56,220,241
+ movups xmm11,XMMWORD[16+rdi]
+ movups xmm12,XMMWORD[32+rdi]
+ DB 102,15,56,220,249
+ DB 102,68,15,56,220,193
+
+ call $L$enc_loop8_enter
+
+ movdqu xmm13,XMMWORD[48+rdi]
+ pxor xmm2,xmm10
+ movdqu xmm10,XMMWORD[64+rdi]
+ pxor xmm3,xmm11
+ movdqu XMMWORD[rsi],xmm2
+ pxor xmm4,xmm12
+ movdqu XMMWORD[16+rsi],xmm3
+ pxor xmm5,xmm13
+ movdqu XMMWORD[32+rsi],xmm4
+ pxor xmm6,xmm10
+ movdqu XMMWORD[48+rsi],xmm5
+ movdqu XMMWORD[64+rsi],xmm6
+ cmp rdx,6
+ jb NEAR $L$ctr32_done
+
+ movups xmm11,XMMWORD[80+rdi]
+ xorps xmm7,xmm11
+ movups XMMWORD[80+rsi],xmm7
+ je NEAR $L$ctr32_done
+
+ movups xmm12,XMMWORD[96+rdi]
+ xorps xmm8,xmm12
+ movups XMMWORD[96+rsi],xmm8
+ jmp NEAR $L$ctr32_done
+
+ALIGN 32
+$L$ctr32_loop4:
+ DB 102,15,56,220,209
+ lea rcx,[16+rcx]
+ dec eax
+ DB 102,15,56,220,217
+ DB 102,15,56,220,225
+ DB 102,15,56,220,233
+ movups xmm1,XMMWORD[rcx]
+ jnz NEAR $L$ctr32_loop4
+ DB 102,15,56,221,209
+ DB 102,15,56,221,217
+ movups xmm10,XMMWORD[rdi]
+ movups xmm11,XMMWORD[16+rdi]
+ DB 102,15,56,221,225
+ DB 102,15,56,221,233
+ movups xmm12,XMMWORD[32+rdi]
+ movups xmm13,XMMWORD[48+rdi]
+
+ xorps xmm2,xmm10
+ movups XMMWORD[rsi],xmm2
+ xorps xmm3,xmm11
+ movups XMMWORD[16+rsi],xmm3
+ pxor xmm4,xmm12
+ movdqu XMMWORD[32+rsi],xmm4
+ pxor xmm5,xmm13
+ movdqu XMMWORD[48+rsi],xmm5
+ jmp NEAR $L$ctr32_done
+
+ALIGN 32
+$L$ctr32_loop3:
+ DB 102,15,56,220,209
+ lea rcx,[16+rcx]
+ dec eax
+ DB 102,15,56,220,217
+ DB 102,15,56,220,225
+ movups xmm1,XMMWORD[rcx]
+ jnz NEAR $L$ctr32_loop3
+ DB 102,15,56,221,209
+ DB 102,15,56,221,217
+ DB 102,15,56,221,225
+
+ movups xmm10,XMMWORD[rdi]
+ xorps xmm2,xmm10
+ movups XMMWORD[rsi],xmm2
+ cmp rdx,2
+ jb NEAR $L$ctr32_done
+
+ movups xmm11,XMMWORD[16+rdi]
+ xorps xmm3,xmm11
+ movups XMMWORD[16+rsi],xmm3
+ je NEAR $L$ctr32_done
+
+ movups xmm12,XMMWORD[32+rdi]
+ xorps xmm4,xmm12
+ movups XMMWORD[32+rsi],xmm4
+
+$L$ctr32_done:
+ xorps xmm0,xmm0
+ xor ebp,ebp
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ movaps xmm6,XMMWORD[((-168))+r11]
+ movaps XMMWORD[(-168)+r11],xmm0
+ movaps xmm7,XMMWORD[((-152))+r11]
+ movaps XMMWORD[(-152)+r11],xmm0
+ movaps xmm8,XMMWORD[((-136))+r11]
+ movaps XMMWORD[(-136)+r11],xmm0
+ movaps xmm9,XMMWORD[((-120))+r11]
+ movaps XMMWORD[(-120)+r11],xmm0
+ movaps xmm10,XMMWORD[((-104))+r11]
+ movaps XMMWORD[(-104)+r11],xmm0
+ movaps xmm11,XMMWORD[((-88))+r11]
+ movaps XMMWORD[(-88)+r11],xmm0
+ movaps xmm12,XMMWORD[((-72))+r11]
+ movaps XMMWORD[(-72)+r11],xmm0
+ movaps xmm13,XMMWORD[((-56))+r11]
+ movaps XMMWORD[(-56)+r11],xmm0
+ movaps xmm14,XMMWORD[((-40))+r11]
+ movaps XMMWORD[(-40)+r11],xmm0
+ movaps xmm15,XMMWORD[((-24))+r11]
+ movaps XMMWORD[(-24)+r11],xmm0
+ movaps XMMWORD[rsp],xmm0
+ movaps XMMWORD[16+rsp],xmm0
+ movaps XMMWORD[32+rsp],xmm0
+ movaps XMMWORD[48+rsp],xmm0
+ movaps XMMWORD[64+rsp],xmm0
+ movaps XMMWORD[80+rsp],xmm0
+ movaps XMMWORD[96+rsp],xmm0
+ movaps XMMWORD[112+rsp],xmm0
+ mov rbp,QWORD[((-8))+r11]
+
+ lea rsp,[r11]
+
+$L$ctr32_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_aes_hw_ctr32_encrypt_blocks:
+global aes_hw_cbc_encrypt
+
+ALIGN 16
+aes_hw_cbc_encrypt:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes_hw_cbc_encrypt:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+ mov r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+ test rdx,rdx
+ jz NEAR $L$cbc_ret
+
+ mov r10d,DWORD[240+rcx]
+ mov r11,rcx
+ test r9d,r9d
+ jz NEAR $L$cbc_decrypt
+
+ movups xmm2,XMMWORD[r8]
+ mov eax,r10d
+ cmp rdx,16
+ jb NEAR $L$cbc_enc_tail
+ sub rdx,16
+ jmp NEAR $L$cbc_enc_loop
+ALIGN 16
+$L$cbc_enc_loop:
+ movups xmm3,XMMWORD[rdi]
+ lea rdi,[16+rdi]
+
+ movups xmm0,XMMWORD[rcx]
+ movups xmm1,XMMWORD[16+rcx]
+ xorps xmm3,xmm0
+ lea rcx,[32+rcx]
+ xorps xmm2,xmm3
+$L$oop_enc1_6:
+ DB 102,15,56,220,209
+ dec eax
+ movups xmm1,XMMWORD[rcx]
+ lea rcx,[16+rcx]
+ jnz NEAR $L$oop_enc1_6
+ DB 102,15,56,221,209
+ mov eax,r10d
+ mov rcx,r11
+ movups XMMWORD[rsi],xmm2
+ lea rsi,[16+rsi]
+ sub rdx,16
+ jnc NEAR $L$cbc_enc_loop
+ add rdx,16
+ jnz NEAR $L$cbc_enc_tail
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ movups XMMWORD[r8],xmm2
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ jmp NEAR $L$cbc_ret
+
+$L$cbc_enc_tail:
+ mov rcx,rdx
+ xchg rsi,rdi
+ DD 0x9066A4F3
+ mov ecx,16
+ sub rcx,rdx
+ xor eax,eax
+ DD 0x9066AAF3
+ lea rdi,[((-16))+rdi]
+ mov eax,r10d
+ mov rsi,rdi
+ mov rcx,r11
+ xor rdx,rdx
+ jmp NEAR $L$cbc_enc_loop
+
+ALIGN 16
+$L$cbc_decrypt:
+ cmp rdx,16
+ jne NEAR $L$cbc_decrypt_bulk
+
+
+
+ movdqu xmm2,XMMWORD[rdi]
+ movdqu xmm3,XMMWORD[r8]
+ movdqa xmm4,xmm2
+ movups xmm0,XMMWORD[rcx]
+ movups xmm1,XMMWORD[16+rcx]
+ lea rcx,[32+rcx]
+ xorps xmm2,xmm0
+$L$oop_dec1_7:
+ DB 102,15,56,222,209
+ dec r10d
+ movups xmm1,XMMWORD[rcx]
+ lea rcx,[16+rcx]
+ jnz NEAR $L$oop_dec1_7
+ DB 102,15,56,223,209
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ movdqu XMMWORD[r8],xmm4
+ xorps xmm2,xmm3
+ pxor xmm3,xmm3
+ movups XMMWORD[rsi],xmm2
+ pxor xmm2,xmm2
+ jmp NEAR $L$cbc_ret
+ALIGN 16
+$L$cbc_decrypt_bulk:
+ lea r11,[rsp]
+
+ push rbp
+
+ sub rsp,176
+ and rsp,-16
+ movaps XMMWORD[16+rsp],xmm6
+ movaps XMMWORD[32+rsp],xmm7
+ movaps XMMWORD[48+rsp],xmm8
+ movaps XMMWORD[64+rsp],xmm9
+ movaps XMMWORD[80+rsp],xmm10
+ movaps XMMWORD[96+rsp],xmm11
+ movaps XMMWORD[112+rsp],xmm12
+ movaps XMMWORD[128+rsp],xmm13
+ movaps XMMWORD[144+rsp],xmm14
+ movaps XMMWORD[160+rsp],xmm15
+$L$cbc_decrypt_body:
+ mov rbp,rcx
+ movups xmm10,XMMWORD[r8]
+ mov eax,r10d
+ cmp rdx,0x50
+ jbe NEAR $L$cbc_dec_tail
+
+ movups xmm0,XMMWORD[rcx]
+ movdqu xmm2,XMMWORD[rdi]
+ movdqu xmm3,XMMWORD[16+rdi]
+ movdqa xmm11,xmm2
+ movdqu xmm4,XMMWORD[32+rdi]
+ movdqa xmm12,xmm3
+ movdqu xmm5,XMMWORD[48+rdi]
+ movdqa xmm13,xmm4
+ movdqu xmm6,XMMWORD[64+rdi]
+ movdqa xmm14,xmm5
+ movdqu xmm7,XMMWORD[80+rdi]
+ movdqa xmm15,xmm6
+ cmp rdx,0x70
+ jbe NEAR $L$cbc_dec_six_or_seven
+
+ sub rdx,0x70
+ lea rcx,[112+rcx]
+ jmp NEAR $L$cbc_dec_loop8_enter
+ALIGN 16
+$L$cbc_dec_loop8:
+ movups XMMWORD[rsi],xmm9
+ lea rsi,[16+rsi]
+$L$cbc_dec_loop8_enter:
+ movdqu xmm8,XMMWORD[96+rdi]
+ pxor xmm2,xmm0
+ movdqu xmm9,XMMWORD[112+rdi]
+ pxor xmm3,xmm0
+ movups xmm1,XMMWORD[((16-112))+rcx]
+ pxor xmm4,xmm0
+ mov rbp,-1
+ cmp rdx,0x70
+ pxor xmm5,xmm0
+ pxor xmm6,xmm0
+ pxor xmm7,xmm0
+ pxor xmm8,xmm0
+
+ DB 102,15,56,222,209
+ pxor xmm9,xmm0
+ movups xmm0,XMMWORD[((32-112))+rcx]
+ DB 102,15,56,222,217
+ DB 102,15,56,222,225
+ DB 102,15,56,222,233
+ DB 102,15,56,222,241
+ DB 102,15,56,222,249
+ DB 102,68,15,56,222,193
+ adc rbp,0
+ and rbp,128
+ DB 102,68,15,56,222,201
+ add rbp,rdi
+ movups xmm1,XMMWORD[((48-112))+rcx]
+ DB 102,15,56,222,208
+ DB 102,15,56,222,216
+ DB 102,15,56,222,224
+ DB 102,15,56,222,232
+ DB 102,15,56,222,240
+ DB 102,15,56,222,248
+ DB 102,68,15,56,222,192
+ DB 102,68,15,56,222,200
+ movups xmm0,XMMWORD[((64-112))+rcx]
+ nop
+ DB 102,15,56,222,209
+ DB 102,15,56,222,217
+ DB 102,15,56,222,225
+ DB 102,15,56,222,233
+ DB 102,15,56,222,241
+ DB 102,15,56,222,249
+ DB 102,68,15,56,222,193
+ DB 102,68,15,56,222,201
+ movups xmm1,XMMWORD[((80-112))+rcx]
+ nop
+ DB 102,15,56,222,208
+ DB 102,15,56,222,216
+ DB 102,15,56,222,224
+ DB 102,15,56,222,232
+ DB 102,15,56,222,240
+ DB 102,15,56,222,248
+ DB 102,68,15,56,222,192
+ DB 102,68,15,56,222,200
+ movups xmm0,XMMWORD[((96-112))+rcx]
+ nop
+ DB 102,15,56,222,209
+ DB 102,15,56,222,217
+ DB 102,15,56,222,225
+ DB 102,15,56,222,233
+ DB 102,15,56,222,241
+ DB 102,15,56,222,249
+ DB 102,68,15,56,222,193
+ DB 102,68,15,56,222,201
+ movups xmm1,XMMWORD[((112-112))+rcx]
+ nop
+ DB 102,15,56,222,208
+ DB 102,15,56,222,216
+ DB 102,15,56,222,224
+ DB 102,15,56,222,232
+ DB 102,15,56,222,240
+ DB 102,15,56,222,248
+ DB 102,68,15,56,222,192
+ DB 102,68,15,56,222,200
+ movups xmm0,XMMWORD[((128-112))+rcx]
+ nop
+ DB 102,15,56,222,209
+ DB 102,15,56,222,217
+ DB 102,15,56,222,225
+ DB 102,15,56,222,233
+ DB 102,15,56,222,241
+ DB 102,15,56,222,249
+ DB 102,68,15,56,222,193
+ DB 102,68,15,56,222,201
+ movups xmm1,XMMWORD[((144-112))+rcx]
+ cmp eax,11
+ DB 102,15,56,222,208
+ DB 102,15,56,222,216
+ DB 102,15,56,222,224
+ DB 102,15,56,222,232
+ DB 102,15,56,222,240
+ DB 102,15,56,222,248
+ DB 102,68,15,56,222,192
+ DB 102,68,15,56,222,200
+ movups xmm0,XMMWORD[((160-112))+rcx]
+ jb NEAR $L$cbc_dec_done
+ DB 102,15,56,222,209
+ DB 102,15,56,222,217
+ DB 102,15,56,222,225
+ DB 102,15,56,222,233
+ DB 102,15,56,222,241
+ DB 102,15,56,222,249
+ DB 102,68,15,56,222,193
+ DB 102,68,15,56,222,201
+ movups xmm1,XMMWORD[((176-112))+rcx]
+ nop
+ DB 102,15,56,222,208
+ DB 102,15,56,222,216
+ DB 102,15,56,222,224
+ DB 102,15,56,222,232
+ DB 102,15,56,222,240
+ DB 102,15,56,222,248
+ DB 102,68,15,56,222,192
+ DB 102,68,15,56,222,200
+ movups xmm0,XMMWORD[((192-112))+rcx]
+ je NEAR $L$cbc_dec_done
+ DB 102,15,56,222,209
+ DB 102,15,56,222,217
+ DB 102,15,56,222,225
+ DB 102,15,56,222,233
+ DB 102,15,56,222,241
+ DB 102,15,56,222,249
+ DB 102,68,15,56,222,193
+ DB 102,68,15,56,222,201
+ movups xmm1,XMMWORD[((208-112))+rcx]
+ nop
+ DB 102,15,56,222,208
+ DB 102,15,56,222,216
+ DB 102,15,56,222,224
+ DB 102,15,56,222,232
+ DB 102,15,56,222,240
+ DB 102,15,56,222,248
+ DB 102,68,15,56,222,192
+ DB 102,68,15,56,222,200
+ movups xmm0,XMMWORD[((224-112))+rcx]
+ jmp NEAR $L$cbc_dec_done
+ALIGN 16
+$L$cbc_dec_done:
+ DB 102,15,56,222,209
+ DB 102,15,56,222,217
+ pxor xmm10,xmm0
+ pxor xmm11,xmm0
+ DB 102,15,56,222,225
+ DB 102,15,56,222,233
+ pxor xmm12,xmm0
+ pxor xmm13,xmm0
+ DB 102,15,56,222,241
+ DB 102,15,56,222,249
+ pxor xmm14,xmm0
+ pxor xmm15,xmm0
+ DB 102,68,15,56,222,193
+ DB 102,68,15,56,222,201
+ movdqu xmm1,XMMWORD[80+rdi]
+
+ DB 102,65,15,56,223,210
+ movdqu xmm10,XMMWORD[96+rdi]
+ pxor xmm1,xmm0
+ DB 102,65,15,56,223,219
+ pxor xmm10,xmm0
+ movdqu xmm0,XMMWORD[112+rdi]
+ DB 102,65,15,56,223,228
+ lea rdi,[128+rdi]
+ movdqu xmm11,XMMWORD[rbp]
+ DB 102,65,15,56,223,237
+ DB 102,65,15,56,223,246
+ movdqu xmm12,XMMWORD[16+rbp]
+ movdqu xmm13,XMMWORD[32+rbp]
+ DB 102,65,15,56,223,255
+ DB 102,68,15,56,223,193
+ movdqu xmm14,XMMWORD[48+rbp]
+ movdqu xmm15,XMMWORD[64+rbp]
+ DB 102,69,15,56,223,202
+ movdqa xmm10,xmm0
+ movdqu xmm1,XMMWORD[80+rbp]
+ movups xmm0,XMMWORD[((-112))+rcx]
+
+ movups XMMWORD[rsi],xmm2
+ movdqa xmm2,xmm11
+ movups XMMWORD[16+rsi],xmm3
+ movdqa xmm3,xmm12
+ movups XMMWORD[32+rsi],xmm4
+ movdqa xmm4,xmm13
+ movups XMMWORD[48+rsi],xmm5
+ movdqa xmm5,xmm14
+ movups XMMWORD[64+rsi],xmm6
+ movdqa xmm6,xmm15
+ movups XMMWORD[80+rsi],xmm7
+ movdqa xmm7,xmm1
+ movups XMMWORD[96+rsi],xmm8
+ lea rsi,[112+rsi]
+
+ sub rdx,0x80
+ ja NEAR $L$cbc_dec_loop8
+
+ movaps xmm2,xmm9
+ lea rcx,[((-112))+rcx]
+ add rdx,0x70
+ jle NEAR $L$cbc_dec_clear_tail_collected
+ movups XMMWORD[rsi],xmm9
+ lea rsi,[16+rsi]
+ cmp rdx,0x50
+ jbe NEAR $L$cbc_dec_tail
+
+ movaps xmm2,xmm11
+$L$cbc_dec_six_or_seven:
+ cmp rdx,0x60
+ ja NEAR $L$cbc_dec_seven
+
+ movaps xmm8,xmm7
+ call _aesni_decrypt6
+ pxor xmm2,xmm10
+ movaps xmm10,xmm8
+ pxor xmm3,xmm11
+ movdqu XMMWORD[rsi],xmm2
+ pxor xmm4,xmm12
+ movdqu XMMWORD[16+rsi],xmm3
+ pxor xmm3,xmm3
+ pxor xmm5,xmm13
+ movdqu XMMWORD[32+rsi],xmm4
+ pxor xmm4,xmm4
+ pxor xmm6,xmm14
+ movdqu XMMWORD[48+rsi],xmm5
+ pxor xmm5,xmm5
+ pxor xmm7,xmm15
+ movdqu XMMWORD[64+rsi],xmm6
+ pxor xmm6,xmm6
+ lea rsi,[80+rsi]
+ movdqa xmm2,xmm7
+ pxor xmm7,xmm7
+ jmp NEAR $L$cbc_dec_tail_collected
+
+ALIGN 16
+$L$cbc_dec_seven:
+ movups xmm8,XMMWORD[96+rdi]
+ xorps xmm9,xmm9
+ call _aesni_decrypt8
+ movups xmm9,XMMWORD[80+rdi]
+ pxor xmm2,xmm10
+ movups xmm10,XMMWORD[96+rdi]
+ pxor xmm3,xmm11
+ movdqu XMMWORD[rsi],xmm2
+ pxor xmm4,xmm12
+ movdqu XMMWORD[16+rsi],xmm3
+ pxor xmm3,xmm3
+ pxor xmm5,xmm13
+ movdqu XMMWORD[32+rsi],xmm4
+ pxor xmm4,xmm4
+ pxor xmm6,xmm14
+ movdqu XMMWORD[48+rsi],xmm5
+ pxor xmm5,xmm5
+ pxor xmm7,xmm15
+ movdqu XMMWORD[64+rsi],xmm6
+ pxor xmm6,xmm6
+ pxor xmm8,xmm9
+ movdqu XMMWORD[80+rsi],xmm7
+ pxor xmm7,xmm7
+ lea rsi,[96+rsi]
+ movdqa xmm2,xmm8
+ pxor xmm8,xmm8
+ pxor xmm9,xmm9
+ jmp NEAR $L$cbc_dec_tail_collected
+
+$L$cbc_dec_tail:
+ movups xmm2,XMMWORD[rdi]
+ sub rdx,0x10
+ jbe NEAR $L$cbc_dec_one
+
+ movups xmm3,XMMWORD[16+rdi]
+ movaps xmm11,xmm2
+ sub rdx,0x10
+ jbe NEAR $L$cbc_dec_two
+
+ movups xmm4,XMMWORD[32+rdi]
+ movaps xmm12,xmm3
+ sub rdx,0x10
+ jbe NEAR $L$cbc_dec_three
+
+ movups xmm5,XMMWORD[48+rdi]
+ movaps xmm13,xmm4
+ sub rdx,0x10
+ jbe NEAR $L$cbc_dec_four
+
+ movups xmm6,XMMWORD[64+rdi]
+ movaps xmm14,xmm5
+ movaps xmm15,xmm6
+ xorps xmm7,xmm7
+ call _aesni_decrypt6
+ pxor xmm2,xmm10
+ movaps xmm10,xmm15
+ pxor xmm3,xmm11
+ movdqu XMMWORD[rsi],xmm2
+ pxor xmm4,xmm12
+ movdqu XMMWORD[16+rsi],xmm3
+ pxor xmm3,xmm3
+ pxor xmm5,xmm13
+ movdqu XMMWORD[32+rsi],xmm4
+ pxor xmm4,xmm4
+ pxor xmm6,xmm14
+ movdqu XMMWORD[48+rsi],xmm5
+ pxor xmm5,xmm5
+ lea rsi,[64+rsi]
+ movdqa xmm2,xmm6
+ pxor xmm6,xmm6
+ pxor xmm7,xmm7
+ sub rdx,0x10
+ jmp NEAR $L$cbc_dec_tail_collected
+
+ALIGN 16
+$L$cbc_dec_one:
+ movaps xmm11,xmm2
+ movups xmm0,XMMWORD[rcx]
+ movups xmm1,XMMWORD[16+rcx]
+ lea rcx,[32+rcx]
+ xorps xmm2,xmm0
+$L$oop_dec1_8:
+ DB 102,15,56,222,209
+ dec eax
+ movups xmm1,XMMWORD[rcx]
+ lea rcx,[16+rcx]
+ jnz NEAR $L$oop_dec1_8
+ DB 102,15,56,223,209
+ xorps xmm2,xmm10
+ movaps xmm10,xmm11
+ jmp NEAR $L$cbc_dec_tail_collected
+ALIGN 16
+$L$cbc_dec_two:
+ movaps xmm12,xmm3
+ call _aesni_decrypt2
+ pxor xmm2,xmm10
+ movaps xmm10,xmm12
+ pxor xmm3,xmm11
+ movdqu XMMWORD[rsi],xmm2
+ movdqa xmm2,xmm3
+ pxor xmm3,xmm3
+ lea rsi,[16+rsi]
+ jmp NEAR $L$cbc_dec_tail_collected
+ALIGN 16
+$L$cbc_dec_three:
+ movaps xmm13,xmm4
+ call _aesni_decrypt3
+ pxor xmm2,xmm10
+ movaps xmm10,xmm13
+ pxor xmm3,xmm11
+ movdqu XMMWORD[rsi],xmm2
+ pxor xmm4,xmm12
+ movdqu XMMWORD[16+rsi],xmm3
+ pxor xmm3,xmm3
+ movdqa xmm2,xmm4
+ pxor xmm4,xmm4
+ lea rsi,[32+rsi]
+ jmp NEAR $L$cbc_dec_tail_collected
+ALIGN 16
+$L$cbc_dec_four:
+ movaps xmm14,xmm5
+ call _aesni_decrypt4
+ pxor xmm2,xmm10
+ movaps xmm10,xmm14
+ pxor xmm3,xmm11
+ movdqu XMMWORD[rsi],xmm2
+ pxor xmm4,xmm12
+ movdqu XMMWORD[16+rsi],xmm3
+ pxor xmm3,xmm3
+ pxor xmm5,xmm13
+ movdqu XMMWORD[32+rsi],xmm4
+ pxor xmm4,xmm4
+ movdqa xmm2,xmm5
+ pxor xmm5,xmm5
+ lea rsi,[48+rsi]
+ jmp NEAR $L$cbc_dec_tail_collected
+
+ALIGN 16
+$L$cbc_dec_clear_tail_collected:
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+$L$cbc_dec_tail_collected:
+ movups XMMWORD[r8],xmm10
+ and rdx,15
+ jnz NEAR $L$cbc_dec_tail_partial
+ movups XMMWORD[rsi],xmm2
+ pxor xmm2,xmm2
+ jmp NEAR $L$cbc_dec_ret
+ALIGN 16
+$L$cbc_dec_tail_partial:
+ movaps XMMWORD[rsp],xmm2
+ pxor xmm2,xmm2
+ mov rcx,16
+ mov rdi,rsi
+ sub rcx,rdx
+ lea rsi,[rsp]
+ DD 0x9066A4F3
+ movdqa XMMWORD[rsp],xmm2
+
+$L$cbc_dec_ret:
+ xorps xmm0,xmm0
+ pxor xmm1,xmm1
+ movaps xmm6,XMMWORD[16+rsp]
+ movaps XMMWORD[16+rsp],xmm0
+ movaps xmm7,XMMWORD[32+rsp]
+ movaps XMMWORD[32+rsp],xmm0
+ movaps xmm8,XMMWORD[48+rsp]
+ movaps XMMWORD[48+rsp],xmm0
+ movaps xmm9,XMMWORD[64+rsp]
+ movaps XMMWORD[64+rsp],xmm0
+ movaps xmm10,XMMWORD[80+rsp]
+ movaps XMMWORD[80+rsp],xmm0
+ movaps xmm11,XMMWORD[96+rsp]
+ movaps XMMWORD[96+rsp],xmm0
+ movaps xmm12,XMMWORD[112+rsp]
+ movaps XMMWORD[112+rsp],xmm0
+ movaps xmm13,XMMWORD[128+rsp]
+ movaps XMMWORD[128+rsp],xmm0
+ movaps xmm14,XMMWORD[144+rsp]
+ movaps XMMWORD[144+rsp],xmm0
+ movaps xmm15,XMMWORD[160+rsp]
+ movaps XMMWORD[160+rsp],xmm0
+ mov rbp,QWORD[((-8))+r11]
+
+ lea rsp,[r11]
+
+$L$cbc_ret:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_aes_hw_cbc_encrypt:
+global aes_hw_set_decrypt_key
+
+ALIGN 16
+aes_hw_set_decrypt_key:
+
+_CET_ENDBR
+ DB 0x48,0x83,0xEC,0x08
+
+ call __aesni_set_encrypt_key
+ shl edx,4
+ test eax,eax
+ jnz NEAR $L$dec_key_ret
+ lea rcx,[16+rdx*1+r8]
+
+ movups xmm0,XMMWORD[r8]
+ movups xmm1,XMMWORD[rcx]
+ movups XMMWORD[rcx],xmm0
+ movups XMMWORD[r8],xmm1
+ lea r8,[16+r8]
+ lea rcx,[((-16))+rcx]
+
+$L$dec_key_inverse:
+ movups xmm0,XMMWORD[r8]
+ movups xmm1,XMMWORD[rcx]
+ DB 102,15,56,219,192
+ DB 102,15,56,219,201
+ lea r8,[16+r8]
+ lea rcx,[((-16))+rcx]
+ movups XMMWORD[16+rcx],xmm0
+ movups XMMWORD[(-16)+r8],xmm1
+ cmp rcx,r8
+ ja NEAR $L$dec_key_inverse
+
+ movups xmm0,XMMWORD[r8]
+ DB 102,15,56,219,192
+ pxor xmm1,xmm1
+ movups XMMWORD[rcx],xmm0
+ pxor xmm0,xmm0
+$L$dec_key_ret:
+ add rsp,8
+
+ ret
+
+$L$SEH_end_set_decrypt_key:
+
+global aes_hw_set_encrypt_key
+
+ALIGN 16
+aes_hw_set_encrypt_key:
+__aesni_set_encrypt_key:
+
+_CET_ENDBR
+%ifdef BORINGSSL_DISPATCH_TEST
+ mov BYTE[((BORINGSSL_function_hit+3))],1
+%endif
+ DB 0x48,0x83,0xEC,0x08
+
+ mov rax,-1
+ test rcx,rcx
+ jz NEAR $L$enc_key_ret
+ test r8,r8
+ jz NEAR $L$enc_key_ret
+
+ movups xmm0,XMMWORD[rcx]
+ xorps xmm4,xmm4
+ lea r10,[OPENSSL_ia32cap_P]
+ mov r10d,DWORD[4+r10]
+ and r10d,268437504
+ lea rax,[16+r8]
+ cmp edx,256
+ je NEAR $L$14rounds
+ cmp edx,192
+ je NEAR $L$12rounds
+ cmp edx,128
+ jne NEAR $L$bad_keybits
+
+$L$10rounds:
+ mov edx,9
+ cmp r10d,268435456
+ je NEAR $L$10rounds_alt
+
+ movups XMMWORD[r8],xmm0
+ DB 102,15,58,223,200,1
+ call $L$key_expansion_128_cold
+ DB 102,15,58,223,200,2
+ call $L$key_expansion_128
+ DB 102,15,58,223,200,4
+ call $L$key_expansion_128
+ DB 102,15,58,223,200,8
+ call $L$key_expansion_128
+ DB 102,15,58,223,200,16
+ call $L$key_expansion_128
+ DB 102,15,58,223,200,32
+ call $L$key_expansion_128
+ DB 102,15,58,223,200,64
+ call $L$key_expansion_128
+ DB 102,15,58,223,200,128
+ call $L$key_expansion_128
+ DB 102,15,58,223,200,27
+ call $L$key_expansion_128
+ DB 102,15,58,223,200,54
+ call $L$key_expansion_128
+ movups XMMWORD[rax],xmm0
+ mov DWORD[80+rax],edx
+ xor eax,eax
+ jmp NEAR $L$enc_key_ret
+
+ALIGN 16
+$L$10rounds_alt:
+ movdqa xmm5,XMMWORD[$L$key_rotate]
+ mov r10d,8
+ movdqa xmm4,XMMWORD[$L$key_rcon1]
+ movdqa xmm2,xmm0
+ movdqu XMMWORD[r8],xmm0
+ jmp NEAR $L$oop_key128
+
+ALIGN 16
+$L$oop_key128:
+DB 102,15,56,0,197
+ DB 102,15,56,221,196
+ pslld xmm4,1
+ lea rax,[16+rax]
+
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+
+ pxor xmm0,xmm2
+ movdqu XMMWORD[(-16)+rax],xmm0
+ movdqa xmm2,xmm0
+
+ dec r10d
+ jnz NEAR $L$oop_key128
+
+ movdqa xmm4,XMMWORD[$L$key_rcon1b]
+
+DB 102,15,56,0,197
+ DB 102,15,56,221,196
+ pslld xmm4,1
+
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+
+ pxor xmm0,xmm2
+ movdqu XMMWORD[rax],xmm0
+
+ movdqa xmm2,xmm0
+DB 102,15,56,0,197
+ DB 102,15,56,221,196
+
+ movdqa xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm3,xmm2
+ pslldq xmm2,4
+ pxor xmm2,xmm3
+
+ pxor xmm0,xmm2
+ movdqu XMMWORD[16+rax],xmm0
+
+ mov DWORD[96+rax],edx
+ xor eax,eax
+ jmp NEAR $L$enc_key_ret
+
+ALIGN 16
+$L$12rounds:
+ movq xmm2,QWORD[16+rcx]
+ mov edx,11
+ cmp r10d,268435456
+ je NEAR $L$12rounds_alt
+
+ movups XMMWORD[r8],xmm0
+ DB 102,15,58,223,202,1
+ call $L$key_expansion_192a_cold
+ DB 102,15,58,223,202,2
+ call $L$key_expansion_192b
+ DB 102,15,58,223,202,4
+ call $L$key_expansion_192a
+ DB 102,15,58,223,202,8
+ call $L$key_expansion_192b
+ DB 102,15,58,223,202,16
+ call $L$key_expansion_192a
+ DB 102,15,58,223,202,32
+ call $L$key_expansion_192b
+ DB 102,15,58,223,202,64
+ call $L$key_expansion_192a
+ DB 102,15,58,223,202,128
+ call $L$key_expansion_192b
+ movups XMMWORD[rax],xmm0
+ mov DWORD[48+rax],edx
+ xor rax,rax
+ jmp NEAR $L$enc_key_ret
+
+ALIGN 16
+$L$12rounds_alt:
+ movdqa xmm5,XMMWORD[$L$key_rotate192]
+ movdqa xmm4,XMMWORD[$L$key_rcon1]
+ mov r10d,8
+ movdqu XMMWORD[r8],xmm0
+ jmp NEAR $L$oop_key192
+
+ALIGN 16
+$L$oop_key192:
+ movq QWORD[rax],xmm2
+ movdqa xmm1,xmm2
+DB 102,15,56,0,213
+ DB 102,15,56,221,212
+ pslld xmm4,1
+ lea rax,[24+rax]
+
+ movdqa xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm0,xmm3
+
+ pshufd xmm3,xmm0,0xff
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+
+ pxor xmm0,xmm2
+ pxor xmm2,xmm3
+ movdqu XMMWORD[(-16)+rax],xmm0
+
+ dec r10d
+ jnz NEAR $L$oop_key192
+
+ mov DWORD[32+rax],edx
+ xor eax,eax
+ jmp NEAR $L$enc_key_ret
+
+ALIGN 16
+$L$14rounds:
+ movups xmm2,XMMWORD[16+rcx]
+ mov edx,13
+ lea rax,[16+rax]
+ cmp r10d,268435456
+ je NEAR $L$14rounds_alt
+
+ movups XMMWORD[r8],xmm0
+ movups XMMWORD[16+r8],xmm2
+ DB 102,15,58,223,202,1
+ call $L$key_expansion_256a_cold
+ DB 102,15,58,223,200,1
+ call $L$key_expansion_256b
+ DB 102,15,58,223,202,2
+ call $L$key_expansion_256a
+ DB 102,15,58,223,200,2
+ call $L$key_expansion_256b
+ DB 102,15,58,223,202,4
+ call $L$key_expansion_256a
+ DB 102,15,58,223,200,4
+ call $L$key_expansion_256b
+ DB 102,15,58,223,202,8
+ call $L$key_expansion_256a
+ DB 102,15,58,223,200,8
+ call $L$key_expansion_256b
+ DB 102,15,58,223,202,16
+ call $L$key_expansion_256a
+ DB 102,15,58,223,200,16
+ call $L$key_expansion_256b
+ DB 102,15,58,223,202,32
+ call $L$key_expansion_256a
+ DB 102,15,58,223,200,32
+ call $L$key_expansion_256b
+ DB 102,15,58,223,202,64
+ call $L$key_expansion_256a
+ movups XMMWORD[rax],xmm0
+ mov DWORD[16+rax],edx
+ xor rax,rax
+ jmp NEAR $L$enc_key_ret
+
+ALIGN 16
+$L$14rounds_alt:
+ movdqa xmm5,XMMWORD[$L$key_rotate]
+ movdqa xmm4,XMMWORD[$L$key_rcon1]
+ mov r10d,7
+ movdqu XMMWORD[r8],xmm0
+ movdqa xmm1,xmm2
+ movdqu XMMWORD[16+r8],xmm2
+ jmp NEAR $L$oop_key256
+
+ALIGN 16
+$L$oop_key256:
+DB 102,15,56,0,213
+ DB 102,15,56,221,212
+
+ movdqa xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm3,xmm0
+ pslldq xmm0,4
+ pxor xmm0,xmm3
+ pslld xmm4,1
+
+ pxor xmm0,xmm2
+ movdqu XMMWORD[rax],xmm0
+
+ dec r10d
+ jz NEAR $L$done_key256
+
+ pshufd xmm2,xmm0,0xff
+ pxor xmm3,xmm3
+ DB 102,15,56,221,211
+
+ movdqa xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm3,xmm1
+ pslldq xmm1,4
+ pxor xmm1,xmm3
+
+ pxor xmm2,xmm1
+ movdqu XMMWORD[16+rax],xmm2
+ lea rax,[32+rax]
+ movdqa xmm1,xmm2
+
+ jmp NEAR $L$oop_key256
+
+$L$done_key256:
+ mov DWORD[16+rax],edx
+ xor eax,eax
+ jmp NEAR $L$enc_key_ret
+
+ALIGN 16
+$L$bad_keybits:
+ mov rax,-2
+$L$enc_key_ret:
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ add rsp,8
+
+ ret
+
+$L$SEH_end_set_encrypt_key:
+
+ALIGN 16
+$L$key_expansion_128:
+ movups XMMWORD[rax],xmm0
+ lea rax,[16+rax]
+$L$key_expansion_128_cold:
+ shufps xmm4,xmm0,16
+ xorps xmm0,xmm4
+ shufps xmm4,xmm0,140
+ xorps xmm0,xmm4
+ shufps xmm1,xmm1,255
+ xorps xmm0,xmm1
+ ret
+
+ALIGN 16
+$L$key_expansion_192a:
+ movups XMMWORD[rax],xmm0
+ lea rax,[16+rax]
+$L$key_expansion_192a_cold:
+ movaps xmm5,xmm2
+$L$key_expansion_192b_warm:
+ shufps xmm4,xmm0,16
+ movdqa xmm3,xmm2
+ xorps xmm0,xmm4
+ shufps xmm4,xmm0,140
+ pslldq xmm3,4
+ xorps xmm0,xmm4
+ pshufd xmm1,xmm1,85
+ pxor xmm2,xmm3
+ pxor xmm0,xmm1
+ pshufd xmm3,xmm0,255
+ pxor xmm2,xmm3
+ ret
+
+ALIGN 16
+$L$key_expansion_192b:
+ movaps xmm3,xmm0
+ shufps xmm5,xmm0,68
+ movups XMMWORD[rax],xmm5
+ shufps xmm3,xmm2,78
+ movups XMMWORD[16+rax],xmm3
+ lea rax,[32+rax]
+ jmp NEAR $L$key_expansion_192b_warm
+
+ALIGN 16
+$L$key_expansion_256a:
+ movups XMMWORD[rax],xmm2
+ lea rax,[16+rax]
+$L$key_expansion_256a_cold:
+ shufps xmm4,xmm0,16
+ xorps xmm0,xmm4
+ shufps xmm4,xmm0,140
+ xorps xmm0,xmm4
+ shufps xmm1,xmm1,255
+ xorps xmm0,xmm1
+ ret
+
+ALIGN 16
+$L$key_expansion_256b:
+ movups XMMWORD[rax],xmm0
+ lea rax,[16+rax]
+
+ shufps xmm4,xmm2,16
+ xorps xmm2,xmm4
+ shufps xmm4,xmm2,140
+ xorps xmm2,xmm4
+ shufps xmm1,xmm1,170
+ xorps xmm2,xmm1
+ ret
+
+
+section .rdata rdata align=8
+ALIGN 64
+$L$bswap_mask:
+ DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+$L$increment32:
+ DD 6,6,6,0
+$L$increment64:
+ DD 1,0,0,0
+$L$xts_magic:
+ DD 0x87,0,1,0
+$L$increment1:
+ DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+$L$key_rotate:
+ DD 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+$L$key_rotate192:
+ DD 0x04070605,0x04070605,0x04070605,0x04070605
+$L$key_rcon1:
+ DD 1,1,1,1
+$L$key_rcon1b:
+ DD 0x1b,0x1b,0x1b,0x1b
+
+ DB 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
+ DB 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
+ DB 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
+ DB 115,108,46,111,114,103,62,0
+ALIGN 64
+section .text
+
+EXTERN __imp_RtlVirtualUnwind
+
+ALIGN 16
+ecb_ccm64_se_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ mov rsi,QWORD[8+r9]
+ mov r11,QWORD[56+r9]
+
+ mov r10d,DWORD[r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$common_seh_tail
+
+ mov rax,QWORD[152+r8]
+
+ mov r10d,DWORD[4+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$common_seh_tail
+
+ lea rsi,[rax]
+ lea rdi,[512+r8]
+ mov ecx,8
+ DD 0xa548f3fc
+ lea rax,[88+rax]
+
+ jmp NEAR $L$common_seh_tail
+
+
+
+ALIGN 16
+ctr_xts_se_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ mov rsi,QWORD[8+r9]
+ mov r11,QWORD[56+r9]
+
+ mov r10d,DWORD[r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$common_seh_tail
+
+ mov rax,QWORD[152+r8]
+
+ mov r10d,DWORD[4+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$common_seh_tail
+
+ mov rax,QWORD[208+r8]
+
+ lea rsi,[((-168))+rax]
+ lea rdi,[512+r8]
+ mov ecx,20
+ DD 0xa548f3fc
+
+ mov rbp,QWORD[((-8))+rax]
+ mov QWORD[160+r8],rbp
+ jmp NEAR $L$common_seh_tail
+
+
+
+ALIGN 16
+cbc_se_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[152+r8]
+ mov rbx,QWORD[248+r8]
+
+ lea r10,[$L$cbc_decrypt_bulk]
+ cmp rbx,r10
+ jb NEAR $L$common_seh_tail
+
+ mov rax,QWORD[120+r8]
+
+ lea r10,[$L$cbc_decrypt_body]
+ cmp rbx,r10
+ jb NEAR $L$common_seh_tail
+
+ mov rax,QWORD[152+r8]
+
+ lea r10,[$L$cbc_ret]
+ cmp rbx,r10
+ jae NEAR $L$common_seh_tail
+
+ lea rsi,[16+rax]
+ lea rdi,[512+r8]
+ mov ecx,20
+ DD 0xa548f3fc
+
+ mov rax,QWORD[208+r8]
+
+ mov rbp,QWORD[((-8))+rax]
+ mov QWORD[160+r8],rbp
+
+$L$common_seh_tail:
+ mov rdi,QWORD[8+rax]
+ mov rsi,QWORD[16+rax]
+ mov QWORD[152+r8],rax
+ mov QWORD[168+r8],rsi
+ mov QWORD[176+r8],rdi
+
+ mov rdi,QWORD[40+r9]
+ mov rsi,r8
+ mov ecx,154
+ DD 0xa548f3fc
+
+ mov rsi,r9
+ xor rcx,rcx
+ mov rdx,QWORD[8+rsi]
+ mov r8,QWORD[rsi]
+ mov r9,QWORD[16+rsi]
+ mov r10,QWORD[40+rsi]
+ lea r11,[56+rsi]
+ lea r12,[24+rsi]
+ mov QWORD[32+rsp],r10
+ mov QWORD[40+rsp],r11
+ mov QWORD[48+rsp],r12
+ mov QWORD[56+rsp],rcx
+ call QWORD[__imp_RtlVirtualUnwind]
+
+ mov eax,1
+ add rsp,64
+ popfq
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ ret
+
+
+section .pdata rdata align=4
+ALIGN 4
+ DD $L$SEH_begin_aes_hw_ecb_encrypt wrt ..imagebase
+ DD $L$SEH_end_aes_hw_ecb_encrypt wrt ..imagebase
+ DD $L$SEH_info_ecb wrt ..imagebase
+
+ DD $L$SEH_begin_aes_hw_ctr32_encrypt_blocks wrt ..imagebase
+ DD $L$SEH_end_aes_hw_ctr32_encrypt_blocks wrt ..imagebase
+ DD $L$SEH_info_ctr32 wrt ..imagebase
+ DD $L$SEH_begin_aes_hw_cbc_encrypt wrt ..imagebase
+ DD $L$SEH_end_aes_hw_cbc_encrypt wrt ..imagebase
+ DD $L$SEH_info_cbc wrt ..imagebase
+
+ DD aes_hw_set_decrypt_key wrt ..imagebase
+ DD $L$SEH_end_set_decrypt_key wrt ..imagebase
+ DD $L$SEH_info_key wrt ..imagebase
+
+ DD aes_hw_set_encrypt_key wrt ..imagebase
+ DD $L$SEH_end_set_encrypt_key wrt ..imagebase
+ DD $L$SEH_info_key wrt ..imagebase
+section .xdata rdata align=8
+ALIGN 8
+$L$SEH_info_ecb:
+ DB 9,0,0,0
+ DD ecb_ccm64_se_handler wrt ..imagebase
+ DD $L$ecb_enc_body wrt ..imagebase,$L$ecb_enc_ret wrt ..imagebase
+$L$SEH_info_ctr32:
+ DB 9,0,0,0
+ DD ctr_xts_se_handler wrt ..imagebase
+ DD $L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase
+$L$SEH_info_cbc:
+ DB 9,0,0,0
+ DD cbc_se_handler wrt ..imagebase
+$L$SEH_info_key:
+ DB 0x01,0x04,0x01,0x00
+ DB 0x04,0x02,0x00,0x00
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/aesv8-armv7-linux.S b/gen/bcm/aesv8-armv7-linux.S
new file mode 100644
index 0000000..420af9b
--- /dev/null
+++ b/gen/bcm/aesv8-armv7-linux.S
@@ -0,0 +1,789 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.arch armv7-a @ don't confuse not-so-latest binutils with argv8 :-)
+.fpu neon
+.code 32
+#undef __thumb2__
+.align 5
+.Lrcon:
+.long 0x01,0x01,0x01,0x01
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat
+.long 0x1b,0x1b,0x1b,0x1b
+
+.text
+
+.globl aes_hw_set_encrypt_key
+.hidden aes_hw_set_encrypt_key
+.type aes_hw_set_encrypt_key,%function
+.align 5
+aes_hw_set_encrypt_key:
+.Lenc_key:
+ mov r3,#-1
+ cmp r0,#0
+ beq .Lenc_key_abort
+ cmp r2,#0
+ beq .Lenc_key_abort
+ mov r3,#-2
+ cmp r1,#128
+ blt .Lenc_key_abort
+ cmp r1,#256
+ bgt .Lenc_key_abort
+ tst r1,#0x3f
+ bne .Lenc_key_abort
+
+ adr r3,.Lrcon
+ cmp r1,#192
+
+ veor q0,q0,q0
+ vld1.8 {q3},[r0]!
+ mov r1,#8 @ reuse r1
+ vld1.32 {q1,q2},[r3]!
+
+ blt .Loop128
+ beq .L192
+ b .L256
+
+.align 4
+.Loop128:
+ vtbl.8 d20,{q3},d4
+ vtbl.8 d21,{q3},d5
+ vext.8 q9,q0,q3,#12
+ vst1.32 {q3},[r2]!
+.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+ subs r1,r1,#1
+
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q10,q10,q1
+ veor q3,q3,q9
+ vshl.u8 q1,q1,#1
+ veor q3,q3,q10
+ bne .Loop128
+
+ vld1.32 {q1},[r3]
+
+ vtbl.8 d20,{q3},d4
+ vtbl.8 d21,{q3},d5
+ vext.8 q9,q0,q3,#12
+ vst1.32 {q3},[r2]!
+.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q10,q10,q1
+ veor q3,q3,q9
+ vshl.u8 q1,q1,#1
+ veor q3,q3,q10
+
+ vtbl.8 d20,{q3},d4
+ vtbl.8 d21,{q3},d5
+ vext.8 q9,q0,q3,#12
+ vst1.32 {q3},[r2]!
+.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q10,q10,q1
+ veor q3,q3,q9
+ veor q3,q3,q10
+ vst1.32 {q3},[r2]
+ add r2,r2,#0x50
+
+ mov r12,#10
+ b .Ldone
+
+.align 4
+.L192:
+ vld1.8 {d16},[r0]!
+ vmov.i8 q10,#8 @ borrow q10
+ vst1.32 {q3},[r2]!
+ vsub.i8 q2,q2,q10 @ adjust the mask
+
+.Loop192:
+ vtbl.8 d20,{q8},d4
+ vtbl.8 d21,{q8},d5
+ vext.8 q9,q0,q3,#12
+ vst1.32 {d16},[r2]!
+.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+ subs r1,r1,#1
+
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+
+ vdup.32 q9,d7[1]
+ veor q9,q9,q8
+ veor q10,q10,q1
+ vext.8 q8,q0,q8,#12
+ vshl.u8 q1,q1,#1
+ veor q8,q8,q9
+ veor q3,q3,q10
+ veor q8,q8,q10
+ vst1.32 {q3},[r2]!
+ bne .Loop192
+
+ mov r12,#12
+ add r2,r2,#0x20
+ b .Ldone
+
+.align 4
+.L256:
+ vld1.8 {q8},[r0]
+ mov r1,#7
+ mov r12,#14
+ vst1.32 {q3},[r2]!
+
+.Loop256:
+ vtbl.8 d20,{q8},d4
+ vtbl.8 d21,{q8},d5
+ vext.8 q9,q0,q3,#12
+ vst1.32 {q8},[r2]!
+.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+ subs r1,r1,#1
+
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q10,q10,q1
+ veor q3,q3,q9
+ vshl.u8 q1,q1,#1
+ veor q3,q3,q10
+ vst1.32 {q3},[r2]!
+ beq .Ldone
+
+ vdup.32 q10,d7[1]
+ vext.8 q9,q0,q8,#12
+.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+
+ veor q8,q8,q9
+ vext.8 q9,q0,q9,#12
+ veor q8,q8,q9
+ vext.8 q9,q0,q9,#12
+ veor q8,q8,q9
+
+ veor q8,q8,q10
+ b .Loop256
+
+.Ldone:
+ str r12,[r2]
+ mov r3,#0
+
+.Lenc_key_abort:
+ mov r0,r3 @ return value
+
+ bx lr
+.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
+
+.globl aes_hw_set_decrypt_key
+.hidden aes_hw_set_decrypt_key
+.type aes_hw_set_decrypt_key,%function
+.align 5
+aes_hw_set_decrypt_key:
+ stmdb sp!,{r4,lr}
+ bl .Lenc_key
+
+ cmp r0,#0
+ bne .Ldec_key_abort
+
+ sub r2,r2,#240 @ restore original r2
+ mov r4,#-16
+ add r0,r2,r12,lsl#4 @ end of key schedule
+
+ vld1.32 {q0},[r2]
+ vld1.32 {q1},[r0]
+ vst1.32 {q0},[r0],r4
+ vst1.32 {q1},[r2]!
+
+.Loop_imc:
+ vld1.32 {q0},[r2]
+ vld1.32 {q1},[r0]
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ vst1.32 {q0},[r0],r4
+ vst1.32 {q1},[r2]!
+ cmp r0,r2
+ bhi .Loop_imc
+
+ vld1.32 {q0},[r2]
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ vst1.32 {q0},[r0]
+
+ eor r0,r0,r0 @ return value
+.Ldec_key_abort:
+ ldmia sp!,{r4,pc}
+.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
+.globl aes_hw_encrypt
+.hidden aes_hw_encrypt
+.type aes_hw_encrypt,%function
+.align 5
+aes_hw_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ ldr r3,[r2,#240]
+ vld1.32 {q0},[r2]!
+ vld1.8 {q2},[r0]
+ sub r3,r3,#2
+ vld1.32 {q1},[r2]!
+
+.Loop_enc:
+.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
+.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
+ vld1.32 {q0},[r2]!
+ subs r3,r3,#2
+.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
+.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
+ vld1.32 {q1},[r2]!
+ bgt .Loop_enc
+
+.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
+.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
+ vld1.32 {q0},[r2]
+.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
+ veor q2,q2,q0
+
+ vst1.8 {q2},[r1]
+ bx lr
+.size aes_hw_encrypt,.-aes_hw_encrypt
+.globl aes_hw_decrypt
+.hidden aes_hw_decrypt
+.type aes_hw_decrypt,%function
+.align 5
+aes_hw_decrypt:
+ AARCH64_VALID_CALL_TARGET
+ ldr r3,[r2,#240]
+ vld1.32 {q0},[r2]!
+ vld1.8 {q2},[r0]
+ sub r3,r3,#2
+ vld1.32 {q1},[r2]!
+
+.Loop_dec:
+.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
+.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
+ vld1.32 {q0},[r2]!
+ subs r3,r3,#2
+.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
+.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
+ vld1.32 {q1},[r2]!
+ bgt .Loop_dec
+
+.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
+.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
+ vld1.32 {q0},[r2]
+.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
+ veor q2,q2,q0
+
+ vst1.8 {q2},[r1]
+ bx lr
+.size aes_hw_decrypt,.-aes_hw_decrypt
+.globl aes_hw_cbc_encrypt
+.hidden aes_hw_cbc_encrypt
+.type aes_hw_cbc_encrypt,%function
+.align 5
+aes_hw_cbc_encrypt:
+ mov ip,sp
+ stmdb sp!,{r4,r5,r6,r7,r8,lr}
+ vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
+ ldmia ip,{r4,r5} @ load remaining args
+ subs r2,r2,#16
+ mov r8,#16
+ blo .Lcbc_abort
+ moveq r8,#0
+
+ cmp r5,#0 @ en- or decrypting?
+ ldr r5,[r3,#240]
+ and r2,r2,#-16
+ vld1.8 {q6},[r4]
+ vld1.8 {q0},[r0],r8
+
+ vld1.32 {q8,q9},[r3] @ load key schedule...
+ sub r5,r5,#6
+ add r7,r3,r5,lsl#4 @ pointer to last 7 round keys
+ sub r5,r5,#2
+ vld1.32 {q10,q11},[r7]!
+ vld1.32 {q12,q13},[r7]!
+ vld1.32 {q14,q15},[r7]!
+ vld1.32 {q7},[r7]
+
+ add r7,r3,#32
+ mov r6,r5
+ beq .Lcbc_dec
+
+ cmp r5,#2
+ veor q0,q0,q6
+ veor q5,q8,q7
+ beq .Lcbc_enc128
+
+ vld1.32 {q2,q3},[r7]
+ add r7,r3,#16
+ add r6,r3,#16*4
+ add r12,r3,#16*5
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ add r14,r3,#16*6
+ add r3,r3,#16*7
+ b .Lenter_cbc_enc
+
+.align 4
+.Loop_cbc_enc:
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vst1.8 {q6},[r1]!
+.Lenter_cbc_enc:
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q8},[r6]
+ cmp r5,#4
+.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q9},[r12]
+ beq .Lcbc_enc192
+
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q8},[r14]
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q9},[r3]
+ nop
+
+.Lcbc_enc192:
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ subs r2,r2,#16
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ moveq r8,#0
+.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.8 {q8},[r0],r8
+.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ veor q8,q8,q5
+.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q9},[r7] @ re-pre-load rndkey[1]
+.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
+ veor q6,q0,q7
+ bhs .Loop_cbc_enc
+
+ vst1.8 {q6},[r1]!
+ b .Lcbc_done
+
+.align 5
+.Lcbc_enc128:
+ vld1.32 {q2,q3},[r7]
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ b .Lenter_cbc_enc128
+.Loop_cbc_enc128:
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vst1.8 {q6},[r1]!
+.Lenter_cbc_enc128:
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ subs r2,r2,#16
+.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ moveq r8,#0
+.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.8 {q8},[r0],r8
+.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ veor q8,q8,q5
+.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
+ veor q6,q0,q7
+ bhs .Loop_cbc_enc128
+
+ vst1.8 {q6},[r1]!
+ b .Lcbc_done
+.align 5
+.Lcbc_dec:
+ vld1.8 {q10},[r0]!
+ subs r2,r2,#32 @ bias
+ add r6,r5,#2
+ vorr q3,q0,q0
+ vorr q1,q0,q0
+ vorr q11,q10,q10
+ blo .Lcbc_dec_tail
+
+ vorr q1,q10,q10
+ vld1.8 {q10},[r0]!
+ vorr q2,q0,q0
+ vorr q3,q1,q1
+ vorr q11,q10,q10
+
+.Loop3x_cbc_dec:
+.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.32 {q8},[r7]!
+ subs r6,r6,#2
+.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.32 {q9},[r7]!
+ bgt .Loop3x_cbc_dec
+
+.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ veor q4,q6,q7
+ subs r2,r2,#0x30
+ veor q5,q2,q7
+ movlo r6,r2 @ r6, r6, is zero at this point
+.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ veor q9,q3,q7
+ add r0,r0,r6 @ r0 is adjusted in such way that
+ @ at exit from the loop q1-q10
+ @ are loaded with last "words"
+ vorr q6,q11,q11
+ mov r7,r3
+.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.8 {q2},[r0]!
+.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.8 {q3},[r0]!
+.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.8 {q11},[r0]!
+.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15
+.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
+.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15
+ vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
+ add r6,r5,#2
+ veor q4,q4,q0
+ veor q5,q5,q1
+ veor q10,q10,q9
+ vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
+ vst1.8 {q4},[r1]!
+ vorr q0,q2,q2
+ vst1.8 {q5},[r1]!
+ vorr q1,q3,q3
+ vst1.8 {q10},[r1]!
+ vorr q10,q11,q11
+ bhs .Loop3x_cbc_dec
+
+ cmn r2,#0x30
+ beq .Lcbc_done
+ nop
+
+.Lcbc_dec_tail:
+.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.32 {q8},[r7]!
+ subs r6,r6,#2
+.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.32 {q9},[r7]!
+ bgt .Lcbc_dec_tail
+
+.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ cmn r2,#0x20
+.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ veor q5,q6,q7
+.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ veor q9,q3,q7
+.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
+.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15
+ beq .Lcbc_dec_one
+ veor q5,q5,q1
+ veor q9,q9,q10
+ vorr q6,q11,q11
+ vst1.8 {q5},[r1]!
+ vst1.8 {q9},[r1]!
+ b .Lcbc_done
+
+.Lcbc_dec_one:
+ veor q5,q5,q10
+ vorr q6,q11,q11
+ vst1.8 {q5},[r1]!
+
+.Lcbc_done:
+ vst1.8 {q6},[r4]
+.Lcbc_abort:
+ vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+ ldmia sp!,{r4,r5,r6,r7,r8,pc}
+.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
+.globl aes_hw_ctr32_encrypt_blocks
+.hidden aes_hw_ctr32_encrypt_blocks
+.type aes_hw_ctr32_encrypt_blocks,%function
+.align 5
+aes_hw_ctr32_encrypt_blocks:
+ mov ip,sp
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
+ vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
+ ldr r4, [ip] @ load remaining arg
+ ldr r5,[r3,#240]
+
+ ldr r8, [r4, #12]
+ vld1.32 {q0},[r4]
+
+ vld1.32 {q8,q9},[r3] @ load key schedule...
+ sub r5,r5,#4
+ mov r12,#16
+ cmp r2,#2
+ add r7,r3,r5,lsl#4 @ pointer to last 5 round keys
+ sub r5,r5,#2
+ vld1.32 {q12,q13},[r7]!
+ vld1.32 {q14,q15},[r7]!
+ vld1.32 {q7},[r7]
+ add r7,r3,#32
+ mov r6,r5
+ movlo r12,#0
+
+ @ ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
+ @ affected by silicon errata #1742098 [0] and #1655431 [1],
+ @ respectively, where the second instruction of an aese/aesmc
+ @ instruction pair may execute twice if an interrupt is taken right
+ @ after the first instruction consumes an input register of which a
+ @ single 32-bit lane has been updated the last time it was modified.
+ @
+	@ This function uses a counter in one 32-bit lane. The vmov lines
+	@ could write to q1 and q10 directly, but that trips these bugs.
+ @ We write to q6 and copy to the final register as a workaround.
+ @
+ @ [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
+ @ [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
+#ifndef __ARMEB__
+ rev r8, r8
+#endif
+ add r10, r8, #1
+ vorr q6,q0,q0
+ rev r10, r10
+ vmov.32 d13[1],r10
+ add r8, r8, #2
+ vorr q1,q6,q6
+ bls .Lctr32_tail
+ rev r12, r8
+ vmov.32 d13[1],r12
+ sub r2,r2,#3 @ bias
+ vorr q10,q6,q6
+ b .Loop3x_ctr32
+
+.align 4
+.Loop3x_ctr32:
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
+.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
+ vld1.32 {q8},[r7]!
+ subs r6,r6,#2
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
+.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
+ vld1.32 {q9},[r7]!
+ bgt .Loop3x_ctr32
+
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0
+.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
+.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1
+ vld1.8 {q2},[r0]!
+ add r9,r8,#1
+.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
+.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
+ vld1.8 {q3},[r0]!
+ rev r9,r9
+.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9
+.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
+.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9
+.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ vld1.8 {q11},[r0]!
+ mov r7,r3
+.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
+.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10
+.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12
+.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
+.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12
+.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ veor q2,q2,q7
+ add r10,r8,#2
+.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12
+.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
+ veor q3,q3,q7
+ add r8,r8,#3
+.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13
+.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
+.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13
+.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+	@ Note the logic to update q0, q1, and q10 is written to work
+ @ around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
+ @ 32-bit mode. See the comment above.
+ veor q11,q11,q7
+ vmov.32 d13[1], r9
+.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13
+.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
+ vorr q0,q6,q6
+ rev r10,r10
+.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14
+.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
+ vmov.32 d13[1], r10
+ rev r12,r8
+.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14
+.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ vorr q1,q6,q6
+ vmov.32 d13[1], r12
+.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14
+.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
+ vorr q10,q6,q6
+ subs r2,r2,#3
+.byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15
+.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15
+.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15
+
+ veor q2,q2,q4
+ vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
+ vst1.8 {q2},[r1]!
+ veor q3,q3,q5
+ mov r6,r5
+ vst1.8 {q3},[r1]!
+ veor q11,q11,q9
+ vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
+ vst1.8 {q11},[r1]!
+ bhs .Loop3x_ctr32
+
+ adds r2,r2,#3
+ beq .Lctr32_done
+ cmp r2,#1
+ mov r12,#16
+ moveq r12,#0
+
+.Lctr32_tail:
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ vld1.32 {q8},[r7]!
+ subs r6,r6,#2
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ vld1.32 {q9},[r7]!
+ bgt .Lctr32_tail
+
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ vld1.8 {q2},[r0],r12
+.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ vld1.8 {q3},[r0]
+.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ veor q2,q2,q7
+.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ veor q3,q3,q7
+.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
+.byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15
+
+ cmp r2,#1
+ veor q2,q2,q0
+ veor q3,q3,q1
+ vst1.8 {q2},[r1]!
+ beq .Lctr32_done
+ vst1.8 {q3},[r1]
+
+.Lctr32_done:
+ vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
+.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
+#endif
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/bcm/aesv8-armv8-apple.S b/gen/bcm/aesv8-armv8-apple.S
new file mode 100644
index 0000000..144c4af
--- /dev/null
+++ b/gen/bcm/aesv8-armv8-apple.S
@@ -0,0 +1,791 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+
+.section __TEXT,__const
+.align 5
+Lrcon:
+.long 0x01,0x01,0x01,0x01
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
+.long 0x1b,0x1b,0x1b,0x1b
+
+.text
+
+.globl _aes_hw_set_encrypt_key
+.private_extern _aes_hw_set_encrypt_key
+
+.align 5
+_aes_hw_set_encrypt_key:
+Lenc_key:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ mov x3,#-1
+ cmp x0,#0
+ b.eq Lenc_key_abort
+ cmp x2,#0
+ b.eq Lenc_key_abort
+ mov x3,#-2
+ cmp w1,#128
+ b.lt Lenc_key_abort
+ cmp w1,#256
+ b.gt Lenc_key_abort
+ tst w1,#0x3f
+ b.ne Lenc_key_abort
+
+ adrp x3,Lrcon@PAGE
+ add x3,x3,Lrcon@PAGEOFF
+ cmp w1,#192
+
+ eor v0.16b,v0.16b,v0.16b
+ ld1 {v3.16b},[x0],#16
+ mov w1,#8 // reuse w1
+ ld1 {v1.4s,v2.4s},[x3],#32
+
+ b.lt Loop128
+ b.eq L192
+ b L256
+
+.align 4
+Loop128:
+ tbl v6.16b,{v3.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v3.4s},[x2],#16
+ aese v6.16b,v0.16b
+ subs w1,w1,#1
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ shl v1.16b,v1.16b,#1
+ eor v3.16b,v3.16b,v6.16b
+ b.ne Loop128
+
+ ld1 {v1.4s},[x3]
+
+ tbl v6.16b,{v3.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v3.4s},[x2],#16
+ aese v6.16b,v0.16b
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ shl v1.16b,v1.16b,#1
+ eor v3.16b,v3.16b,v6.16b
+
+ tbl v6.16b,{v3.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v3.4s},[x2],#16
+ aese v6.16b,v0.16b
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ eor v3.16b,v3.16b,v6.16b
+ st1 {v3.4s},[x2]
+ add x2,x2,#0x50
+
+ mov w12,#10
+ b Ldone
+
+.align 4
+L192:
+ ld1 {v4.8b},[x0],#8
+ movi v6.16b,#8 // borrow v6.16b
+ st1 {v3.4s},[x2],#16
+ sub v2.16b,v2.16b,v6.16b // adjust the mask
+
+Loop192:
+ tbl v6.16b,{v4.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v4.8b},[x2],#8
+ aese v6.16b,v0.16b
+ subs w1,w1,#1
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+
+ dup v5.4s,v3.s[3]
+ eor v5.16b,v5.16b,v4.16b
+ eor v6.16b,v6.16b,v1.16b
+ ext v4.16b,v0.16b,v4.16b,#12
+ shl v1.16b,v1.16b,#1
+ eor v4.16b,v4.16b,v5.16b
+ eor v3.16b,v3.16b,v6.16b
+ eor v4.16b,v4.16b,v6.16b
+ st1 {v3.4s},[x2],#16
+ b.ne Loop192
+
+ mov w12,#12
+ add x2,x2,#0x20
+ b Ldone
+
+.align 4
+L256:
+ ld1 {v4.16b},[x0]
+ mov w1,#7
+ mov w12,#14
+ st1 {v3.4s},[x2],#16
+
+Loop256:
+ tbl v6.16b,{v4.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v4.4s},[x2],#16
+ aese v6.16b,v0.16b
+ subs w1,w1,#1
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ shl v1.16b,v1.16b,#1
+ eor v3.16b,v3.16b,v6.16b
+ st1 {v3.4s},[x2],#16
+ b.eq Ldone
+
+ dup v6.4s,v3.s[3] // just splat
+ ext v5.16b,v0.16b,v4.16b,#12
+ aese v6.16b,v0.16b
+
+ eor v4.16b,v4.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v4.16b,v4.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v4.16b,v4.16b,v5.16b
+
+ eor v4.16b,v4.16b,v6.16b
+ b Loop256
+
+Ldone:
+ str w12,[x2]
+ mov x3,#0
+
+Lenc_key_abort:
+ mov x0,x3 // return value
+ ldr x29,[sp],#16
+ ret
+
+
+.globl _aes_hw_set_decrypt_key
+.private_extern _aes_hw_set_decrypt_key
+
+.align 5
+_aes_hw_set_decrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ bl Lenc_key
+
+ cmp x0,#0
+ b.ne Ldec_key_abort
+
+ sub x2,x2,#240 // restore original x2
+ mov x4,#-16
+ add x0,x2,x12,lsl#4 // end of key schedule
+
+ ld1 {v0.4s},[x2]
+ ld1 {v1.4s},[x0]
+ st1 {v0.4s},[x0],x4
+ st1 {v1.4s},[x2],#16
+
+Loop_imc:
+ ld1 {v0.4s},[x2]
+ ld1 {v1.4s},[x0]
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ st1 {v0.4s},[x0],x4
+ st1 {v1.4s},[x2],#16
+ cmp x0,x2
+ b.hi Loop_imc
+
+ ld1 {v0.4s},[x2]
+ aesimc v0.16b,v0.16b
+ st1 {v0.4s},[x0]
+
+ eor x0,x0,x0 // return value
+Ldec_key_abort:
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.globl _aes_hw_encrypt
+.private_extern _aes_hw_encrypt
+
+.align 5
+_aes_hw_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ ldr w3,[x2,#240]
+ ld1 {v0.4s},[x2],#16
+ ld1 {v2.16b},[x0]
+ sub w3,w3,#2
+ ld1 {v1.4s},[x2],#16
+
+Loop_enc:
+ aese v2.16b,v0.16b
+ aesmc v2.16b,v2.16b
+ ld1 {v0.4s},[x2],#16
+ subs w3,w3,#2
+ aese v2.16b,v1.16b
+ aesmc v2.16b,v2.16b
+ ld1 {v1.4s},[x2],#16
+ b.gt Loop_enc
+
+ aese v2.16b,v0.16b
+ aesmc v2.16b,v2.16b
+ ld1 {v0.4s},[x2]
+ aese v2.16b,v1.16b
+ eor v2.16b,v2.16b,v0.16b
+
+ st1 {v2.16b},[x1]
+ ret
+
+.globl _aes_hw_decrypt
+.private_extern _aes_hw_decrypt
+
+.align 5
+_aes_hw_decrypt:
+ AARCH64_VALID_CALL_TARGET
+ ldr w3,[x2,#240]
+ ld1 {v0.4s},[x2],#16
+ ld1 {v2.16b},[x0]
+ sub w3,w3,#2
+ ld1 {v1.4s},[x2],#16
+
+Loop_dec:
+ aesd v2.16b,v0.16b
+ aesimc v2.16b,v2.16b
+ ld1 {v0.4s},[x2],#16
+ subs w3,w3,#2
+ aesd v2.16b,v1.16b
+ aesimc v2.16b,v2.16b
+ ld1 {v1.4s},[x2],#16
+ b.gt Loop_dec
+
+ aesd v2.16b,v0.16b
+ aesimc v2.16b,v2.16b
+ ld1 {v0.4s},[x2]
+ aesd v2.16b,v1.16b
+ eor v2.16b,v2.16b,v0.16b
+
+ st1 {v2.16b},[x1]
+ ret
+
+.globl _aes_hw_cbc_encrypt
+.private_extern _aes_hw_cbc_encrypt
+
+.align 5
+_aes_hw_cbc_encrypt:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ subs x2,x2,#16
+ mov x8,#16
+ b.lo Lcbc_abort
+ csel x8,xzr,x8,eq
+
+ cmp w5,#0 // en- or decrypting?
+ ldr w5,[x3,#240]
+ and x2,x2,#-16
+ ld1 {v6.16b},[x4]
+ ld1 {v0.16b},[x0],x8
+
+ ld1 {v16.4s,v17.4s},[x3] // load key schedule...
+ sub w5,w5,#6
+ add x7,x3,x5,lsl#4 // pointer to last 7 round keys
+ sub w5,w5,#2
+ ld1 {v18.4s,v19.4s},[x7],#32
+ ld1 {v20.4s,v21.4s},[x7],#32
+ ld1 {v22.4s,v23.4s},[x7],#32
+ ld1 {v7.4s},[x7]
+
+ add x7,x3,#32
+ mov w6,w5
+ b.eq Lcbc_dec
+
+ cmp w5,#2
+ eor v0.16b,v0.16b,v6.16b
+ eor v5.16b,v16.16b,v7.16b
+ b.eq Lcbc_enc128
+
+ ld1 {v2.4s,v3.4s},[x7]
+ add x7,x3,#16
+ add x6,x3,#16*4
+ add x12,x3,#16*5
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ add x14,x3,#16*6
+ add x3,x3,#16*7
+ b Lenter_cbc_enc
+
+.align 4
+Loop_cbc_enc:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ st1 {v6.16b},[x1],#16
+Lenter_cbc_enc:
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v2.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x6]
+ cmp w5,#4
+ aese v0.16b,v3.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x12]
+ b.eq Lcbc_enc192
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x14]
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x3]
+ nop
+
+Lcbc_enc192:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ subs x2,x2,#16
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ csel x8,xzr,x8,eq
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x0],x8
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ eor v16.16b,v16.16b,v5.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v23.16b
+ eor v6.16b,v0.16b,v7.16b
+ b.hs Loop_cbc_enc
+
+ st1 {v6.16b},[x1],#16
+ b Lcbc_done
+
+.align 5
+Lcbc_enc128:
+ ld1 {v2.4s,v3.4s},[x7]
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ b Lenter_cbc_enc128
+Loop_cbc_enc128:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ st1 {v6.16b},[x1],#16
+Lenter_cbc_enc128:
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ subs x2,x2,#16
+ aese v0.16b,v2.16b
+ aesmc v0.16b,v0.16b
+ csel x8,xzr,x8,eq
+ aese v0.16b,v3.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x0],x8
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ eor v16.16b,v16.16b,v5.16b
+ aese v0.16b,v23.16b
+ eor v6.16b,v0.16b,v7.16b
+ b.hs Loop_cbc_enc128
+
+ st1 {v6.16b},[x1],#16
+ b Lcbc_done
+.align 5
+Lcbc_dec:
+ ld1 {v18.16b},[x0],#16
+ subs x2,x2,#32 // bias
+ add w6,w5,#2
+ orr v3.16b,v0.16b,v0.16b
+ orr v1.16b,v0.16b,v0.16b
+ orr v19.16b,v18.16b,v18.16b
+ b.lo Lcbc_dec_tail
+
+ orr v1.16b,v18.16b,v18.16b
+ ld1 {v18.16b},[x0],#16
+ orr v2.16b,v0.16b,v0.16b
+ orr v3.16b,v1.16b,v1.16b
+ orr v19.16b,v18.16b,v18.16b
+
+Loop3x_cbc_dec:
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Loop3x_cbc_dec
+
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
+ aesimc v18.16b,v18.16b
+ eor v4.16b,v6.16b,v7.16b
+ subs x2,x2,#0x30
+ eor v5.16b,v2.16b,v7.16b
+ csel x6,x2,x6,lo // x6, w6, is zero at this point
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
+ aesimc v18.16b,v18.16b
+ eor v17.16b,v3.16b,v7.16b
+ add x0,x0,x6 // x0 is adjusted in such way that
+ // at exit from the loop v1.16b-v18.16b
+ // are loaded with last "words"
+ orr v6.16b,v19.16b,v19.16b
+ mov x7,x3
+ aesd v0.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v20.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v2.16b},[x0],#16
+ aesd v0.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v21.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v3.16b},[x0],#16
+ aesd v0.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v22.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v19.16b},[x0],#16
+ aesd v0.16b,v23.16b
+ aesd v1.16b,v23.16b
+ aesd v18.16b,v23.16b
+ ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
+ add w6,w5,#2
+ eor v4.16b,v4.16b,v0.16b
+ eor v5.16b,v5.16b,v1.16b
+ eor v18.16b,v18.16b,v17.16b
+ ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
+ st1 {v4.16b},[x1],#16
+ orr v0.16b,v2.16b,v2.16b
+ st1 {v5.16b},[x1],#16
+ orr v1.16b,v3.16b,v3.16b
+ st1 {v18.16b},[x1],#16
+ orr v18.16b,v19.16b,v19.16b
+ b.hs Loop3x_cbc_dec
+
+ cmn x2,#0x30
+ b.eq Lcbc_done
+ nop
+
+Lcbc_dec_tail:
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Lcbc_dec_tail
+
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
+ aesimc v18.16b,v18.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
+ aesimc v18.16b,v18.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v20.16b
+ aesimc v18.16b,v18.16b
+ cmn x2,#0x20
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v21.16b
+ aesimc v18.16b,v18.16b
+ eor v5.16b,v6.16b,v7.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v22.16b
+ aesimc v18.16b,v18.16b
+ eor v17.16b,v3.16b,v7.16b
+ aesd v1.16b,v23.16b
+ aesd v18.16b,v23.16b
+ b.eq Lcbc_dec_one
+ eor v5.16b,v5.16b,v1.16b
+ eor v17.16b,v17.16b,v18.16b
+ orr v6.16b,v19.16b,v19.16b
+ st1 {v5.16b},[x1],#16
+ st1 {v17.16b},[x1],#16
+ b Lcbc_done
+
+Lcbc_dec_one:
+ eor v5.16b,v5.16b,v18.16b
+ orr v6.16b,v19.16b,v19.16b
+ st1 {v5.16b},[x1],#16
+
+Lcbc_done:
+ st1 {v6.16b},[x4]
+Lcbc_abort:
+ ldr x29,[sp],#16
+ ret
+
+.globl _aes_hw_ctr32_encrypt_blocks
+.private_extern _aes_hw_ctr32_encrypt_blocks
+
+.align 5
+_aes_hw_ctr32_encrypt_blocks:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ ldr w5,[x3,#240]
+
+ ldr w8, [x4, #12]
+ ld1 {v0.4s},[x4]
+
+ ld1 {v16.4s,v17.4s},[x3] // load key schedule...
+ sub w5,w5,#4
+ mov x12,#16
+ cmp x2,#2
+ add x7,x3,x5,lsl#4 // pointer to last 5 round keys
+ sub w5,w5,#2
+ ld1 {v20.4s,v21.4s},[x7],#32
+ ld1 {v22.4s,v23.4s},[x7],#32
+ ld1 {v7.4s},[x7]
+ add x7,x3,#32
+ mov w6,w5
+ csel x12,xzr,x12,lo
+
+ // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
+ // affected by silicon errata #1742098 [0] and #1655431 [1],
+ // respectively, where the second instruction of an aese/aesmc
+ // instruction pair may execute twice if an interrupt is taken right
+ // after the first instruction consumes an input register of which a
+ // single 32-bit lane has been updated the last time it was modified.
+ //
+ // This function uses a counter in one 32-bit lane. The vmov lines
+	// could write to v1.16b and v18.16b directly, but that trips these bugs.
+ // We write to v6.16b and copy to the final register as a workaround.
+ //
+ // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
+ // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
+#ifndef __AARCH64EB__
+ rev w8, w8
+#endif
+ add w10, w8, #1
+ orr v6.16b,v0.16b,v0.16b
+ rev w10, w10
+ mov v6.s[3],w10
+ add w8, w8, #2
+ orr v1.16b,v6.16b,v6.16b
+ b.ls Lctr32_tail
+ rev w12, w8
+ mov v6.s[3],w12
+ sub x2,x2,#3 // bias
+ orr v18.16b,v6.16b,v6.16b
+ b Loop3x_ctr32
+
+.align 4
+Loop3x_ctr32:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v17.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Loop3x_ctr32
+
+ aese v0.16b,v16.16b
+ aesmc v4.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v5.16b,v1.16b
+ ld1 {v2.16b},[x0],#16
+ add w9,w8,#1
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v3.16b},[x0],#16
+ rev w9,w9
+ aese v4.16b,v17.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v17.16b
+ aesmc v5.16b,v5.16b
+ ld1 {v19.16b},[x0],#16
+ mov x7,x3
+ aese v18.16b,v17.16b
+ aesmc v17.16b,v18.16b
+ aese v4.16b,v20.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v20.16b
+ aesmc v5.16b,v5.16b
+ eor v2.16b,v2.16b,v7.16b
+ add w10,w8,#2
+ aese v17.16b,v20.16b
+ aesmc v17.16b,v17.16b
+ eor v3.16b,v3.16b,v7.16b
+ add w8,w8,#3
+ aese v4.16b,v21.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v21.16b
+ aesmc v5.16b,v5.16b
+	// Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
+ // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
+ // 32-bit mode. See the comment above.
+ eor v19.16b,v19.16b,v7.16b
+ mov v6.s[3], w9
+ aese v17.16b,v21.16b
+ aesmc v17.16b,v17.16b
+ orr v0.16b,v6.16b,v6.16b
+ rev w10,w10
+ aese v4.16b,v22.16b
+ aesmc v4.16b,v4.16b
+ mov v6.s[3], w10
+ rev w12,w8
+ aese v5.16b,v22.16b
+ aesmc v5.16b,v5.16b
+ orr v1.16b,v6.16b,v6.16b
+ mov v6.s[3], w12
+ aese v17.16b,v22.16b
+ aesmc v17.16b,v17.16b
+ orr v18.16b,v6.16b,v6.16b
+ subs x2,x2,#3
+ aese v4.16b,v23.16b
+ aese v5.16b,v23.16b
+ aese v17.16b,v23.16b
+
+ eor v2.16b,v2.16b,v4.16b
+ ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
+ st1 {v2.16b},[x1],#16
+ eor v3.16b,v3.16b,v5.16b
+ mov w6,w5
+ st1 {v3.16b},[x1],#16
+ eor v19.16b,v19.16b,v17.16b
+ ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
+ st1 {v19.16b},[x1],#16
+ b.hs Loop3x_ctr32
+
+ adds x2,x2,#3
+ b.eq Lctr32_done
+ cmp x2,#1
+ mov x12,#16
+ csel x12,xzr,x12,eq
+
+Lctr32_tail:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Lctr32_tail
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v2.16b},[x0],x12
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v3.16b},[x0]
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ eor v2.16b,v2.16b,v7.16b
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ eor v3.16b,v3.16b,v7.16b
+ aese v0.16b,v23.16b
+ aese v1.16b,v23.16b
+
+ cmp x2,#1
+ eor v2.16b,v2.16b,v0.16b
+ eor v3.16b,v3.16b,v1.16b
+ st1 {v2.16b},[x1],#16
+ b.eq Lctr32_done
+ st1 {v3.16b},[x1]
+
+Lctr32_done:
+ ldr x29,[sp],#16
+ ret
+
+#endif
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/bcm/aesv8-armv8-linux.S b/gen/bcm/aesv8-armv8-linux.S
new file mode 100644
index 0000000..7d4bcb4
--- /dev/null
+++ b/gen/bcm/aesv8-armv8-linux.S
@@ -0,0 +1,791 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.arch armv8-a+crypto
+.section .rodata
+.align 5
+.Lrcon:
+.long 0x01,0x01,0x01,0x01
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
+.long 0x1b,0x1b,0x1b,0x1b
+
+.text
+
+.globl aes_hw_set_encrypt_key
+.hidden aes_hw_set_encrypt_key
+.type aes_hw_set_encrypt_key,%function
+.align 5
+aes_hw_set_encrypt_key:
+.Lenc_key:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ mov x3,#-1
+ cmp x0,#0
+ b.eq .Lenc_key_abort
+ cmp x2,#0
+ b.eq .Lenc_key_abort
+ mov x3,#-2
+ cmp w1,#128
+ b.lt .Lenc_key_abort
+ cmp w1,#256
+ b.gt .Lenc_key_abort
+ tst w1,#0x3f
+ b.ne .Lenc_key_abort
+
+ adrp x3,.Lrcon
+ add x3,x3,:lo12:.Lrcon
+ cmp w1,#192
+
+ eor v0.16b,v0.16b,v0.16b
+ ld1 {v3.16b},[x0],#16
+ mov w1,#8 // reuse w1
+ ld1 {v1.4s,v2.4s},[x3],#32
+
+ b.lt .Loop128
+ b.eq .L192
+ b .L256
+
+.align 4
+.Loop128:
+ tbl v6.16b,{v3.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v3.4s},[x2],#16
+ aese v6.16b,v0.16b
+ subs w1,w1,#1
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ shl v1.16b,v1.16b,#1
+ eor v3.16b,v3.16b,v6.16b
+ b.ne .Loop128
+
+ ld1 {v1.4s},[x3]
+
+ tbl v6.16b,{v3.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v3.4s},[x2],#16
+ aese v6.16b,v0.16b
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ shl v1.16b,v1.16b,#1
+ eor v3.16b,v3.16b,v6.16b
+
+ tbl v6.16b,{v3.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v3.4s},[x2],#16
+ aese v6.16b,v0.16b
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ eor v3.16b,v3.16b,v6.16b
+ st1 {v3.4s},[x2]
+ add x2,x2,#0x50
+
+ mov w12,#10
+ b .Ldone
+
+.align 4
+.L192:
+ ld1 {v4.8b},[x0],#8
+ movi v6.16b,#8 // borrow v6.16b
+ st1 {v3.4s},[x2],#16
+ sub v2.16b,v2.16b,v6.16b // adjust the mask
+
+.Loop192:
+ tbl v6.16b,{v4.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v4.8b},[x2],#8
+ aese v6.16b,v0.16b
+ subs w1,w1,#1
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+
+ dup v5.4s,v3.s[3]
+ eor v5.16b,v5.16b,v4.16b
+ eor v6.16b,v6.16b,v1.16b
+ ext v4.16b,v0.16b,v4.16b,#12
+ shl v1.16b,v1.16b,#1
+ eor v4.16b,v4.16b,v5.16b
+ eor v3.16b,v3.16b,v6.16b
+ eor v4.16b,v4.16b,v6.16b
+ st1 {v3.4s},[x2],#16
+ b.ne .Loop192
+
+ mov w12,#12
+ add x2,x2,#0x20
+ b .Ldone
+
+.align 4
+.L256:
+ ld1 {v4.16b},[x0]
+ mov w1,#7
+ mov w12,#14
+ st1 {v3.4s},[x2],#16
+
+.Loop256:
+ tbl v6.16b,{v4.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v4.4s},[x2],#16
+ aese v6.16b,v0.16b
+ subs w1,w1,#1
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ shl v1.16b,v1.16b,#1
+ eor v3.16b,v3.16b,v6.16b
+ st1 {v3.4s},[x2],#16
+ b.eq .Ldone
+
+ dup v6.4s,v3.s[3] // just splat
+ ext v5.16b,v0.16b,v4.16b,#12
+ aese v6.16b,v0.16b
+
+ eor v4.16b,v4.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v4.16b,v4.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v4.16b,v4.16b,v5.16b
+
+ eor v4.16b,v4.16b,v6.16b
+ b .Loop256
+
+.Ldone:
+ str w12,[x2]
+ mov x3,#0
+
+.Lenc_key_abort:
+ mov x0,x3 // return value
+ ldr x29,[sp],#16
+ ret
+.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
+
+.globl aes_hw_set_decrypt_key
+.hidden aes_hw_set_decrypt_key
+.type aes_hw_set_decrypt_key,%function
+.align 5
+aes_hw_set_decrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ bl .Lenc_key
+
+ cmp x0,#0
+ b.ne .Ldec_key_abort
+
+ sub x2,x2,#240 // restore original x2
+ mov x4,#-16
+ add x0,x2,x12,lsl#4 // end of key schedule
+
+ ld1 {v0.4s},[x2]
+ ld1 {v1.4s},[x0]
+ st1 {v0.4s},[x0],x4
+ st1 {v1.4s},[x2],#16
+
+.Loop_imc:
+ ld1 {v0.4s},[x2]
+ ld1 {v1.4s},[x0]
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ st1 {v0.4s},[x0],x4
+ st1 {v1.4s},[x2],#16
+ cmp x0,x2
+ b.hi .Loop_imc
+
+ ld1 {v0.4s},[x2]
+ aesimc v0.16b,v0.16b
+ st1 {v0.4s},[x0]
+
+ eor x0,x0,x0 // return value
+.Ldec_key_abort:
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
+.globl aes_hw_encrypt
+.hidden aes_hw_encrypt
+.type aes_hw_encrypt,%function
+.align 5
+aes_hw_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ ldr w3,[x2,#240]
+ ld1 {v0.4s},[x2],#16
+ ld1 {v2.16b},[x0]
+ sub w3,w3,#2
+ ld1 {v1.4s},[x2],#16
+
+.Loop_enc:
+ aese v2.16b,v0.16b
+ aesmc v2.16b,v2.16b
+ ld1 {v0.4s},[x2],#16
+ subs w3,w3,#2
+ aese v2.16b,v1.16b
+ aesmc v2.16b,v2.16b
+ ld1 {v1.4s},[x2],#16
+ b.gt .Loop_enc
+
+ aese v2.16b,v0.16b
+ aesmc v2.16b,v2.16b
+ ld1 {v0.4s},[x2]
+ aese v2.16b,v1.16b
+ eor v2.16b,v2.16b,v0.16b
+
+ st1 {v2.16b},[x1]
+ ret
+.size aes_hw_encrypt,.-aes_hw_encrypt
+.globl aes_hw_decrypt
+.hidden aes_hw_decrypt
+.type aes_hw_decrypt,%function
+.align 5
+aes_hw_decrypt:
+ AARCH64_VALID_CALL_TARGET
+ ldr w3,[x2,#240]
+ ld1 {v0.4s},[x2],#16
+ ld1 {v2.16b},[x0]
+ sub w3,w3,#2
+ ld1 {v1.4s},[x2],#16
+
+.Loop_dec:
+ aesd v2.16b,v0.16b
+ aesimc v2.16b,v2.16b
+ ld1 {v0.4s},[x2],#16
+ subs w3,w3,#2
+ aesd v2.16b,v1.16b
+ aesimc v2.16b,v2.16b
+ ld1 {v1.4s},[x2],#16
+ b.gt .Loop_dec
+
+ aesd v2.16b,v0.16b
+ aesimc v2.16b,v2.16b
+ ld1 {v0.4s},[x2]
+ aesd v2.16b,v1.16b
+ eor v2.16b,v2.16b,v0.16b
+
+ st1 {v2.16b},[x1]
+ ret
+.size aes_hw_decrypt,.-aes_hw_decrypt
+.globl aes_hw_cbc_encrypt
+.hidden aes_hw_cbc_encrypt
+.type aes_hw_cbc_encrypt,%function
+.align 5
+aes_hw_cbc_encrypt:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ subs x2,x2,#16
+ mov x8,#16
+ b.lo .Lcbc_abort
+ csel x8,xzr,x8,eq
+
+ cmp w5,#0 // en- or decrypting?
+ ldr w5,[x3,#240]
+ and x2,x2,#-16
+ ld1 {v6.16b},[x4]
+ ld1 {v0.16b},[x0],x8
+
+ ld1 {v16.4s,v17.4s},[x3] // load key schedule...
+ sub w5,w5,#6
+ add x7,x3,x5,lsl#4 // pointer to last 7 round keys
+ sub w5,w5,#2
+ ld1 {v18.4s,v19.4s},[x7],#32
+ ld1 {v20.4s,v21.4s},[x7],#32
+ ld1 {v22.4s,v23.4s},[x7],#32
+ ld1 {v7.4s},[x7]
+
+ add x7,x3,#32
+ mov w6,w5
+ b.eq .Lcbc_dec
+
+ cmp w5,#2
+ eor v0.16b,v0.16b,v6.16b
+ eor v5.16b,v16.16b,v7.16b
+ b.eq .Lcbc_enc128
+
+ ld1 {v2.4s,v3.4s},[x7]
+ add x7,x3,#16
+ add x6,x3,#16*4
+ add x12,x3,#16*5
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ add x14,x3,#16*6
+ add x3,x3,#16*7
+ b .Lenter_cbc_enc
+
+.align 4
+.Loop_cbc_enc:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ st1 {v6.16b},[x1],#16
+.Lenter_cbc_enc:
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v2.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x6]
+ cmp w5,#4
+ aese v0.16b,v3.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x12]
+ b.eq .Lcbc_enc192
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x14]
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x3]
+ nop
+
+.Lcbc_enc192:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ subs x2,x2,#16
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ csel x8,xzr,x8,eq
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x0],x8
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ eor v16.16b,v16.16b,v5.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v23.16b
+ eor v6.16b,v0.16b,v7.16b
+ b.hs .Loop_cbc_enc
+
+ st1 {v6.16b},[x1],#16
+ b .Lcbc_done
+
+.align 5
+.Lcbc_enc128:
+ ld1 {v2.4s,v3.4s},[x7]
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ b .Lenter_cbc_enc128
+.Loop_cbc_enc128:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ st1 {v6.16b},[x1],#16
+.Lenter_cbc_enc128:
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ subs x2,x2,#16
+ aese v0.16b,v2.16b
+ aesmc v0.16b,v0.16b
+ csel x8,xzr,x8,eq
+ aese v0.16b,v3.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x0],x8
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ eor v16.16b,v16.16b,v5.16b
+ aese v0.16b,v23.16b
+ eor v6.16b,v0.16b,v7.16b
+ b.hs .Loop_cbc_enc128
+
+ st1 {v6.16b},[x1],#16
+ b .Lcbc_done
+.align 5
+.Lcbc_dec:
+ ld1 {v18.16b},[x0],#16
+ subs x2,x2,#32 // bias
+ add w6,w5,#2
+ orr v3.16b,v0.16b,v0.16b
+ orr v1.16b,v0.16b,v0.16b
+ orr v19.16b,v18.16b,v18.16b
+ b.lo .Lcbc_dec_tail
+
+ orr v1.16b,v18.16b,v18.16b
+ ld1 {v18.16b},[x0],#16
+ orr v2.16b,v0.16b,v0.16b
+ orr v3.16b,v1.16b,v1.16b
+ orr v19.16b,v18.16b,v18.16b
+
+.Loop3x_cbc_dec:
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Loop3x_cbc_dec
+
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
+ aesimc v18.16b,v18.16b
+ eor v4.16b,v6.16b,v7.16b
+ subs x2,x2,#0x30
+ eor v5.16b,v2.16b,v7.16b
+ csel x6,x2,x6,lo // x6, w6, is zero at this point
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
+ aesimc v18.16b,v18.16b
+ eor v17.16b,v3.16b,v7.16b
+ add x0,x0,x6 // x0 is adjusted in such way that
+ // at exit from the loop v1.16b-v18.16b
+ // are loaded with last "words"
+ orr v6.16b,v19.16b,v19.16b
+ mov x7,x3
+ aesd v0.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v20.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v2.16b},[x0],#16
+ aesd v0.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v21.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v3.16b},[x0],#16
+ aesd v0.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v22.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v19.16b},[x0],#16
+ aesd v0.16b,v23.16b
+ aesd v1.16b,v23.16b
+ aesd v18.16b,v23.16b
+ ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
+ add w6,w5,#2
+ eor v4.16b,v4.16b,v0.16b
+ eor v5.16b,v5.16b,v1.16b
+ eor v18.16b,v18.16b,v17.16b
+ ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
+ st1 {v4.16b},[x1],#16
+ orr v0.16b,v2.16b,v2.16b
+ st1 {v5.16b},[x1],#16
+ orr v1.16b,v3.16b,v3.16b
+ st1 {v18.16b},[x1],#16
+ orr v18.16b,v19.16b,v19.16b
+ b.hs .Loop3x_cbc_dec
+
+ cmn x2,#0x30
+ b.eq .Lcbc_done
+ nop
+
+.Lcbc_dec_tail:
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Lcbc_dec_tail
+
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
+ aesimc v18.16b,v18.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
+ aesimc v18.16b,v18.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v20.16b
+ aesimc v18.16b,v18.16b
+ cmn x2,#0x20
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v21.16b
+ aesimc v18.16b,v18.16b
+ eor v5.16b,v6.16b,v7.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v22.16b
+ aesimc v18.16b,v18.16b
+ eor v17.16b,v3.16b,v7.16b
+ aesd v1.16b,v23.16b
+ aesd v18.16b,v23.16b
+ b.eq .Lcbc_dec_one
+ eor v5.16b,v5.16b,v1.16b
+ eor v17.16b,v17.16b,v18.16b
+ orr v6.16b,v19.16b,v19.16b
+ st1 {v5.16b},[x1],#16
+ st1 {v17.16b},[x1],#16
+ b .Lcbc_done
+
+.Lcbc_dec_one:
+ eor v5.16b,v5.16b,v18.16b
+ orr v6.16b,v19.16b,v19.16b
+ st1 {v5.16b},[x1],#16
+
+.Lcbc_done:
+ st1 {v6.16b},[x4]
+.Lcbc_abort:
+ ldr x29,[sp],#16
+ ret
+.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
+.globl aes_hw_ctr32_encrypt_blocks
+.hidden aes_hw_ctr32_encrypt_blocks
+.type aes_hw_ctr32_encrypt_blocks,%function
+.align 5
+aes_hw_ctr32_encrypt_blocks:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ ldr w5,[x3,#240]
+
+ ldr w8, [x4, #12]
+ ld1 {v0.4s},[x4]
+
+ ld1 {v16.4s,v17.4s},[x3] // load key schedule...
+ sub w5,w5,#4
+ mov x12,#16
+ cmp x2,#2
+ add x7,x3,x5,lsl#4 // pointer to last 5 round keys
+ sub w5,w5,#2
+ ld1 {v20.4s,v21.4s},[x7],#32
+ ld1 {v22.4s,v23.4s},[x7],#32
+ ld1 {v7.4s},[x7]
+ add x7,x3,#32
+ mov w6,w5
+ csel x12,xzr,x12,lo
+
+ // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
+ // affected by silicon errata #1742098 [0] and #1655431 [1],
+ // respectively, where the second instruction of an aese/aesmc
+ // instruction pair may execute twice if an interrupt is taken right
+ // after the first instruction consumes an input register of which a
+ // single 32-bit lane has been updated the last time it was modified.
+ //
+ // This function uses a counter in one 32-bit lane. The vmov lines
+	// could write to v1.16b and v18.16b directly, but that trips these bugs.
+ // We write to v6.16b and copy to the final register as a workaround.
+ //
+ // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
+ // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
+#ifndef __AARCH64EB__
+ rev w8, w8
+#endif
+ add w10, w8, #1
+ orr v6.16b,v0.16b,v0.16b
+ rev w10, w10
+ mov v6.s[3],w10
+ add w8, w8, #2
+ orr v1.16b,v6.16b,v6.16b
+ b.ls .Lctr32_tail
+ rev w12, w8
+ mov v6.s[3],w12
+ sub x2,x2,#3 // bias
+ orr v18.16b,v6.16b,v6.16b
+ b .Loop3x_ctr32
+
+.align 4
+.Loop3x_ctr32:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v17.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Loop3x_ctr32
+
+ aese v0.16b,v16.16b
+ aesmc v4.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v5.16b,v1.16b
+ ld1 {v2.16b},[x0],#16
+ add w9,w8,#1
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v3.16b},[x0],#16
+ rev w9,w9
+ aese v4.16b,v17.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v17.16b
+ aesmc v5.16b,v5.16b
+ ld1 {v19.16b},[x0],#16
+ mov x7,x3
+ aese v18.16b,v17.16b
+ aesmc v17.16b,v18.16b
+ aese v4.16b,v20.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v20.16b
+ aesmc v5.16b,v5.16b
+ eor v2.16b,v2.16b,v7.16b
+ add w10,w8,#2
+ aese v17.16b,v20.16b
+ aesmc v17.16b,v17.16b
+ eor v3.16b,v3.16b,v7.16b
+ add w8,w8,#3
+ aese v4.16b,v21.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v21.16b
+ aesmc v5.16b,v5.16b
+	// Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
+ // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
+ // 32-bit mode. See the comment above.
+ eor v19.16b,v19.16b,v7.16b
+ mov v6.s[3], w9
+ aese v17.16b,v21.16b
+ aesmc v17.16b,v17.16b
+ orr v0.16b,v6.16b,v6.16b
+ rev w10,w10
+ aese v4.16b,v22.16b
+ aesmc v4.16b,v4.16b
+ mov v6.s[3], w10
+ rev w12,w8
+ aese v5.16b,v22.16b
+ aesmc v5.16b,v5.16b
+ orr v1.16b,v6.16b,v6.16b
+ mov v6.s[3], w12
+ aese v17.16b,v22.16b
+ aesmc v17.16b,v17.16b
+ orr v18.16b,v6.16b,v6.16b
+ subs x2,x2,#3
+ aese v4.16b,v23.16b
+ aese v5.16b,v23.16b
+ aese v17.16b,v23.16b
+
+ eor v2.16b,v2.16b,v4.16b
+ ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
+ st1 {v2.16b},[x1],#16
+ eor v3.16b,v3.16b,v5.16b
+ mov w6,w5
+ st1 {v3.16b},[x1],#16
+ eor v19.16b,v19.16b,v17.16b
+ ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
+ st1 {v19.16b},[x1],#16
+ b.hs .Loop3x_ctr32
+
+ adds x2,x2,#3
+ b.eq .Lctr32_done
+ cmp x2,#1
+ mov x12,#16
+ csel x12,xzr,x12,eq
+
+.Lctr32_tail:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Lctr32_tail
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v2.16b},[x0],x12
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v3.16b},[x0]
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ eor v2.16b,v2.16b,v7.16b
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ eor v3.16b,v3.16b,v7.16b
+ aese v0.16b,v23.16b
+ aese v1.16b,v23.16b
+
+ cmp x2,#1
+ eor v2.16b,v2.16b,v0.16b
+ eor v3.16b,v3.16b,v1.16b
+ st1 {v2.16b},[x1],#16
+ b.eq .Lctr32_done
+ st1 {v3.16b},[x1]
+
+.Lctr32_done:
+ ldr x29,[sp],#16
+ ret
+.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
+#endif
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/aesv8-armv8-win.S b/gen/bcm/aesv8-armv8-win.S
new file mode 100644
index 0000000..a3ab33a
--- /dev/null
+++ b/gen/bcm/aesv8-armv8-win.S
@@ -0,0 +1,803 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.arch armv8-a+crypto
+.section .rodata
+.align 5
+Lrcon:
+.long 0x01,0x01,0x01,0x01
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
+.long 0x1b,0x1b,0x1b,0x1b
+
+.text
+
+.globl aes_hw_set_encrypt_key
+
+.def aes_hw_set_encrypt_key
+ .type 32
+.endef
+.align 5
+aes_hw_set_encrypt_key:
+Lenc_key:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ mov x3,#-1
+ cmp x0,#0
+ b.eq Lenc_key_abort
+ cmp x2,#0
+ b.eq Lenc_key_abort
+ mov x3,#-2
+ cmp w1,#128
+ b.lt Lenc_key_abort
+ cmp w1,#256
+ b.gt Lenc_key_abort
+ tst w1,#0x3f
+ b.ne Lenc_key_abort
+
+ adrp x3,Lrcon
+ add x3,x3,:lo12:Lrcon
+ cmp w1,#192
+
+ eor v0.16b,v0.16b,v0.16b
+ ld1 {v3.16b},[x0],#16
+ mov w1,#8 // reuse w1
+ ld1 {v1.4s,v2.4s},[x3],#32
+
+ b.lt Loop128
+ b.eq L192
+ b L256
+
+.align 4
+Loop128:
+ tbl v6.16b,{v3.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v3.4s},[x2],#16
+ aese v6.16b,v0.16b
+ subs w1,w1,#1
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ shl v1.16b,v1.16b,#1
+ eor v3.16b,v3.16b,v6.16b
+ b.ne Loop128
+
+ ld1 {v1.4s},[x3]
+
+ tbl v6.16b,{v3.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v3.4s},[x2],#16
+ aese v6.16b,v0.16b
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ shl v1.16b,v1.16b,#1
+ eor v3.16b,v3.16b,v6.16b
+
+ tbl v6.16b,{v3.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v3.4s},[x2],#16
+ aese v6.16b,v0.16b
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ eor v3.16b,v3.16b,v6.16b
+ st1 {v3.4s},[x2]
+ add x2,x2,#0x50
+
+ mov w12,#10
+ b Ldone
+
+.align 4
+L192:
+ ld1 {v4.8b},[x0],#8
+ movi v6.16b,#8 // borrow v6.16b
+ st1 {v3.4s},[x2],#16
+ sub v2.16b,v2.16b,v6.16b // adjust the mask
+
+Loop192:
+ tbl v6.16b,{v4.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v4.8b},[x2],#8
+ aese v6.16b,v0.16b
+ subs w1,w1,#1
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+
+ dup v5.4s,v3.s[3]
+ eor v5.16b,v5.16b,v4.16b
+ eor v6.16b,v6.16b,v1.16b
+ ext v4.16b,v0.16b,v4.16b,#12
+ shl v1.16b,v1.16b,#1
+ eor v4.16b,v4.16b,v5.16b
+ eor v3.16b,v3.16b,v6.16b
+ eor v4.16b,v4.16b,v6.16b
+ st1 {v3.4s},[x2],#16
+ b.ne Loop192
+
+ mov w12,#12
+ add x2,x2,#0x20
+ b Ldone
+
+.align 4
+L256:
+ ld1 {v4.16b},[x0]
+ mov w1,#7
+ mov w12,#14
+ st1 {v3.4s},[x2],#16
+
+Loop256:
+ tbl v6.16b,{v4.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v4.4s},[x2],#16
+ aese v6.16b,v0.16b
+ subs w1,w1,#1
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ shl v1.16b,v1.16b,#1
+ eor v3.16b,v3.16b,v6.16b
+ st1 {v3.4s},[x2],#16
+ b.eq Ldone
+
+ dup v6.4s,v3.s[3] // just splat
+ ext v5.16b,v0.16b,v4.16b,#12
+ aese v6.16b,v0.16b
+
+ eor v4.16b,v4.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v4.16b,v4.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v4.16b,v4.16b,v5.16b
+
+ eor v4.16b,v4.16b,v6.16b
+ b Loop256
+
+Ldone:
+ str w12,[x2]
+ mov x3,#0
+
+Lenc_key_abort:
+ mov x0,x3 // return value
+ ldr x29,[sp],#16
+ ret
+
+
+.globl aes_hw_set_decrypt_key
+
+.def aes_hw_set_decrypt_key
+ .type 32
+.endef
+.align 5
+aes_hw_set_decrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ bl Lenc_key
+
+ cmp x0,#0
+ b.ne Ldec_key_abort
+
+ sub x2,x2,#240 // restore original x2
+ mov x4,#-16
+ add x0,x2,x12,lsl#4 // end of key schedule
+
+ ld1 {v0.4s},[x2]
+ ld1 {v1.4s},[x0]
+ st1 {v0.4s},[x0],x4
+ st1 {v1.4s},[x2],#16
+
+Loop_imc:
+ ld1 {v0.4s},[x2]
+ ld1 {v1.4s},[x0]
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ st1 {v0.4s},[x0],x4
+ st1 {v1.4s},[x2],#16
+ cmp x0,x2
+ b.hi Loop_imc
+
+ ld1 {v0.4s},[x2]
+ aesimc v0.16b,v0.16b
+ st1 {v0.4s},[x0]
+
+ eor x0,x0,x0 // return value
+Ldec_key_abort:
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.globl aes_hw_encrypt
+
+.def aes_hw_encrypt
+ .type 32
+.endef
+.align 5
+aes_hw_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ ldr w3,[x2,#240]
+ ld1 {v0.4s},[x2],#16
+ ld1 {v2.16b},[x0]
+ sub w3,w3,#2
+ ld1 {v1.4s},[x2],#16
+
+Loop_enc:
+ aese v2.16b,v0.16b
+ aesmc v2.16b,v2.16b
+ ld1 {v0.4s},[x2],#16
+ subs w3,w3,#2
+ aese v2.16b,v1.16b
+ aesmc v2.16b,v2.16b
+ ld1 {v1.4s},[x2],#16
+ b.gt Loop_enc
+
+ aese v2.16b,v0.16b
+ aesmc v2.16b,v2.16b
+ ld1 {v0.4s},[x2]
+ aese v2.16b,v1.16b
+ eor v2.16b,v2.16b,v0.16b
+
+ st1 {v2.16b},[x1]
+ ret
+
+.globl aes_hw_decrypt
+
+.def aes_hw_decrypt
+ .type 32
+.endef
+.align 5
+aes_hw_decrypt:
+ AARCH64_VALID_CALL_TARGET
+ ldr w3,[x2,#240]
+ ld1 {v0.4s},[x2],#16
+ ld1 {v2.16b},[x0]
+ sub w3,w3,#2
+ ld1 {v1.4s},[x2],#16
+
+Loop_dec:
+ aesd v2.16b,v0.16b
+ aesimc v2.16b,v2.16b
+ ld1 {v0.4s},[x2],#16
+ subs w3,w3,#2
+ aesd v2.16b,v1.16b
+ aesimc v2.16b,v2.16b
+ ld1 {v1.4s},[x2],#16
+ b.gt Loop_dec
+
+ aesd v2.16b,v0.16b
+ aesimc v2.16b,v2.16b
+ ld1 {v0.4s},[x2]
+ aesd v2.16b,v1.16b
+ eor v2.16b,v2.16b,v0.16b
+
+ st1 {v2.16b},[x1]
+ ret
+
+.globl aes_hw_cbc_encrypt
+
+.def aes_hw_cbc_encrypt
+ .type 32
+.endef
+.align 5
+aes_hw_cbc_encrypt:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ subs x2,x2,#16
+ mov x8,#16
+ b.lo Lcbc_abort
+ csel x8,xzr,x8,eq
+
+ cmp w5,#0 // en- or decrypting?
+ ldr w5,[x3,#240]
+ and x2,x2,#-16
+ ld1 {v6.16b},[x4]
+ ld1 {v0.16b},[x0],x8
+
+ ld1 {v16.4s,v17.4s},[x3] // load key schedule...
+ sub w5,w5,#6
+ add x7,x3,x5,lsl#4 // pointer to last 7 round keys
+ sub w5,w5,#2
+ ld1 {v18.4s,v19.4s},[x7],#32
+ ld1 {v20.4s,v21.4s},[x7],#32
+ ld1 {v22.4s,v23.4s},[x7],#32
+ ld1 {v7.4s},[x7]
+
+ add x7,x3,#32
+ mov w6,w5
+ b.eq Lcbc_dec
+
+ cmp w5,#2
+ eor v0.16b,v0.16b,v6.16b
+ eor v5.16b,v16.16b,v7.16b
+ b.eq Lcbc_enc128
+
+ ld1 {v2.4s,v3.4s},[x7]
+ add x7,x3,#16
+ add x6,x3,#16*4
+ add x12,x3,#16*5
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ add x14,x3,#16*6
+ add x3,x3,#16*7
+ b Lenter_cbc_enc
+
+.align 4
+Loop_cbc_enc:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ st1 {v6.16b},[x1],#16
+Lenter_cbc_enc:
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v2.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x6]
+ cmp w5,#4
+ aese v0.16b,v3.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x12]
+ b.eq Lcbc_enc192
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x14]
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x3]
+ nop
+
+Lcbc_enc192:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ subs x2,x2,#16
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ csel x8,xzr,x8,eq
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x0],x8
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ eor v16.16b,v16.16b,v5.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v23.16b
+ eor v6.16b,v0.16b,v7.16b
+ b.hs Loop_cbc_enc
+
+ st1 {v6.16b},[x1],#16
+ b Lcbc_done
+
+.align 5
+Lcbc_enc128:
+ ld1 {v2.4s,v3.4s},[x7]
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ b Lenter_cbc_enc128
+Loop_cbc_enc128:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ st1 {v6.16b},[x1],#16
+Lenter_cbc_enc128:
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ subs x2,x2,#16
+ aese v0.16b,v2.16b
+ aesmc v0.16b,v0.16b
+ csel x8,xzr,x8,eq
+ aese v0.16b,v3.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x0],x8
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ eor v16.16b,v16.16b,v5.16b
+ aese v0.16b,v23.16b
+ eor v6.16b,v0.16b,v7.16b
+ b.hs Loop_cbc_enc128
+
+ st1 {v6.16b},[x1],#16
+ b Lcbc_done
+.align 5
+Lcbc_dec:
+ ld1 {v18.16b},[x0],#16
+ subs x2,x2,#32 // bias
+ add w6,w5,#2
+ orr v3.16b,v0.16b,v0.16b
+ orr v1.16b,v0.16b,v0.16b
+ orr v19.16b,v18.16b,v18.16b
+ b.lo Lcbc_dec_tail
+
+ orr v1.16b,v18.16b,v18.16b
+ ld1 {v18.16b},[x0],#16
+ orr v2.16b,v0.16b,v0.16b
+ orr v3.16b,v1.16b,v1.16b
+ orr v19.16b,v18.16b,v18.16b
+
+Loop3x_cbc_dec:
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Loop3x_cbc_dec
+
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
+ aesimc v18.16b,v18.16b
+ eor v4.16b,v6.16b,v7.16b
+ subs x2,x2,#0x30
+ eor v5.16b,v2.16b,v7.16b
+ csel x6,x2,x6,lo // x6, w6, is zero at this point
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
+ aesimc v18.16b,v18.16b
+ eor v17.16b,v3.16b,v7.16b
+ add x0,x0,x6 // x0 is adjusted in such a way that
+ // at exit from the loop v1.16b-v18.16b
+ // are loaded with last "words"
+ orr v6.16b,v19.16b,v19.16b
+ mov x7,x3
+ aesd v0.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v20.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v2.16b},[x0],#16
+ aesd v0.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v21.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v3.16b},[x0],#16
+ aesd v0.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v22.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v19.16b},[x0],#16
+ aesd v0.16b,v23.16b
+ aesd v1.16b,v23.16b
+ aesd v18.16b,v23.16b
+ ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
+ add w6,w5,#2
+ eor v4.16b,v4.16b,v0.16b
+ eor v5.16b,v5.16b,v1.16b
+ eor v18.16b,v18.16b,v17.16b
+ ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
+ st1 {v4.16b},[x1],#16
+ orr v0.16b,v2.16b,v2.16b
+ st1 {v5.16b},[x1],#16
+ orr v1.16b,v3.16b,v3.16b
+ st1 {v18.16b},[x1],#16
+ orr v18.16b,v19.16b,v19.16b
+ b.hs Loop3x_cbc_dec
+
+ cmn x2,#0x30
+ b.eq Lcbc_done
+ nop
+
+Lcbc_dec_tail:
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Lcbc_dec_tail
+
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
+ aesimc v18.16b,v18.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
+ aesimc v18.16b,v18.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v20.16b
+ aesimc v18.16b,v18.16b
+ cmn x2,#0x20
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v21.16b
+ aesimc v18.16b,v18.16b
+ eor v5.16b,v6.16b,v7.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v22.16b
+ aesimc v18.16b,v18.16b
+ eor v17.16b,v3.16b,v7.16b
+ aesd v1.16b,v23.16b
+ aesd v18.16b,v23.16b
+ b.eq Lcbc_dec_one
+ eor v5.16b,v5.16b,v1.16b
+ eor v17.16b,v17.16b,v18.16b
+ orr v6.16b,v19.16b,v19.16b
+ st1 {v5.16b},[x1],#16
+ st1 {v17.16b},[x1],#16
+ b Lcbc_done
+
+Lcbc_dec_one:
+ eor v5.16b,v5.16b,v18.16b
+ orr v6.16b,v19.16b,v19.16b
+ st1 {v5.16b},[x1],#16
+
+Lcbc_done:
+ st1 {v6.16b},[x4]
+Lcbc_abort:
+ ldr x29,[sp],#16
+ ret
+
+.globl aes_hw_ctr32_encrypt_blocks
+
+.def aes_hw_ctr32_encrypt_blocks
+ .type 32
+.endef
+.align 5
+aes_hw_ctr32_encrypt_blocks:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ ldr w5,[x3,#240]
+
+ ldr w8, [x4, #12]
+ ld1 {v0.4s},[x4]
+
+ ld1 {v16.4s,v17.4s},[x3] // load key schedule...
+ sub w5,w5,#4
+ mov x12,#16
+ cmp x2,#2
+ add x7,x3,x5,lsl#4 // pointer to last 5 round keys
+ sub w5,w5,#2
+ ld1 {v20.4s,v21.4s},[x7],#32
+ ld1 {v22.4s,v23.4s},[x7],#32
+ ld1 {v7.4s},[x7]
+ add x7,x3,#32
+ mov w6,w5
+ csel x12,xzr,x12,lo
+
+ // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
+ // affected by silicon errata #1742098 [0] and #1655431 [1],
+ // respectively, where the second instruction of an aese/aesmc
+ // instruction pair may execute twice if an interrupt is taken right
+ // after the first instruction consumes an input register of which a
+ // single 32-bit lane has been updated the last time it was modified.
+ //
+ // This function uses a counter in one 32-bit lane. The vmov lines
+ // could write to v1.16b and v18.16b directly, but that trips these bugs.
+ // We write to v6.16b and copy to the final register as a workaround.
+ //
+ // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
+ // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
+#ifndef __AARCH64EB__
+ rev w8, w8
+#endif
+ add w10, w8, #1
+ orr v6.16b,v0.16b,v0.16b
+ rev w10, w10
+ mov v6.s[3],w10
+ add w8, w8, #2
+ orr v1.16b,v6.16b,v6.16b
+ b.ls Lctr32_tail
+ rev w12, w8
+ mov v6.s[3],w12
+ sub x2,x2,#3 // bias
+ orr v18.16b,v6.16b,v6.16b
+ b Loop3x_ctr32
+
+.align 4
+Loop3x_ctr32:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v17.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Loop3x_ctr32
+
+ aese v0.16b,v16.16b
+ aesmc v4.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v5.16b,v1.16b
+ ld1 {v2.16b},[x0],#16
+ add w9,w8,#1
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v3.16b},[x0],#16
+ rev w9,w9
+ aese v4.16b,v17.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v17.16b
+ aesmc v5.16b,v5.16b
+ ld1 {v19.16b},[x0],#16
+ mov x7,x3
+ aese v18.16b,v17.16b
+ aesmc v17.16b,v18.16b
+ aese v4.16b,v20.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v20.16b
+ aesmc v5.16b,v5.16b
+ eor v2.16b,v2.16b,v7.16b
+ add w10,w8,#2
+ aese v17.16b,v20.16b
+ aesmc v17.16b,v17.16b
+ eor v3.16b,v3.16b,v7.16b
+ add w8,w8,#3
+ aese v4.16b,v21.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v21.16b
+ aesmc v5.16b,v5.16b
+ // Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
+ // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
+ // 32-bit mode. See the comment above.
+ eor v19.16b,v19.16b,v7.16b
+ mov v6.s[3], w9
+ aese v17.16b,v21.16b
+ aesmc v17.16b,v17.16b
+ orr v0.16b,v6.16b,v6.16b
+ rev w10,w10
+ aese v4.16b,v22.16b
+ aesmc v4.16b,v4.16b
+ mov v6.s[3], w10
+ rev w12,w8
+ aese v5.16b,v22.16b
+ aesmc v5.16b,v5.16b
+ orr v1.16b,v6.16b,v6.16b
+ mov v6.s[3], w12
+ aese v17.16b,v22.16b
+ aesmc v17.16b,v17.16b
+ orr v18.16b,v6.16b,v6.16b
+ subs x2,x2,#3
+ aese v4.16b,v23.16b
+ aese v5.16b,v23.16b
+ aese v17.16b,v23.16b
+
+ eor v2.16b,v2.16b,v4.16b
+ ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
+ st1 {v2.16b},[x1],#16
+ eor v3.16b,v3.16b,v5.16b
+ mov w6,w5
+ st1 {v3.16b},[x1],#16
+ eor v19.16b,v19.16b,v17.16b
+ ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
+ st1 {v19.16b},[x1],#16
+ b.hs Loop3x_ctr32
+
+ adds x2,x2,#3
+ b.eq Lctr32_done
+ cmp x2,#1
+ mov x12,#16
+ csel x12,xzr,x12,eq
+
+Lctr32_tail:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Lctr32_tail
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v2.16b},[x0],x12
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v3.16b},[x0]
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ eor v2.16b,v2.16b,v7.16b
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ eor v3.16b,v3.16b,v7.16b
+ aese v0.16b,v23.16b
+ aese v1.16b,v23.16b
+
+ cmp x2,#1
+ eor v2.16b,v2.16b,v0.16b
+ eor v3.16b,v3.16b,v1.16b
+ st1 {v2.16b},[x1],#16
+ b.eq Lctr32_done
+ st1 {v3.16b},[x1]
+
+Lctr32_done:
+ ldr x29,[sp],#16
+ ret
+
+#endif
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/aesv8-gcm-armv8-apple.S b/gen/bcm/aesv8-gcm-armv8-apple.S
new file mode 100644
index 0000000..13be797
--- /dev/null
+++ b/gen/bcm/aesv8-gcm-armv8-apple.S
@@ -0,0 +1,1555 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+#if __ARM_MAX_ARCH__ >= 8
+
+
+.text
+.globl _aes_gcm_enc_kernel
+.private_extern _aes_gcm_enc_kernel
+
+.align 4
+_aes_gcm_enc_kernel:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-128]!
+ mov x29, sp
+ stp x19, x20, [sp, #16]
+ mov x16, x4
+ mov x8, x5
+ stp x21, x22, [sp, #32]
+ stp x23, x24, [sp, #48]
+ stp d8, d9, [sp, #64]
+ stp d10, d11, [sp, #80]
+ stp d12, d13, [sp, #96]
+ stp d14, d15, [sp, #112]
+ ldr w17, [x8, #240]
+ add x19, x8, x17, lsl #4 // borrow input_l1 for last key
+ ldp x13, x14, [x19] // load round N keys
+ ldr q31, [x19, #-16] // load round N-1 keys
+ add x4, x0, x1, lsr #3 // end_input_ptr
+ lsr x5, x1, #3 // byte_len
+ mov x15, x5
+ ldp x10, x11, [x16] // ctr96_b64, ctr96_t32
+ ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible
+ sub x5, x5, #1 // byte_len - 1
+ ldr q18, [x8, #0] // load rk0
+ and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+ ldr q25, [x8, #112] // load rk7
+ add x5, x5, x0
+ lsr x12, x11, #32
+ fmov d2, x10 // CTR block 2
+ orr w11, w11, w11
+ rev w12, w12 // rev_ctr32
+ fmov d1, x10 // CTR block 1
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 0
+ add w12, w12, #1 // increment rev_ctr32
+ rev w9, w12 // CTR block 1
+ fmov d3, x10 // CTR block 3
+ orr x9, x11, x9, lsl #32 // CTR block 1
+ add w12, w12, #1 // CTR block 1
+ ldr q19, [x8, #16] // load rk1
+ fmov v1.d[1], x9 // CTR block 1
+ rev w9, w12 // CTR block 2
+ add w12, w12, #1 // CTR block 2
+ orr x9, x11, x9, lsl #32 // CTR block 2
+ ldr q20, [x8, #32] // load rk2
+ fmov v2.d[1], x9 // CTR block 2
+ rev w9, w12 // CTR block 3
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 1
+ orr x9, x11, x9, lsl #32 // CTR block 3
+ fmov v3.d[1], x9 // CTR block 3
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 0
+ ldr q21, [x8, #48] // load rk3
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 2
+ ldr q24, [x8, #96] // load rk6
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 0
+ ldr q23, [x8, #80] // load rk5
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 1
+ ldr q14, [x6, #48] // load h3l | h3h
+ ext v14.16b, v14.16b, v14.16b, #8
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 0
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 1
+ ldr q22, [x8, #64] // load rk4
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 2
+ ldr q13, [x6, #32] // load h2l | h2h
+ ext v13.16b, v13.16b, v13.16b, #8
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 1
+ ldr q30, [x8, #192] // load rk12
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 2
+ ldr q15, [x6, #80] // load h4l | h4h
+ ext v15.16b, v15.16b, v15.16b, #8
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 3
+ ldr q29, [x8, #176] // load rk11
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 2
+ ldr q26, [x8, #128] // load rk8
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 3
+ add w12, w12, #1 // CTR block 3
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 3
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 3
+ ld1 { v11.16b}, [x3]
+ ext v11.16b, v11.16b, v11.16b, #8
+ rev64 v11.16b, v11.16b
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 4
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 4
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 4
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 4
+ cmp x17, #12 // setup flags for AES-128/192/256 check
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 5
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 5
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 5
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 5
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 6
+ trn2 v17.2d, v14.2d, v15.2d // h4l | h3l
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 6
+ ldr q27, [x8, #144] // load rk9
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 6
+ ldr q12, [x6] // load h1l | h1h
+ ext v12.16b, v12.16b, v12.16b, #8
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 6
+ ldr q28, [x8, #160] // load rk10
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 7
+ trn1 v9.2d, v14.2d, v15.2d // h4h | h3h
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 7
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 7
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 7
+ trn2 v16.2d, v12.2d, v13.2d // h2l | h1l
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 8
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 8
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 8
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 8
+ b.lt Lenc_finish_first_blocks // branch if AES-128
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 9
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 9
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 9
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 9
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 10
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 10
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 10
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 10
+ b.eq Lenc_finish_first_blocks // branch if AES-192
+
+ aese v1.16b, v29.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 11
+ aese v2.16b, v29.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 11
+ aese v0.16b, v29.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 11
+ aese v3.16b, v29.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 11
+ aese v1.16b, v30.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 12
+ aese v2.16b, v30.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 12
+ aese v0.16b, v30.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 12
+ aese v3.16b, v30.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 12
+
+Lenc_finish_first_blocks:
+ cmp x0, x5 // check if we have <= 4 blocks
+ eor v17.16b, v17.16b, v9.16b // h4k | h3k
+ aese v2.16b, v31.16b // AES block 2 - round N-1
+ trn1 v8.2d, v12.2d, v13.2d // h2h | h1h
+ aese v1.16b, v31.16b // AES block 1 - round N-1
+ aese v0.16b, v31.16b // AES block 0 - round N-1
+ aese v3.16b, v31.16b // AES block 3 - round N-1
+ eor v16.16b, v16.16b, v8.16b // h2k | h1k
+ b.ge Lenc_tail // handle tail
+
+ ldp x19, x20, [x0, #16] // AES block 1 - load plaintext
+ rev w9, w12 // CTR block 4
+ ldp x6, x7, [x0, #0] // AES block 0 - load plaintext
+ ldp x23, x24, [x0, #48] // AES block 3 - load plaintext
+ ldp x21, x22, [x0, #32] // AES block 2 - load plaintext
+ add x0, x0, #64 // AES input_ptr update
+ eor x19, x19, x13 // AES block 1 - round N low
+ eor x20, x20, x14 // AES block 1 - round N high
+ fmov d5, x19 // AES block 1 - mov low
+ eor x6, x6, x13 // AES block 0 - round N low
+ eor x7, x7, x14 // AES block 0 - round N high
+ eor x24, x24, x14 // AES block 3 - round N high
+ fmov d4, x6 // AES block 0 - mov low
+ cmp x0, x5 // check if we have <= 8 blocks
+ fmov v4.d[1], x7 // AES block 0 - mov high
+ eor x23, x23, x13 // AES block 3 - round N low
+ eor x21, x21, x13 // AES block 2 - round N low
+ fmov v5.d[1], x20 // AES block 1 - mov high
+ fmov d6, x21 // AES block 2 - mov low
+ add w12, w12, #1 // CTR block 4
+ orr x9, x11, x9, lsl #32 // CTR block 4
+ fmov d7, x23 // AES block 3 - mov low
+ eor x22, x22, x14 // AES block 2 - round N high
+ fmov v6.d[1], x22 // AES block 2 - mov high
+ eor v4.16b, v4.16b, v0.16b // AES block 0 - result
+ fmov d0, x10 // CTR block 4
+ fmov v0.d[1], x9 // CTR block 4
+ rev w9, w12 // CTR block 5
+ add w12, w12, #1 // CTR block 5
+ eor v5.16b, v5.16b, v1.16b // AES block 1 - result
+ fmov d1, x10 // CTR block 5
+ orr x9, x11, x9, lsl #32 // CTR block 5
+ fmov v1.d[1], x9 // CTR block 5
+ rev w9, w12 // CTR block 6
+ st1 { v4.16b}, [x2], #16 // AES block 0 - store result
+ fmov v7.d[1], x24 // AES block 3 - mov high
+ orr x9, x11, x9, lsl #32 // CTR block 6
+ eor v6.16b, v6.16b, v2.16b // AES block 2 - result
+ st1 { v5.16b}, [x2], #16 // AES block 1 - store result
+ add w12, w12, #1 // CTR block 6
+ fmov d2, x10 // CTR block 6
+ fmov v2.d[1], x9 // CTR block 6
+ st1 { v6.16b}, [x2], #16 // AES block 2 - store result
+ rev w9, w12 // CTR block 7
+ orr x9, x11, x9, lsl #32 // CTR block 7
+ eor v7.16b, v7.16b, v3.16b // AES block 3 - result
+ st1 { v7.16b}, [x2], #16 // AES block 3 - store result
+ b.ge Lenc_prepretail // do prepretail
+
+Lenc_main_loop: // main loop start
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 0
+ rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free)
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 0
+ fmov d3, x10 // CTR block 4k+3
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 0
+ ext v11.16b, v11.16b, v11.16b, #8 // PRE 0
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 1
+ fmov v3.d[1], x9 // CTR block 4k+3
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 1
+ ldp x23, x24, [x0, #48] // AES block 4k+7 - load plaintext
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 1
+ ldp x21, x22, [x0, #32] // AES block 4k+6 - load plaintext
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 2
+ eor v4.16b, v4.16b, v11.16b // PRE 1
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 2
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 0
+ eor x23, x23, x13 // AES block 4k+7 - round N low
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 3
+ mov d10, v17.d[1] // GHASH block 4k - mid
+ pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high
+ eor x22, x22, x14 // AES block 4k+6 - round N high
+ mov d8, v4.d[1] // GHASH block 4k - mid
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 1
+ rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free)
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 4
+ pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low
+ eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 2
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 5
+ rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free)
+ pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high
+ pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid
+ rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free)
+ pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high
+ mov d4, v5.d[1] // GHASH block 4k+1 - mid
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 3
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 2
+ eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 3
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 4
+ mov d8, v6.d[1] // GHASH block 4k+2 - mid
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 3
+ eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 4
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 6
+ eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 4
+ pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 7
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 5
+ ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 5
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 8
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 5
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 6
+ eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid
+ pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high
+ pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 7
+ pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 6
+ ldp x19, x20, [x0, #16] // AES block 4k+5 - load plaintext
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 8
+ mov d4, v7.d[1] // GHASH block 4k+3 - mid
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 6
+ eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low
+ pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid
+ pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high
+ eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 7
+ eor x19, x19, x13 // AES block 4k+5 - round N low
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 8
+ eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 7
+ eor x21, x21, x13 // AES block 4k+6 - round N low
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 8
+ movi v8.8b, #0xc2
+ pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid
+ eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high
+ cmp x17, #12 // setup flags for AES-128/192/256 check
+ fmov d5, x19 // AES block 4k+5 - mov low
+ ldp x6, x7, [x0, #0] // AES block 4k+4 - load plaintext
+ b.lt Lenc_main_loop_continue // branch if AES-128
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 9
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 9
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 9
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 9
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 10
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 10
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 10
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 10
+ b.eq Lenc_main_loop_continue // branch if AES-192
+
+ aese v0.16b, v29.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 11
+ aese v1.16b, v29.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 11
+ aese v2.16b, v29.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 11
+ aese v3.16b, v29.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 11
+ aese v1.16b, v30.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 12
+ aese v0.16b, v30.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 12
+ aese v2.16b, v30.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 12
+ aese v3.16b, v30.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 12
+
+Lenc_main_loop_continue:
+ shl d8, d8, #56 // mod_constant
+ eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low
+ eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid
+ add w12, w12, #1 // CTR block 4k+3
+ eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
+ add x0, x0, #64 // AES input_ptr update
+ pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
+ rev w9, w12 // CTR block 4k+8
+ ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
+ eor x6, x6, x13 // AES block 4k+4 - round N low
+ eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up
+ eor x7, x7, x14 // AES block 4k+4 - round N high
+ fmov d4, x6 // AES block 4k+4 - mov low
+ orr x9, x11, x9, lsl #32 // CTR block 4k+8
+ eor v7.16b, v9.16b, v7.16b // MODULO - fold into mid
+ eor x20, x20, x14 // AES block 4k+5 - round N high
+ eor x24, x24, x14 // AES block 4k+7 - round N high
+ add w12, w12, #1 // CTR block 4k+8
+ aese v0.16b, v31.16b // AES block 4k+4 - round N-1
+ fmov v4.d[1], x7 // AES block 4k+4 - mov high
+ eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
+ fmov d7, x23 // AES block 4k+7 - mov low
+ aese v1.16b, v31.16b // AES block 4k+5 - round N-1
+ fmov v5.d[1], x20 // AES block 4k+5 - mov high
+ fmov d6, x21 // AES block 4k+6 - mov low
+ cmp x0, x5 // LOOP CONTROL
+ fmov v6.d[1], x22 // AES block 4k+6 - mov high
+ pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
+ eor v4.16b, v4.16b, v0.16b // AES block 4k+4 - result
+ fmov d0, x10 // CTR block 4k+8
+ fmov v0.d[1], x9 // CTR block 4k+8
+ rev w9, w12 // CTR block 4k+9
+ add w12, w12, #1 // CTR block 4k+9
+ eor v5.16b, v5.16b, v1.16b // AES block 4k+5 - result
+ fmov d1, x10 // CTR block 4k+9
+ orr x9, x11, x9, lsl #32 // CTR block 4k+9
+ fmov v1.d[1], x9 // CTR block 4k+9
+ aese v2.16b, v31.16b // AES block 4k+6 - round N-1
+ rev w9, w12 // CTR block 4k+10
+ st1 { v4.16b}, [x2], #16 // AES block 4k+4 - store result
+ orr x9, x11, x9, lsl #32 // CTR block 4k+10
+ eor v11.16b, v11.16b, v9.16b // MODULO - fold into low
+ fmov v7.d[1], x24 // AES block 4k+7 - mov high
+ ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
+ st1 { v5.16b}, [x2], #16 // AES block 4k+5 - store result
+ add w12, w12, #1 // CTR block 4k+10
+ aese v3.16b, v31.16b // AES block 4k+7 - round N-1
+ eor v6.16b, v6.16b, v2.16b // AES block 4k+6 - result
+ fmov d2, x10 // CTR block 4k+10
+ st1 { v6.16b}, [x2], #16 // AES block 4k+6 - store result
+ fmov v2.d[1], x9 // CTR block 4k+10
+ rev w9, w12 // CTR block 4k+11
+ eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
+ orr x9, x11, x9, lsl #32 // CTR block 4k+11
+ eor v7.16b, v7.16b, v3.16b // AES block 4k+7 - result
+ st1 { v7.16b}, [x2], #16 // AES block 4k+7 - store result
+ b.lt Lenc_main_loop
+
+Lenc_prepretail: // PREPRETAIL
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 0
+ rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free)
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 0
+ fmov d3, x10 // CTR block 4k+3
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 0
+ rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free)
+ fmov v3.d[1], x9 // CTR block 4k+3
+ ext v11.16b, v11.16b, v11.16b, #8 // PRE 0
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 1
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 1
+ eor v4.16b, v4.16b, v11.16b // PRE 1
+ rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free)
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 2
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 0
+ mov d10, v17.d[1] // GHASH block 4k - mid
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 1
+ pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low
+ mov d8, v4.d[1] // GHASH block 4k - mid
+ pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 3
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 2
+ eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 2
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 1
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 3
+ pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid
+ pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high
+ pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 2
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high
+ mov d4, v5.d[1] // GHASH block 4k+1 - mid
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 3
+ eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 3
+ eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid
+ mov d8, v6.d[1] // GHASH block 4k+2 - mid
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 4
+ rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free)
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 4
+ pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid
+ eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid
+ add w12, w12, #1 // CTR block 4k+3
+ pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 5
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 4
+ eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid
+ pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high
+ eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low
+ ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 5
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high
+ mov d4, v7.d[1] // GHASH block 4k+3 - mid
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 4
+ pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid
+ eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid
+ pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 5
+ pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid
+ eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 5
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 6
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 6
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 6
+ movi v8.8b, #0xc2
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 6
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 7
+ eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 7
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 7
+ shl d8, d8, #56 // mod_constant
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 8
+ eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid
+ pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 8
+ cmp x17, #12 // setup flags for AES-128/192/256 check
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 8
+ eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 7
+ eor v10.16b, v10.16b, v9.16b // karatsuba tidy up
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 8
+ pmull v4.1q, v9.1d, v8.1d
+ ext v9.16b, v9.16b, v9.16b, #8
+ eor v10.16b, v10.16b, v11.16b
+ b.lt Lenc_finish_prepretail // branch if AES-128
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 9
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 9
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 9
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 9
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 10
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 10
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 10
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 10
+ b.eq Lenc_finish_prepretail // branch if AES-192
+
+ aese v1.16b, v29.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 11
+ aese v0.16b, v29.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 11
+ aese v3.16b, v29.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 11
+ aese v2.16b, v29.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 11
+ aese v1.16b, v30.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 12
+ aese v0.16b, v30.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 12
+ aese v3.16b, v30.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 12
+ aese v2.16b, v30.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 12
+
+Lenc_finish_prepretail:
+ eor v10.16b, v10.16b, v4.16b
+ eor v10.16b, v10.16b, v9.16b
+ pmull v4.1q, v10.1d, v8.1d
+ ext v10.16b, v10.16b, v10.16b, #8
+ aese v1.16b, v31.16b // AES block 4k+5 - round N-1
+ eor v11.16b, v11.16b, v4.16b
+ aese v3.16b, v31.16b // AES block 4k+7 - round N-1
+ aese v0.16b, v31.16b // AES block 4k+4 - round N-1
+ aese v2.16b, v31.16b // AES block 4k+6 - round N-1
+ eor v11.16b, v11.16b, v10.16b
+
+Lenc_tail: // TAIL
+ ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag
+ sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process
+ ldp x6, x7, [x0], #16 // AES block 4k+4 - load plaintext
+ eor x6, x6, x13 // AES block 4k+4 - round N low
+ eor x7, x7, x14 // AES block 4k+4 - round N high
+ cmp x5, #48
+ fmov d4, x6 // AES block 4k+4 - mov low
+ fmov v4.d[1], x7 // AES block 4k+4 - mov high
+ eor v5.16b, v4.16b, v0.16b // AES block 4k+4 - result
+ b.gt Lenc_blocks_more_than_3
+ cmp x5, #32
+ mov v3.16b, v2.16b
+ movi v11.8b, #0
+ movi v9.8b, #0
+ sub w12, w12, #1
+ mov v2.16b, v1.16b
+ movi v10.8b, #0
+ b.gt Lenc_blocks_more_than_2
+ mov v3.16b, v1.16b
+ sub w12, w12, #1
+ cmp x5, #16
+ b.gt Lenc_blocks_more_than_1
+ sub w12, w12, #1
+ b Lenc_blocks_less_than_1
+Lenc_blocks_more_than_3: // blocks left > 3
+ st1 { v5.16b}, [x2], #16 // AES final-3 block - store result
+ ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high
+ rev64 v4.16b, v5.16b // GHASH final-3 block
+ eor x6, x6, x13 // AES final-2 block - round N low
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ eor x7, x7, x14 // AES final-2 block - round N high
+ mov d22, v4.d[1] // GHASH final-3 block - mid
+ fmov d5, x6 // AES final-2 block - mov low
+ fmov v5.d[1], x7 // AES final-2 block - mov high
+ eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid
+ movi v8.8b, #0 // suppress further partial tag feed in
+ mov d10, v17.d[1] // GHASH final-3 block - mid
+ pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low
+ pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high
+ pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid
+ eor v5.16b, v5.16b, v1.16b // AES final-2 block - result
+Lenc_blocks_more_than_2: // blocks left > 2
+ st1 { v5.16b}, [x2], #16 // AES final-2 block - store result
+ ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high
+ rev64 v4.16b, v5.16b // GHASH final-2 block
+ eor x6, x6, x13 // AES final-1 block - round N low
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ fmov d5, x6 // AES final-1 block - mov low
+ eor x7, x7, x14 // AES final-1 block - round N high
+ fmov v5.d[1], x7 // AES final-1 block - mov high
+ movi v8.8b, #0 // suppress further partial tag feed in
+ pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high
+ mov d22, v4.d[1] // GHASH final-2 block - mid
+ pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low
+ eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid
+ eor v5.16b, v5.16b, v2.16b // AES final-1 block - result
+ eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high
+ pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid
+ eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low
+ eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid
+Lenc_blocks_more_than_1: // blocks left > 1
+ st1 { v5.16b}, [x2], #16 // AES final-1 block - store result
+ rev64 v4.16b, v5.16b // GHASH final-1 block
+ ldp x6, x7, [x0], #16 // AES final block - load input low & high
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ movi v8.8b, #0 // suppress further partial tag feed in
+ eor x6, x6, x13 // AES final block - round N low
+ mov d22, v4.d[1] // GHASH final-1 block - mid
+ pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high
+ eor x7, x7, x14 // AES final block - round N high
+ eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid
+ eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high
+ ins v22.d[1], v22.d[0] // GHASH final-1 block - mid
+ fmov d5, x6 // AES final block - mov low
+ fmov v5.d[1], x7 // AES final block - mov high
+ pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid
+ pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low
+ eor v5.16b, v5.16b, v3.16b // AES final block - result
+ eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid
+ eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low
+Lenc_blocks_less_than_1: // blocks left <= 1
+ and x1, x1, #127 // bit_length %= 128
+ mvn x13, xzr // rkN_l = 0xffffffffffffffff
+ sub x1, x1, #128 // bit_length -= 128
+ neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128])
+ ld1 { v18.16b}, [x2] // load existing bytes where the possibly partial last block is to be stored
+ mvn x14, xzr // rkN_h = 0xffffffffffffffff
+ and x1, x1, #127 // bit_length %= 128
+ lsr x14, x14, x1 // rkN_h is mask for top 64b of last block
+ cmp x1, #64
+ csel x6, x13, x14, lt
+ csel x7, x14, xzr, lt
+ fmov d0, x6 // ctr0b is mask for last block
+ fmov v0.d[1], x7
+ and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits
+ rev64 v4.16b, v5.16b // GHASH final block
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ bif v5.16b, v18.16b, v0.16b // insert existing bytes in top end of result before storing
+ pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high
+ mov d8, v4.d[1] // GHASH final block - mid
+ rev w9, w12
+ pmull v21.1q, v4.1d, v12.1d // GHASH final block - low
+ eor v9.16b, v9.16b, v20.16b // GHASH final block - high
+ eor v8.8b, v8.8b, v4.8b // GHASH final block - mid
+ pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid
+ eor v11.16b, v11.16b, v21.16b // GHASH final block - low
+ eor v10.16b, v10.16b, v8.16b // GHASH final block - mid
+ movi v8.8b, #0xc2
+ eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
+ shl d8, d8, #56 // mod_constant
+ eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up
+ pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
+ ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
+ eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
+ eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid
+ pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
+ ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
+ str w9, [x16, #12] // store the updated counter
+ st1 { v5.16b}, [x2] // store all 16B
+ eor v11.16b, v11.16b, v9.16b // MODULO - fold into low
+ eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
+ ext v11.16b, v11.16b, v11.16b, #8
+ rev64 v11.16b, v11.16b
+ mov x0, x15
+ st1 { v11.16b }, [x3]
+ ldp x19, x20, [sp, #16]
+ ldp x21, x22, [sp, #32]
+ ldp x23, x24, [sp, #48]
+ ldp d8, d9, [sp, #64]
+ ldp d10, d11, [sp, #80]
+ ldp d12, d13, [sp, #96]
+ ldp d14, d15, [sp, #112]
+ ldp x29, x30, [sp], #128
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.globl _aes_gcm_dec_kernel
+.private_extern _aes_gcm_dec_kernel
+
+.align 4
+_aes_gcm_dec_kernel:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-128]!
+ mov x29, sp
+ stp x19, x20, [sp, #16]
+ mov x16, x4
+ mov x8, x5
+ stp x21, x22, [sp, #32]
+ stp x23, x24, [sp, #48]
+ stp d8, d9, [sp, #64]
+ stp d10, d11, [sp, #80]
+ stp d12, d13, [sp, #96]
+ stp d14, d15, [sp, #112]
+ ldr w17, [x8, #240]
+ add x19, x8, x17, lsl #4 // borrow input_l1 for last key
+ ldp x13, x14, [x19] // load round N keys
+ ldr q31, [x19, #-16] // load round N-1 keys
+ lsr x5, x1, #3 // byte_len
+ mov x15, x5
+ ldp x10, x11, [x16] // ctr96_b64, ctr96_t32
+ ldr q26, [x8, #128] // load rk8
+ sub x5, x5, #1 // byte_len - 1
+ ldr q25, [x8, #112] // load rk7
+ and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+ add x4, x0, x1, lsr #3 // end_input_ptr
+ ldr q24, [x8, #96] // load rk6
+ lsr x12, x11, #32
+ ldr q23, [x8, #80] // load rk5
+ orr w11, w11, w11
+ ldr q21, [x8, #48] // load rk3
+ add x5, x5, x0
+ rev w12, w12 // rev_ctr32
+ add w12, w12, #1 // increment rev_ctr32
+ fmov d3, x10 // CTR block 3
+ rev w9, w12 // CTR block 1
+ add w12, w12, #1 // CTR block 1
+ fmov d1, x10 // CTR block 1
+ orr x9, x11, x9, lsl #32 // CTR block 1
+ ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible
+ fmov v1.d[1], x9 // CTR block 1
+ rev w9, w12 // CTR block 2
+ add w12, w12, #1 // CTR block 2
+ fmov d2, x10 // CTR block 2
+ orr x9, x11, x9, lsl #32 // CTR block 2
+ fmov v2.d[1], x9 // CTR block 2
+ rev w9, w12 // CTR block 3
+ orr x9, x11, x9, lsl #32 // CTR block 3
+ ldr q18, [x8, #0] // load rk0
+ fmov v3.d[1], x9 // CTR block 3
+ add w12, w12, #1 // CTR block 3
+ ldr q22, [x8, #64] // load rk4
+ ldr q19, [x8, #16] // load rk1
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 0
+ ldr q14, [x6, #48] // load h3l | h3h
+ ext v14.16b, v14.16b, v14.16b, #8
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 0
+ ldr q15, [x6, #80] // load h4l | h4h
+ ext v15.16b, v15.16b, v15.16b, #8
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 0
+ ldr q13, [x6, #32] // load h2l | h2h
+ ext v13.16b, v13.16b, v13.16b, #8
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 0
+ ldr q20, [x8, #32] // load rk2
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 1
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 1
+ ld1 { v11.16b}, [x3]
+ ext v11.16b, v11.16b, v11.16b, #8
+ rev64 v11.16b, v11.16b
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 1
+ ldr q27, [x8, #144] // load rk9
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 1
+ ldr q30, [x8, #192] // load rk12
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 2
+ ldr q12, [x6] // load h1l | h1h
+ ext v12.16b, v12.16b, v12.16b, #8
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 2
+ ldr q28, [x8, #160] // load rk10
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 2
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 3
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 2
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 3
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 4
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 3
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 3
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 4
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 4
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 4
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 5
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 5
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 5
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 5
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 6
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 6
+ cmp x17, #12 // setup flags for AES-128/192/256 check
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 6
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 6
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 7
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 7
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 7
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 8
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 7
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 8
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 8
+ ldr q29, [x8, #176] // load rk11
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 8
+ b.lt Ldec_finish_first_blocks // branch if AES-128
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 9
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 9
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 9
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 9
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 10
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 10
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 10
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 10
+ b.eq Ldec_finish_first_blocks // branch if AES-192
+
+ aese v0.16b, v29.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 11
+ aese v3.16b, v29.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 11
+ aese v1.16b, v29.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 11
+ aese v2.16b, v29.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 11
+ aese v1.16b, v30.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 12
+ aese v0.16b, v30.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 12
+ aese v2.16b, v30.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 12
+ aese v3.16b, v30.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 12
+
+Ldec_finish_first_blocks:
+ cmp x0, x5 // check if we have <= 4 blocks
+ trn1 v9.2d, v14.2d, v15.2d // h4h | h3h
+ trn2 v17.2d, v14.2d, v15.2d // h4l | h3l
+ trn1 v8.2d, v12.2d, v13.2d // h2h | h1h
+ trn2 v16.2d, v12.2d, v13.2d // h2l | h1l
+ eor v17.16b, v17.16b, v9.16b // h4k | h3k
+ aese v1.16b, v31.16b // AES block 1 - round N-1
+ aese v2.16b, v31.16b // AES block 2 - round N-1
+ eor v16.16b, v16.16b, v8.16b // h2k | h1k
+ aese v3.16b, v31.16b // AES block 3 - round N-1
+ aese v0.16b, v31.16b // AES block 0 - round N-1
+ b.ge Ldec_tail // handle tail
+
+ ldr q4, [x0, #0] // AES block 0 - load ciphertext
+ ldr q5, [x0, #16] // AES block 1 - load ciphertext
+ rev w9, w12 // CTR block 4
+ eor v0.16b, v4.16b, v0.16b // AES block 0 - result
+ eor v1.16b, v5.16b, v1.16b // AES block 1 - result
+ rev64 v5.16b, v5.16b // GHASH block 1
+ ldr q7, [x0, #48] // AES block 3 - load ciphertext
+ mov x7, v0.d[1] // AES block 0 - mov high
+ mov x6, v0.d[0] // AES block 0 - mov low
+ rev64 v4.16b, v4.16b // GHASH block 0
+ add w12, w12, #1 // CTR block 4
+ fmov d0, x10 // CTR block 4
+ orr x9, x11, x9, lsl #32 // CTR block 4
+ fmov v0.d[1], x9 // CTR block 4
+ rev w9, w12 // CTR block 5
+ add w12, w12, #1 // CTR block 5
+ mov x19, v1.d[0] // AES block 1 - mov low
+ orr x9, x11, x9, lsl #32 // CTR block 5
+ mov x20, v1.d[1] // AES block 1 - mov high
+ eor x7, x7, x14 // AES block 0 - round N high
+ eor x6, x6, x13 // AES block 0 - round N low
+ stp x6, x7, [x2], #16 // AES block 0 - store result
+ fmov d1, x10 // CTR block 5
+ ldr q6, [x0, #32] // AES block 2 - load ciphertext
+ add x0, x0, #64 // AES input_ptr update
+ fmov v1.d[1], x9 // CTR block 5
+ rev w9, w12 // CTR block 6
+ add w12, w12, #1 // CTR block 6
+ eor x19, x19, x13 // AES block 1 - round N low
+ orr x9, x11, x9, lsl #32 // CTR block 6
+ eor x20, x20, x14 // AES block 1 - round N high
+ stp x19, x20, [x2], #16 // AES block 1 - store result
+ eor v2.16b, v6.16b, v2.16b // AES block 2 - result
+ cmp x0, x5 // check if we have <= 8 blocks
+ b.ge Ldec_prepretail // do prepretail
+
+Ldec_main_loop: // main loop start
+ mov x21, v2.d[0] // AES block 4k+2 - mov low
+ ext v11.16b, v11.16b, v11.16b, #8 // PRE 0
+ eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 0
+ mov x22, v2.d[1] // AES block 4k+2 - mov high
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 0
+ fmov d2, x10 // CTR block 4k+6
+ fmov v2.d[1], x9 // CTR block 4k+6
+ eor v4.16b, v4.16b, v11.16b // PRE 1
+ rev w9, w12 // CTR block 4k+7
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 1
+ mov x24, v3.d[1] // AES block 4k+3 - mov high
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 1
+ mov x23, v3.d[0] // AES block 4k+3 - mov low
+ pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high
+ mov d8, v4.d[1] // GHASH block 4k - mid
+ fmov d3, x10 // CTR block 4k+7
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 2
+ orr x9, x11, x9, lsl #32 // CTR block 4k+7
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 0
+ fmov v3.d[1], x9 // CTR block 4k+7
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 2
+ eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 3
+ eor x22, x22, x14 // AES block 4k+2 - round N high
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 1
+ mov d10, v17.d[1] // GHASH block 4k - mid
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 3
+ rev64 v6.16b, v6.16b // GHASH block 4k+2
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 0
+ eor x21, x21, x13 // AES block 4k+2 - round N low
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 2
+ stp x21, x22, [x2], #16 // AES block 4k+2 - store result
+ pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low
+ pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 3
+ rev64 v7.16b, v7.16b // GHASH block 4k+3
+ pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid
+ eor x23, x23, x13 // AES block 4k+3 - round N low
+ pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low
+ eor x24, x24, x14 // AES block 4k+3 - round N high
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 4
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 1
+ mov d4, v5.d[1] // GHASH block 4k+1 - mid
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 4
+ eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 5
+ add w12, w12, #1 // CTR block 4k+7
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 2
+ mov d8, v6.d[1] // GHASH block 4k+2 - mid
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 4
+ eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid
+ pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 3
+ eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 5
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 5
+ eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low
+ pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid
+ rev w9, w12 // CTR block 4k+8
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 6
+ ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 6
+ add w12, w12, #1 // CTR block 4k+8
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 4
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 7
+ eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 7
+ pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high
+ mov d6, v7.d[1] // GHASH block 4k+3 - mid
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 5
+ pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 8
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 6
+ pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low
+ orr x9, x11, x9, lsl #32 // CTR block 4k+8
+ eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid
+ pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high
+ cmp x17, #12 // setup flags for AES-128/192/256 check
+ eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 8
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 6
+ eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high
+ pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid
+ movi v8.8b, #0xc2
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 7
+ eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 7
+ shl d8, d8, #56 // mod_constant
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 8
+ eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 8
+ b.lt Ldec_main_loop_continue // branch if AES-128
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 9
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 9
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 9
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 9
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 10
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 10
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 10
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 10
+ b.eq Ldec_main_loop_continue // branch if AES-192
+
+ aese v0.16b, v29.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 11
+ aese v1.16b, v29.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 11
+ aese v2.16b, v29.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 11
+ aese v3.16b, v29.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 11
+ aese v0.16b, v30.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 12
+ aese v1.16b, v30.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 12
+ aese v2.16b, v30.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 12
+ aese v3.16b, v30.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 12
+
+Ldec_main_loop_continue:
+ pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
+ eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
+ ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext
+ aese v0.16b, v31.16b // AES block 4k+4 - round N-1
+ ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
+ eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up
+ ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext
+ eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result
+ stp x23, x24, [x2], #16 // AES block 4k+3 - store result
+ eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
+ ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext
+ ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext
+ mov x7, v0.d[1] // AES block 4k+4 - mov high
+ eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid
+ aese v1.16b, v31.16b // AES block 4k+5 - round N-1
+ add x0, x0, #64 // AES input_ptr update
+ mov x6, v0.d[0] // AES block 4k+4 - mov low
+ fmov d0, x10 // CTR block 4k+8
+ fmov v0.d[1], x9 // CTR block 4k+8
+ pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
+ eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result
+ rev w9, w12 // CTR block 4k+9
+ aese v2.16b, v31.16b // AES block 4k+6 - round N-1
+ orr x9, x11, x9, lsl #32 // CTR block 4k+9
+ cmp x0, x5 // LOOP CONTROL
+ add w12, w12, #1 // CTR block 4k+9
+ eor x6, x6, x13 // AES block 4k+4 - round N low
+ eor x7, x7, x14 // AES block 4k+4 - round N high
+ mov x20, v1.d[1] // AES block 4k+5 - mov high
+ eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result
+ eor v11.16b, v11.16b, v8.16b // MODULO - fold into low
+ mov x19, v1.d[0] // AES block 4k+5 - mov low
+ fmov d1, x10 // CTR block 4k+9
+ ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
+ fmov v1.d[1], x9 // CTR block 4k+9
+ rev w9, w12 // CTR block 4k+10
+ add w12, w12, #1 // CTR block 4k+10
+ aese v3.16b, v31.16b // AES block 4k+7 - round N-1
+ orr x9, x11, x9, lsl #32 // CTR block 4k+10
+ rev64 v5.16b, v5.16b // GHASH block 4k+5
+ eor x20, x20, x14 // AES block 4k+5 - round N high
+ stp x6, x7, [x2], #16 // AES block 4k+4 - store result
+ eor x19, x19, x13 // AES block 4k+5 - round N low
+ stp x19, x20, [x2], #16 // AES block 4k+5 - store result
+ rev64 v4.16b, v4.16b // GHASH block 4k+4
+ eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
+ b.lt Ldec_main_loop
+
+Ldec_prepretail: // PREPRETAIL
+ ext v11.16b, v11.16b, v11.16b, #8 // PRE 0
+ mov x21, v2.d[0] // AES block 4k+2 - mov low
+ eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 0
+ mov x22, v2.d[1] // AES block 4k+2 - mov high
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 0
+ fmov d2, x10 // CTR block 4k+6
+ fmov v2.d[1], x9 // CTR block 4k+6
+ rev w9, w12 // CTR block 4k+7
+ eor v4.16b, v4.16b, v11.16b // PRE 1
+ rev64 v6.16b, v6.16b // GHASH block 4k+2
+ orr x9, x11, x9, lsl #32 // CTR block 4k+7
+ mov x23, v3.d[0] // AES block 4k+3 - mov low
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 1
+ mov x24, v3.d[1] // AES block 4k+3 - mov high
+ pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low
+ mov d8, v4.d[1] // GHASH block 4k - mid
+ fmov d3, x10 // CTR block 4k+7
+ pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high
+ fmov v3.d[1], x9 // CTR block 4k+7
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 0
+ mov d10, v17.d[1] // GHASH block 4k - mid
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 1
+ eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid
+ pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 1
+ rev64 v7.16b, v7.16b // GHASH block 4k+3
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 0
+ pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high
+ pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 1
+ mov d4, v5.d[1] // GHASH block 4k+1 - mid
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 2
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 2
+ eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 2
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 3
+ mov d8, v6.d[1] // GHASH block 4k+2 - mid
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 2
+ eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid
+ pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 4
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 3
+ eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid
+ pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 5
+ eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 4
+ pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high
+ eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid
+ pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 5
+ ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 3
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 3
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high
+ pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 4
+ mov d6, v7.d[1] // GHASH block 4k+3 - mid
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 4
+ pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 5
+ eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 5
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 6
+ eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 6
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 6
+ movi v8.8b, #0xc2
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 6
+ eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low
+ pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 7
+ cmp x17, #12 // setup flags for AES-128/192/256 check
+ eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 7
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 7
+ eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 8
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 7
+ eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 8
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 8
+ shl d8, d8, #56 // mod_constant
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 8
+ b.lt Ldec_finish_prepretail // branch if AES-128
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 9
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 9
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 9
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 9
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 10
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 10
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 10
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 10
+ b.eq Ldec_finish_prepretail // branch if AES-192
+
+ aese v2.16b, v29.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 11
+ aese v0.16b, v29.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 11
+ aese v1.16b, v29.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 11
+ aese v2.16b, v30.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 12
+ aese v3.16b, v29.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 11
+ aese v1.16b, v30.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 12
+ aese v0.16b, v30.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 12
+ aese v3.16b, v30.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 12
+
+Ldec_finish_prepretail:
+ eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up
+ pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
+ ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
+ eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
+ eor x22, x22, x14 // AES block 4k+2 - round N high
+ eor x23, x23, x13 // AES block 4k+3 - round N low
+ eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid
+ add w12, w12, #1 // CTR block 4k+7
+ eor x21, x21, x13 // AES block 4k+2 - round N low
+ pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
+ eor x24, x24, x14 // AES block 4k+3 - round N high
+ stp x21, x22, [x2], #16 // AES block 4k+2 - store result
+ ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
+ stp x23, x24, [x2], #16 // AES block 4k+3 - store result
+
+ eor v11.16b, v11.16b, v8.16b // MODULO - fold into low
+ aese v1.16b, v31.16b // AES block 4k+5 - round N-1
+ aese v0.16b, v31.16b // AES block 4k+4 - round N-1
+ aese v3.16b, v31.16b // AES block 4k+7 - round N-1
+ aese v2.16b, v31.16b // AES block 4k+6 - round N-1
+ eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
+
+Ldec_tail: // TAIL
+ sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process
+ ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext
+ eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result
+ mov x6, v0.d[0] // AES block 4k+4 - mov low
+ mov x7, v0.d[1] // AES block 4k+4 - mov high
+ ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag
+ cmp x5, #48
+ eor x6, x6, x13 // AES block 4k+4 - round N low
+ eor x7, x7, x14 // AES block 4k+4 - round N high
+ b.gt Ldec_blocks_more_than_3
+ sub w12, w12, #1
+ mov v3.16b, v2.16b
+ movi v10.8b, #0
+ movi v11.8b, #0
+ cmp x5, #32
+ movi v9.8b, #0
+ mov v2.16b, v1.16b
+ b.gt Ldec_blocks_more_than_2
+ sub w12, w12, #1
+ mov v3.16b, v1.16b
+ cmp x5, #16
+ b.gt Ldec_blocks_more_than_1
+ sub w12, w12, #1
+ b Ldec_blocks_less_than_1
+Ldec_blocks_more_than_3: // blocks left > 3
+ rev64 v4.16b, v5.16b // GHASH final-3 block
+ ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext
+ stp x6, x7, [x2], #16 // AES final-3 block - store result
+ mov d10, v17.d[1] // GHASH final-3 block - mid
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ eor v0.16b, v5.16b, v1.16b // AES final-2 block - result
+ mov d22, v4.d[1] // GHASH final-3 block - mid
+ mov x6, v0.d[0] // AES final-2 block - mov low
+ mov x7, v0.d[1] // AES final-2 block - mov high
+ eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid
+ movi v8.8b, #0 // suppress further partial tag feed in
+ pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high
+ pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid
+ eor x6, x6, x13 // AES final-2 block - round N low
+ pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low
+ eor x7, x7, x14 // AES final-2 block - round N high
+Ldec_blocks_more_than_2: // blocks left > 2
+ rev64 v4.16b, v5.16b // GHASH final-2 block
+ ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ stp x6, x7, [x2], #16 // AES final-2 block - store result
+ eor v0.16b, v5.16b, v2.16b // AES final-1 block - result
+ mov d22, v4.d[1] // GHASH final-2 block - mid
+ pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low
+ pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high
+ eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid
+ mov x6, v0.d[0] // AES final-1 block - mov low
+ mov x7, v0.d[1] // AES final-1 block - mov high
+ eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low
+ movi v8.8b, #0 // suppress further partial tag feed in
+ pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid
+ eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high
+ eor x6, x6, x13 // AES final-1 block - round N low
+ eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid
+ eor x7, x7, x14 // AES final-1 block - round N high
+Ldec_blocks_more_than_1: // blocks left > 1
+ stp x6, x7, [x2], #16 // AES final-1 block - store result
+ rev64 v4.16b, v5.16b // GHASH final-1 block
+ ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ movi v8.8b, #0 // suppress further partial tag feed in
+ mov d22, v4.d[1] // GHASH final-1 block - mid
+ eor v0.16b, v5.16b, v3.16b // AES final block - result
+ pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high
+ eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid
+ pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low
+ mov x6, v0.d[0] // AES final block - mov low
+ ins v22.d[1], v22.d[0] // GHASH final-1 block - mid
+ mov x7, v0.d[1] // AES final block - mov high
+ pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid
+ eor x6, x6, x13 // AES final block - round N low
+ eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low
+ eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high
+ eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid
+ eor x7, x7, x14 // AES final block - round N high
+Ldec_blocks_less_than_1: // blocks left <= 1
+ and x1, x1, #127 // bit_length %= 128
+ mvn x14, xzr // rkN_h = 0xffffffffffffffff
+ sub x1, x1, #128 // bit_length -= 128
+ mvn x13, xzr // rkN_l = 0xffffffffffffffff
+ ldp x4, x5, [x2] // load existing bytes we need to not overwrite
+ neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128])
+ and x1, x1, #127 // bit_length %= 128
+ lsr x14, x14, x1 // rkN_h is mask for top 64b of last block
+ cmp x1, #64
+ csel x9, x13, x14, lt
+ csel x10, x14, xzr, lt
+ fmov d0, x9 // ctr0b is mask for last block
+ and x6, x6, x9
+ mov v0.d[1], x10
+ bic x4, x4, x9 // mask out low existing bytes
+ rev w9, w12
+ bic x5, x5, x10 // mask out high existing bytes
+ orr x6, x6, x4
+ and x7, x7, x10
+ orr x7, x7, x5
+ and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits
+ rev64 v4.16b, v5.16b // GHASH final block
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ pmull v21.1q, v4.1d, v12.1d // GHASH final block - low
+ mov d8, v4.d[1] // GHASH final block - mid
+ eor v8.8b, v8.8b, v4.8b // GHASH final block - mid
+ pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high
+ pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid
+ eor v9.16b, v9.16b, v20.16b // GHASH final block - high
+ eor v11.16b, v11.16b, v21.16b // GHASH final block - low
+ eor v10.16b, v10.16b, v8.16b // GHASH final block - mid
+ movi v8.8b, #0xc2
+ eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
+ shl d8, d8, #56 // mod_constant
+ eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up
+ pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
+ ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
+ eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
+ eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid
+ pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
+ ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
+ eor v11.16b, v11.16b, v8.16b // MODULO - fold into low
+ stp x6, x7, [x2]
+ str w9, [x16, #12] // store the updated counter
+ eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
+ ext v11.16b, v11.16b, v11.16b, #8
+ rev64 v11.16b, v11.16b
+ mov x0, x15
+ st1 { v11.16b }, [x3]
+ ldp x19, x20, [sp, #16]
+ ldp x21, x22, [sp, #32]
+ ldp x23, x24, [sp, #48]
+ ldp d8, d9, [sp, #64]
+ ldp d10, d11, [sp, #80]
+ ldp d12, d13, [sp, #96]
+ ldp d14, d15, [sp, #112]
+ ldp x29, x30, [sp], #128
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+#endif
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/bcm/aesv8-gcm-armv8-linux.S b/gen/bcm/aesv8-gcm-armv8-linux.S
new file mode 100644
index 0000000..4283f93
--- /dev/null
+++ b/gen/bcm/aesv8-gcm-armv8-linux.S
@@ -0,0 +1,1555 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+#if __ARM_MAX_ARCH__ >= 8
+
+.arch armv8-a+crypto
+.text
+.globl aes_gcm_enc_kernel
+.hidden aes_gcm_enc_kernel
+.type aes_gcm_enc_kernel,%function
+.align 4
+aes_gcm_enc_kernel:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-128]!
+ mov x29, sp
+ stp x19, x20, [sp, #16]
+ mov x16, x4
+ mov x8, x5
+ stp x21, x22, [sp, #32]
+ stp x23, x24, [sp, #48]
+ stp d8, d9, [sp, #64]
+ stp d10, d11, [sp, #80]
+ stp d12, d13, [sp, #96]
+ stp d14, d15, [sp, #112]
+ ldr w17, [x8, #240]
+ add x19, x8, x17, lsl #4 // borrow input_l1 for last key
+ ldp x13, x14, [x19] // load round N keys
+ ldr q31, [x19, #-16] // load round N-1 keys
+ add x4, x0, x1, lsr #3 // end_input_ptr
+ lsr x5, x1, #3 // byte_len
+ mov x15, x5
+ ldp x10, x11, [x16] // ctr96_b64, ctr96_t32
+ ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible
+ sub x5, x5, #1 // byte_len - 1
+ ldr q18, [x8, #0] // load rk0
+ and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+ ldr q25, [x8, #112] // load rk7
+ add x5, x5, x0
+ lsr x12, x11, #32
+ fmov d2, x10 // CTR block 2
+ orr w11, w11, w11
+ rev w12, w12 // rev_ctr32
+ fmov d1, x10 // CTR block 1
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 0
+ add w12, w12, #1 // increment rev_ctr32
+ rev w9, w12 // CTR block 1
+ fmov d3, x10 // CTR block 3
+ orr x9, x11, x9, lsl #32 // CTR block 1
+ add w12, w12, #1 // CTR block 1
+ ldr q19, [x8, #16] // load rk1
+ fmov v1.d[1], x9 // CTR block 1
+ rev w9, w12 // CTR block 2
+ add w12, w12, #1 // CTR block 2
+ orr x9, x11, x9, lsl #32 // CTR block 2
+ ldr q20, [x8, #32] // load rk2
+ fmov v2.d[1], x9 // CTR block 2
+ rev w9, w12 // CTR block 3
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 1
+ orr x9, x11, x9, lsl #32 // CTR block 3
+ fmov v3.d[1], x9 // CTR block 3
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 0
+ ldr q21, [x8, #48] // load rk3
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 2
+ ldr q24, [x8, #96] // load rk6
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 0
+ ldr q23, [x8, #80] // load rk5
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 1
+ ldr q14, [x6, #48] // load h3l | h3h
+ ext v14.16b, v14.16b, v14.16b, #8
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 0
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 1
+ ldr q22, [x8, #64] // load rk4
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 2
+ ldr q13, [x6, #32] // load h2l | h2h
+ ext v13.16b, v13.16b, v13.16b, #8
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 1
+ ldr q30, [x8, #192] // load rk12
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 2
+ ldr q15, [x6, #80] // load h4l | h4h
+ ext v15.16b, v15.16b, v15.16b, #8
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 3
+ ldr q29, [x8, #176] // load rk11
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 2
+ ldr q26, [x8, #128] // load rk8
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 3
+ add w12, w12, #1 // CTR block 3
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 3
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 3
+ ld1 { v11.16b}, [x3]
+ ext v11.16b, v11.16b, v11.16b, #8
+ rev64 v11.16b, v11.16b
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 4
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 4
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 4
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 4
+ cmp x17, #12 // setup flags for AES-128/192/256 check
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 5
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 5
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 5
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 5
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 6
+ trn2 v17.2d, v14.2d, v15.2d // h4l | h3l
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 6
+ ldr q27, [x8, #144] // load rk9
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 6
+ ldr q12, [x6] // load h1l | h1h
+ ext v12.16b, v12.16b, v12.16b, #8
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 6
+ ldr q28, [x8, #160] // load rk10
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 7
+ trn1 v9.2d, v14.2d, v15.2d // h4h | h3h
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 7
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 7
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 7
+ trn2 v16.2d, v12.2d, v13.2d // h2l | h1l
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 8
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 8
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 8
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 8
+ b.lt .Lenc_finish_first_blocks // branch if AES-128
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 9
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 9
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 9
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 9
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 10
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 10
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 10
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 10
+ b.eq .Lenc_finish_first_blocks // branch if AES-192
+
+ aese v1.16b, v29.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 11
+ aese v2.16b, v29.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 11
+ aese v0.16b, v29.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 11
+ aese v3.16b, v29.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 11
+ aese v1.16b, v30.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 12
+ aese v2.16b, v30.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 12
+ aese v0.16b, v30.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 12
+ aese v3.16b, v30.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 12
+
+.Lenc_finish_first_blocks:
+ cmp x0, x5 // check if we have <= 4 blocks
+ eor v17.16b, v17.16b, v9.16b // h4k | h3k
+ aese v2.16b, v31.16b // AES block 2 - round N-1
+ trn1 v8.2d, v12.2d, v13.2d // h2h | h1h
+ aese v1.16b, v31.16b // AES block 1 - round N-1
+ aese v0.16b, v31.16b // AES block 0 - round N-1
+ aese v3.16b, v31.16b // AES block 3 - round N-1
+ eor v16.16b, v16.16b, v8.16b // h2k | h1k
+ b.ge .Lenc_tail // handle tail
+
+ ldp x19, x20, [x0, #16] // AES block 1 - load plaintext
+ rev w9, w12 // CTR block 4
+ ldp x6, x7, [x0, #0] // AES block 0 - load plaintext
+ ldp x23, x24, [x0, #48] // AES block 3 - load plaintext
+ ldp x21, x22, [x0, #32] // AES block 2 - load plaintext
+ add x0, x0, #64 // AES input_ptr update
+ eor x19, x19, x13 // AES block 1 - round N low
+ eor x20, x20, x14 // AES block 1 - round N high
+ fmov d5, x19 // AES block 1 - mov low
+ eor x6, x6, x13 // AES block 0 - round N low
+ eor x7, x7, x14 // AES block 0 - round N high
+ eor x24, x24, x14 // AES block 3 - round N high
+ fmov d4, x6 // AES block 0 - mov low
+ cmp x0, x5 // check if we have <= 8 blocks
+ fmov v4.d[1], x7 // AES block 0 - mov high
+ eor x23, x23, x13 // AES block 3 - round N low
+ eor x21, x21, x13 // AES block 2 - round N low
+ fmov v5.d[1], x20 // AES block 1 - mov high
+ fmov d6, x21 // AES block 2 - mov low
+ add w12, w12, #1 // CTR block 4
+ orr x9, x11, x9, lsl #32 // CTR block 4
+ fmov d7, x23 // AES block 3 - mov low
+ eor x22, x22, x14 // AES block 2 - round N high
+ fmov v6.d[1], x22 // AES block 2 - mov high
+ eor v4.16b, v4.16b, v0.16b // AES block 0 - result
+ fmov d0, x10 // CTR block 4
+ fmov v0.d[1], x9 // CTR block 4
+ rev w9, w12 // CTR block 5
+ add w12, w12, #1 // CTR block 5
+ eor v5.16b, v5.16b, v1.16b // AES block 1 - result
+ fmov d1, x10 // CTR block 5
+ orr x9, x11, x9, lsl #32 // CTR block 5
+ fmov v1.d[1], x9 // CTR block 5
+ rev w9, w12 // CTR block 6
+ st1 { v4.16b}, [x2], #16 // AES block 0 - store result
+ fmov v7.d[1], x24 // AES block 3 - mov high
+ orr x9, x11, x9, lsl #32 // CTR block 6
+ eor v6.16b, v6.16b, v2.16b // AES block 2 - result
+ st1 { v5.16b}, [x2], #16 // AES block 1 - store result
+ add w12, w12, #1 // CTR block 6
+ fmov d2, x10 // CTR block 6
+ fmov v2.d[1], x9 // CTR block 6
+ st1 { v6.16b}, [x2], #16 // AES block 2 - store result
+ rev w9, w12 // CTR block 7
+ orr x9, x11, x9, lsl #32 // CTR block 7
+ eor v7.16b, v7.16b, v3.16b // AES block 3 - result
+ st1 { v7.16b}, [x2], #16 // AES block 3 - store result
+ b.ge .Lenc_prepretail // do prepretail
+
+.Lenc_main_loop: // main loop start
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 0
+ rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free)
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 0
+ fmov d3, x10 // CTR block 4k+3
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 0
+ ext v11.16b, v11.16b, v11.16b, #8 // PRE 0
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 1
+ fmov v3.d[1], x9 // CTR block 4k+3
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 1
+ ldp x23, x24, [x0, #48] // AES block 4k+7 - load plaintext
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 1
+ ldp x21, x22, [x0, #32] // AES block 4k+6 - load plaintext
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 2
+ eor v4.16b, v4.16b, v11.16b // PRE 1
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 2
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 0
+ eor x23, x23, x13 // AES block 4k+7 - round N low
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 3
+ mov d10, v17.d[1] // GHASH block 4k - mid
+ pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high
+ eor x22, x22, x14 // AES block 4k+6 - round N high
+ mov d8, v4.d[1] // GHASH block 4k - mid
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 1
+ rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free)
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 4
+ pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low
+ eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 2
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 5
+ rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free)
+ pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high
+ pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid
+ rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free)
+ pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high
+ mov d4, v5.d[1] // GHASH block 4k+1 - mid
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 3
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 2
+ eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 3
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 4
+ mov d8, v6.d[1] // GHASH block 4k+2 - mid
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 3
+ eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 4
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 6
+ eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 4
+ pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 7
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 5
+ ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 5
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 8
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 5
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 6
+ eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid
+ pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high
+ pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 7
+ pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 6
+ ldp x19, x20, [x0, #16] // AES block 4k+5 - load plaintext
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 8
+ mov d4, v7.d[1] // GHASH block 4k+3 - mid
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 6
+ eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low
+ pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid
+ pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high
+ eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 7
+ eor x19, x19, x13 // AES block 4k+5 - round N low
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 8
+ eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 7
+ eor x21, x21, x13 // AES block 4k+6 - round N low
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 8
+ movi v8.8b, #0xc2
+ pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid
+ eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high
+ cmp x17, #12 // setup flags for AES-128/192/256 check
+ fmov d5, x19 // AES block 4k+5 - mov low
+ ldp x6, x7, [x0, #0] // AES block 4k+4 - load plaintext
+ b.lt .Lenc_main_loop_continue // branch if AES-128
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 9
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 9
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 9
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 9
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 10
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 10
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 10
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 10
+ b.eq .Lenc_main_loop_continue // branch if AES-192
+
+ aese v0.16b, v29.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 11
+ aese v1.16b, v29.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 11
+ aese v2.16b, v29.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 11
+ aese v3.16b, v29.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 11
+ aese v1.16b, v30.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 12
+ aese v0.16b, v30.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 12
+ aese v2.16b, v30.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 12
+ aese v3.16b, v30.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 12
+
+.Lenc_main_loop_continue:
+ shl d8, d8, #56 // mod_constant
+ eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low
+ eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid
+ add w12, w12, #1 // CTR block 4k+3
+ eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
+ add x0, x0, #64 // AES input_ptr update
+ pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
+ rev w9, w12 // CTR block 4k+8
+ ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
+ eor x6, x6, x13 // AES block 4k+4 - round N low
+ eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up
+ eor x7, x7, x14 // AES block 4k+4 - round N high
+ fmov d4, x6 // AES block 4k+4 - mov low
+ orr x9, x11, x9, lsl #32 // CTR block 4k+8
+ eor v7.16b, v9.16b, v7.16b // MODULO - fold into mid
+ eor x20, x20, x14 // AES block 4k+5 - round N high
+ eor x24, x24, x14 // AES block 4k+7 - round N high
+ add w12, w12, #1 // CTR block 4k+8
+ aese v0.16b, v31.16b // AES block 4k+4 - round N-1
+ fmov v4.d[1], x7 // AES block 4k+4 - mov high
+ eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
+ fmov d7, x23 // AES block 4k+7 - mov low
+ aese v1.16b, v31.16b // AES block 4k+5 - round N-1
+ fmov v5.d[1], x20 // AES block 4k+5 - mov high
+ fmov d6, x21 // AES block 4k+6 - mov low
+ cmp x0, x5 // .LOOP CONTROL
+ fmov v6.d[1], x22 // AES block 4k+6 - mov high
+ pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
+ eor v4.16b, v4.16b, v0.16b // AES block 4k+4 - result
+ fmov d0, x10 // CTR block 4k+8
+ fmov v0.d[1], x9 // CTR block 4k+8
+ rev w9, w12 // CTR block 4k+9
+ add w12, w12, #1 // CTR block 4k+9
+ eor v5.16b, v5.16b, v1.16b // AES block 4k+5 - result
+ fmov d1, x10 // CTR block 4k+9
+ orr x9, x11, x9, lsl #32 // CTR block 4k+9
+ fmov v1.d[1], x9 // CTR block 4k+9
+ aese v2.16b, v31.16b // AES block 4k+6 - round N-1
+ rev w9, w12 // CTR block 4k+10
+ st1 { v4.16b}, [x2], #16 // AES block 4k+4 - store result
+ orr x9, x11, x9, lsl #32 // CTR block 4k+10
+ eor v11.16b, v11.16b, v9.16b // MODULO - fold into low
+ fmov v7.d[1], x24 // AES block 4k+7 - mov high
+ ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
+ st1 { v5.16b}, [x2], #16 // AES block 4k+5 - store result
+ add w12, w12, #1 // CTR block 4k+10
+ aese v3.16b, v31.16b // AES block 4k+7 - round N-1
+ eor v6.16b, v6.16b, v2.16b // AES block 4k+6 - result
+ fmov d2, x10 // CTR block 4k+10
+ st1 { v6.16b}, [x2], #16 // AES block 4k+6 - store result
+ fmov v2.d[1], x9 // CTR block 4k+10
+ rev w9, w12 // CTR block 4k+11
+ eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
+ orr x9, x11, x9, lsl #32 // CTR block 4k+11
+ eor v7.16b, v7.16b, v3.16b // AES block 4k+7 - result
+ st1 { v7.16b}, [x2], #16 // AES block 4k+7 - store result
+ b.lt .Lenc_main_loop
+
+.Lenc_prepretail: // PREPRETAIL
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 0
+ rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free)
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 0
+ fmov d3, x10 // CTR block 4k+3
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 0
+ rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free)
+ fmov v3.d[1], x9 // CTR block 4k+3
+ ext v11.16b, v11.16b, v11.16b, #8 // PRE 0
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 1
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 1
+ eor v4.16b, v4.16b, v11.16b // PRE 1
+ rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free)
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 2
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 0
+ mov d10, v17.d[1] // GHASH block 4k - mid
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 1
+ pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low
+ mov d8, v4.d[1] // GHASH block 4k - mid
+ pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 3
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 2
+ eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 2
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 1
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 3
+ pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid
+ pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high
+ pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 2
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high
+ mov d4, v5.d[1] // GHASH block 4k+1 - mid
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 3
+ eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 3
+ eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid
+ mov d8, v6.d[1] // GHASH block 4k+2 - mid
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 4
+ rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free)
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 4
+ pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid
+ eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid
+ add w12, w12, #1 // CTR block 4k+3
+ pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 5
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 4
+ eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid
+ pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high
+ eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low
+ ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 5
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high
+ mov d4, v7.d[1] // GHASH block 4k+3 - mid
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 4
+ pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid
+ eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid
+ pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 5
+ pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid
+ eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 5
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 6
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 6
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 6
+ movi v8.8b, #0xc2
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 6
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 7
+ eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 7
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 7
+ shl d8, d8, #56 // mod_constant
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 8
+ eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid
+ pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 8
+ cmp x17, #12 // setup flags for AES-128/192/256 check
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 8
+ eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 7
+ eor v10.16b, v10.16b, v9.16b // karatsuba tidy up
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 8
+ pmull v4.1q, v9.1d, v8.1d
+ ext v9.16b, v9.16b, v9.16b, #8
+ eor v10.16b, v10.16b, v11.16b
+ b.lt .Lenc_finish_prepretail // branch if AES-128
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 9
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 9
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 9
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 9
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 10
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 10
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 10
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 10
+ b.eq .Lenc_finish_prepretail // branch if AES-192
+
+ aese v1.16b, v29.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 11
+ aese v0.16b, v29.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 11
+ aese v3.16b, v29.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 11
+ aese v2.16b, v29.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 11
+ aese v1.16b, v30.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 12
+ aese v0.16b, v30.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 12
+ aese v3.16b, v30.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 12
+ aese v2.16b, v30.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 12
+
+.Lenc_finish_prepretail:
+ eor v10.16b, v10.16b, v4.16b
+ eor v10.16b, v10.16b, v9.16b
+ pmull v4.1q, v10.1d, v8.1d
+ ext v10.16b, v10.16b, v10.16b, #8
+ aese v1.16b, v31.16b // AES block 4k+5 - round N-1
+ eor v11.16b, v11.16b, v4.16b
+ aese v3.16b, v31.16b // AES block 4k+7 - round N-1
+ aese v0.16b, v31.16b // AES block 4k+4 - round N-1
+ aese v2.16b, v31.16b // AES block 4k+6 - round N-1
+ eor v11.16b, v11.16b, v10.16b
+
+.Lenc_tail: // TAIL
+ ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag
+ sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process
+ ldp x6, x7, [x0], #16 // AES block 4k+4 - load plaintext
+ eor x6, x6, x13 // AES block 4k+4 - round N low
+ eor x7, x7, x14 // AES block 4k+4 - round N high
+ cmp x5, #48
+ fmov d4, x6 // AES block 4k+4 - mov low
+ fmov v4.d[1], x7 // AES block 4k+4 - mov high
+ eor v5.16b, v4.16b, v0.16b // AES block 4k+4 - result
+ b.gt .Lenc_blocks_more_than_3
+ cmp x5, #32
+ mov v3.16b, v2.16b
+ movi v11.8b, #0
+ movi v9.8b, #0
+ sub w12, w12, #1
+ mov v2.16b, v1.16b
+ movi v10.8b, #0
+ b.gt .Lenc_blocks_more_than_2
+ mov v3.16b, v1.16b
+ sub w12, w12, #1
+ cmp x5, #16
+ b.gt .Lenc_blocks_more_than_1
+ sub w12, w12, #1
+ b .Lenc_blocks_less_than_1
+.Lenc_blocks_more_than_3: // blocks left > 3
+ st1 { v5.16b}, [x2], #16 // AES final-3 block - store result
+ ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high
+ rev64 v4.16b, v5.16b // GHASH final-3 block
+ eor x6, x6, x13 // AES final-2 block - round N low
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ eor x7, x7, x14 // AES final-2 block - round N high
+ mov d22, v4.d[1] // GHASH final-3 block - mid
+ fmov d5, x6 // AES final-2 block - mov low
+ fmov v5.d[1], x7 // AES final-2 block - mov high
+ eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid
+ movi v8.8b, #0 // suppress further partial tag feed in
+ mov d10, v17.d[1] // GHASH final-3 block - mid
+ pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low
+ pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high
+ pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid
+ eor v5.16b, v5.16b, v1.16b // AES final-2 block - result
+.Lenc_blocks_more_than_2: // blocks left > 2
+ st1 { v5.16b}, [x2], #16 // AES final-2 block - store result
+ ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high
+ rev64 v4.16b, v5.16b // GHASH final-2 block
+ eor x6, x6, x13 // AES final-1 block - round N low
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ fmov d5, x6 // AES final-1 block - mov low
+ eor x7, x7, x14 // AES final-1 block - round N high
+ fmov v5.d[1], x7 // AES final-1 block - mov high
+ movi v8.8b, #0 // suppress further partial tag feed in
+ pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high
+ mov d22, v4.d[1] // GHASH final-2 block - mid
+ pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low
+ eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid
+ eor v5.16b, v5.16b, v2.16b // AES final-1 block - result
+ eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high
+ pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid
+ eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low
+ eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid
+.Lenc_blocks_more_than_1: // blocks left > 1
+ st1 { v5.16b}, [x2], #16 // AES final-1 block - store result
+ rev64 v4.16b, v5.16b // GHASH final-1 block
+ ldp x6, x7, [x0], #16 // AES final block - load input low & high
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ movi v8.8b, #0 // suppress further partial tag feed in
+ eor x6, x6, x13 // AES final block - round N low
+ mov d22, v4.d[1] // GHASH final-1 block - mid
+ pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high
+ eor x7, x7, x14 // AES final block - round N high
+ eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid
+ eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high
+ ins v22.d[1], v22.d[0] // GHASH final-1 block - mid
+ fmov d5, x6 // AES final block - mov low
+ fmov v5.d[1], x7 // AES final block - mov high
+ pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid
+ pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low
+ eor v5.16b, v5.16b, v3.16b // AES final block - result
+ eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid
+ eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low
+.Lenc_blocks_less_than_1: // blocks left <= 1
+ and x1, x1, #127 // bit_length %= 128
+ mvn x13, xzr // rkN_l = 0xffffffffffffffff
+ sub x1, x1, #128 // bit_length -= 128
+ neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128])
+ ld1 { v18.16b}, [x2] // load existing bytes where the possibly partial last block is to be stored
+ mvn x14, xzr // rkN_h = 0xffffffffffffffff
+ and x1, x1, #127 // bit_length %= 128
+ lsr x14, x14, x1 // rkN_h is mask for top 64b of last block
+ cmp x1, #64
+ csel x6, x13, x14, lt
+ csel x7, x14, xzr, lt
+ fmov d0, x6 // ctr0b is mask for last block
+ fmov v0.d[1], x7
+ and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits
+ rev64 v4.16b, v5.16b // GHASH final block
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ bif v5.16b, v18.16b, v0.16b // insert existing bytes in top end of result before storing
+ pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high
+ mov d8, v4.d[1] // GHASH final block - mid
+ rev w9, w12
+ pmull v21.1q, v4.1d, v12.1d // GHASH final block - low
+ eor v9.16b, v9.16b, v20.16b // GHASH final block - high
+ eor v8.8b, v8.8b, v4.8b // GHASH final block - mid
+ pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid
+ eor v11.16b, v11.16b, v21.16b // GHASH final block - low
+ eor v10.16b, v10.16b, v8.16b // GHASH final block - mid
+ movi v8.8b, #0xc2
+ eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
+ shl d8, d8, #56 // mod_constant
+ eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up
+ pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
+ ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
+ eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
+ eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid
+ pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
+ ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
+ str w9, [x16, #12] // store the updated counter
+ st1 { v5.16b}, [x2] // store all 16B
+ eor v11.16b, v11.16b, v9.16b // MODULO - fold into low
+ eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
+ ext v11.16b, v11.16b, v11.16b, #8
+ rev64 v11.16b, v11.16b
+ mov x0, x15
+ st1 { v11.16b }, [x3]
+ ldp x19, x20, [sp, #16]
+ ldp x21, x22, [sp, #32]
+ ldp x23, x24, [sp, #48]
+ ldp d8, d9, [sp, #64]
+ ldp d10, d11, [sp, #80]
+ ldp d12, d13, [sp, #96]
+ ldp d14, d15, [sp, #112]
+ ldp x29, x30, [sp], #128
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size aes_gcm_enc_kernel,.-aes_gcm_enc_kernel
+.globl aes_gcm_dec_kernel
+.hidden aes_gcm_dec_kernel
+.type aes_gcm_dec_kernel,%function
+.align 4
+aes_gcm_dec_kernel:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-128]!
+ mov x29, sp
+ stp x19, x20, [sp, #16]
+ mov x16, x4
+ mov x8, x5
+ stp x21, x22, [sp, #32]
+ stp x23, x24, [sp, #48]
+ stp d8, d9, [sp, #64]
+ stp d10, d11, [sp, #80]
+ stp d12, d13, [sp, #96]
+ stp d14, d15, [sp, #112]
+ ldr w17, [x8, #240]
+ add x19, x8, x17, lsl #4 // borrow input_l1 for last key
+ ldp x13, x14, [x19] // load round N keys
+ ldr q31, [x19, #-16] // load round N-1 keys
+ lsr x5, x1, #3 // byte_len
+ mov x15, x5
+ ldp x10, x11, [x16] // ctr96_b64, ctr96_t32
+ ldr q26, [x8, #128] // load rk8
+ sub x5, x5, #1 // byte_len - 1
+ ldr q25, [x8, #112] // load rk7
+ and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+ add x4, x0, x1, lsr #3 // end_input_ptr
+ ldr q24, [x8, #96] // load rk6
+ lsr x12, x11, #32
+ ldr q23, [x8, #80] // load rk5
+ orr w11, w11, w11
+ ldr q21, [x8, #48] // load rk3
+ add x5, x5, x0
+ rev w12, w12 // rev_ctr32
+ add w12, w12, #1 // increment rev_ctr32
+ fmov d3, x10 // CTR block 3
+ rev w9, w12 // CTR block 1
+ add w12, w12, #1 // CTR block 1
+ fmov d1, x10 // CTR block 1
+ orr x9, x11, x9, lsl #32 // CTR block 1
+ ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible
+ fmov v1.d[1], x9 // CTR block 1
+ rev w9, w12 // CTR block 2
+ add w12, w12, #1 // CTR block 2
+ fmov d2, x10 // CTR block 2
+ orr x9, x11, x9, lsl #32 // CTR block 2
+ fmov v2.d[1], x9 // CTR block 2
+ rev w9, w12 // CTR block 3
+ orr x9, x11, x9, lsl #32 // CTR block 3
+ ldr q18, [x8, #0] // load rk0
+ fmov v3.d[1], x9 // CTR block 3
+ add w12, w12, #1 // CTR block 3
+ ldr q22, [x8, #64] // load rk4
+ ldr q19, [x8, #16] // load rk1
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 0
+ ldr q14, [x6, #48] // load h3l | h3h
+ ext v14.16b, v14.16b, v14.16b, #8
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 0
+ ldr q15, [x6, #80] // load h4l | h4h
+ ext v15.16b, v15.16b, v15.16b, #8
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 0
+ ldr q13, [x6, #32] // load h2l | h2h
+ ext v13.16b, v13.16b, v13.16b, #8
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 0
+ ldr q20, [x8, #32] // load rk2
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 1
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 1
+ ld1 { v11.16b}, [x3]
+ ext v11.16b, v11.16b, v11.16b, #8
+ rev64 v11.16b, v11.16b
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 1
+ ldr q27, [x8, #144] // load rk9
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 1
+ ldr q30, [x8, #192] // load rk12
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 2
+ ldr q12, [x6] // load h1l | h1h
+ ext v12.16b, v12.16b, v12.16b, #8
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 2
+ ldr q28, [x8, #160] // load rk10
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 2
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 3
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 2
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 3
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 4
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 3
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 3
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 4
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 4
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 4
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 5
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 5
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 5
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 5
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 6
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 6
+ cmp x17, #12 // setup flags for AES-128/192/256 check
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 6
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 6
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 7
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 7
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 7
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 8
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 7
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 8
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 8
+ ldr q29, [x8, #176] // load rk11
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 8
+ b.lt .Ldec_finish_first_blocks // branch if AES-128
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 9
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 9
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 9
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 9
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 10
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 10
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 10
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 10
+ b.eq .Ldec_finish_first_blocks // branch if AES-192
+
+ aese v0.16b, v29.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 11
+ aese v3.16b, v29.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 11
+ aese v1.16b, v29.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 11
+ aese v2.16b, v29.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 11
+ aese v1.16b, v30.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 12
+ aese v0.16b, v30.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 12
+ aese v2.16b, v30.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 12
+ aese v3.16b, v30.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 12
+
+.Ldec_finish_first_blocks:
+ cmp x0, x5 // check if we have <= 4 blocks
+ trn1 v9.2d, v14.2d, v15.2d // h4h | h3h
+ trn2 v17.2d, v14.2d, v15.2d // h4l | h3l
+ trn1 v8.2d, v12.2d, v13.2d // h2h | h1h
+ trn2 v16.2d, v12.2d, v13.2d // h2l | h1l
+ eor v17.16b, v17.16b, v9.16b // h4k | h3k
+ aese v1.16b, v31.16b // AES block 1 - round N-1
+ aese v2.16b, v31.16b // AES block 2 - round N-1
+ eor v16.16b, v16.16b, v8.16b // h2k | h1k
+ aese v3.16b, v31.16b // AES block 3 - round N-1
+ aese v0.16b, v31.16b // AES block 0 - round N-1
+ b.ge .Ldec_tail // handle tail
+
+ ldr q4, [x0, #0] // AES block 0 - load ciphertext
+ ldr q5, [x0, #16] // AES block 1 - load ciphertext
+ rev w9, w12 // CTR block 4
+ eor v0.16b, v4.16b, v0.16b // AES block 0 - result
+ eor v1.16b, v5.16b, v1.16b // AES block 1 - result
+ rev64 v5.16b, v5.16b // GHASH block 1
+ ldr q7, [x0, #48] // AES block 3 - load ciphertext
+ mov x7, v0.d[1] // AES block 0 - mov high
+ mov x6, v0.d[0] // AES block 0 - mov low
+ rev64 v4.16b, v4.16b // GHASH block 0
+ add w12, w12, #1 // CTR block 4
+ fmov d0, x10 // CTR block 4
+ orr x9, x11, x9, lsl #32 // CTR block 4
+ fmov v0.d[1], x9 // CTR block 4
+ rev w9, w12 // CTR block 5
+ add w12, w12, #1 // CTR block 5
+ mov x19, v1.d[0] // AES block 1 - mov low
+ orr x9, x11, x9, lsl #32 // CTR block 5
+ mov x20, v1.d[1] // AES block 1 - mov high
+ eor x7, x7, x14 // AES block 0 - round N high
+ eor x6, x6, x13 // AES block 0 - round N low
+ stp x6, x7, [x2], #16 // AES block 0 - store result
+ fmov d1, x10 // CTR block 5
+ ldr q6, [x0, #32] // AES block 2 - load ciphertext
+ add x0, x0, #64 // AES input_ptr update
+ fmov v1.d[1], x9 // CTR block 5
+ rev w9, w12 // CTR block 6
+ add w12, w12, #1 // CTR block 6
+ eor x19, x19, x13 // AES block 1 - round N low
+ orr x9, x11, x9, lsl #32 // CTR block 6
+ eor x20, x20, x14 // AES block 1 - round N high
+ stp x19, x20, [x2], #16 // AES block 1 - store result
+ eor v2.16b, v6.16b, v2.16b // AES block 2 - result
+ cmp x0, x5 // check if we have <= 8 blocks
+ b.ge .Ldec_prepretail // do prepretail
+
+.Ldec_main_loop: // main loop start
+ mov x21, v2.d[0] // AES block 4k+2 - mov low
+ ext v11.16b, v11.16b, v11.16b, #8 // PRE 0
+ eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 0
+ mov x22, v2.d[1] // AES block 4k+2 - mov high
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 0
+ fmov d2, x10 // CTR block 4k+6
+ fmov v2.d[1], x9 // CTR block 4k+6
+ eor v4.16b, v4.16b, v11.16b // PRE 1
+ rev w9, w12 // CTR block 4k+7
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 1
+ mov x24, v3.d[1] // AES block 4k+3 - mov high
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 1
+ mov x23, v3.d[0] // AES block 4k+3 - mov low
+ pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high
+ mov d8, v4.d[1] // GHASH block 4k - mid
+ fmov d3, x10 // CTR block 4k+7
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 2
+ orr x9, x11, x9, lsl #32 // CTR block 4k+7
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 0
+ fmov v3.d[1], x9 // CTR block 4k+7
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 2
+ eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 3
+ eor x22, x22, x14 // AES block 4k+2 - round N high
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 1
+ mov d10, v17.d[1] // GHASH block 4k - mid
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 3
+ rev64 v6.16b, v6.16b // GHASH block 4k+2
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 0
+ eor x21, x21, x13 // AES block 4k+2 - round N low
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 2
+ stp x21, x22, [x2], #16 // AES block 4k+2 - store result
+ pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low
+ pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 3
+ rev64 v7.16b, v7.16b // GHASH block 4k+3
+ pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid
+ eor x23, x23, x13 // AES block 4k+3 - round N low
+ pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low
+ eor x24, x24, x14 // AES block 4k+3 - round N high
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 4
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 1
+ mov d4, v5.d[1] // GHASH block 4k+1 - mid
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 4
+ eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 5
+ add w12, w12, #1 // CTR block 4k+7
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 2
+ mov d8, v6.d[1] // GHASH block 4k+2 - mid
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 4
+ eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid
+ pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 3
+ eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 5
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 5
+ eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low
+ pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid
+ rev w9, w12 // CTR block 4k+8
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 6
+ ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 6
+ add w12, w12, #1 // CTR block 4k+8
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 4
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 7
+ eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 7
+ pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high
+ mov d6, v7.d[1] // GHASH block 4k+3 - mid
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 5
+ pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 8
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 6
+ pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low
+ orr x9, x11, x9, lsl #32 // CTR block 4k+8
+ eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid
+ pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high
+ cmp x17, #12 // setup flags for AES-128/192/256 check
+ eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 8
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 6
+ eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high
+ pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid
+ movi v8.8b, #0xc2
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 7
+ eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 7
+ shl d8, d8, #56 // mod_constant
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 8
+ eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 8
+ b.lt .Ldec_main_loop_continue // branch if AES-128
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 9
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 9
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 9
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 9
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 10
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 10
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 10
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 10
+ b.eq .Ldec_main_loop_continue // branch if AES-192
+
+ aese v0.16b, v29.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 11
+ aese v1.16b, v29.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 11
+ aese v2.16b, v29.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 11
+ aese v3.16b, v29.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 11
+ aese v0.16b, v30.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 12
+ aese v1.16b, v30.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 12
+ aese v2.16b, v30.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 12
+ aese v3.16b, v30.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 12
+
+.Ldec_main_loop_continue:
+ pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
+ eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
+ ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext
+ aese v0.16b, v31.16b // AES block 4k+4 - round N-1
+ ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
+ eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up
+ ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext
+ eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result
+ stp x23, x24, [x2], #16 // AES block 4k+3 - store result
+ eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
+ ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext
+ ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext
+ mov x7, v0.d[1] // AES block 4k+4 - mov high
+ eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid
+ aese v1.16b, v31.16b // AES block 4k+5 - round N-1
+ add x0, x0, #64 // AES input_ptr update
+ mov x6, v0.d[0] // AES block 4k+4 - mov low
+ fmov d0, x10 // CTR block 4k+8
+ fmov v0.d[1], x9 // CTR block 4k+8
+ pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
+ eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result
+ rev w9, w12 // CTR block 4k+9
+ aese v2.16b, v31.16b // AES block 4k+6 - round N-1
+ orr x9, x11, x9, lsl #32 // CTR block 4k+9
+ cmp x0, x5 // .LOOP CONTROL
+ add w12, w12, #1 // CTR block 4k+9
+ eor x6, x6, x13 // AES block 4k+4 - round N low
+ eor x7, x7, x14 // AES block 4k+4 - round N high
+ mov x20, v1.d[1] // AES block 4k+5 - mov high
+ eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result
+ eor v11.16b, v11.16b, v8.16b // MODULO - fold into low
+ mov x19, v1.d[0] // AES block 4k+5 - mov low
+ fmov d1, x10 // CTR block 4k+9
+ ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
+ fmov v1.d[1], x9 // CTR block 4k+9
+ rev w9, w12 // CTR block 4k+10
+ add w12, w12, #1 // CTR block 4k+10
+ aese v3.16b, v31.16b // AES block 4k+7 - round N-1
+ orr x9, x11, x9, lsl #32 // CTR block 4k+10
+ rev64 v5.16b, v5.16b // GHASH block 4k+5
+ eor x20, x20, x14 // AES block 4k+5 - round N high
+ stp x6, x7, [x2], #16 // AES block 4k+4 - store result
+ eor x19, x19, x13 // AES block 4k+5 - round N low
+ stp x19, x20, [x2], #16 // AES block 4k+5 - store result
+ rev64 v4.16b, v4.16b // GHASH block 4k+4
+ eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
+ b.lt .Ldec_main_loop
+
+.Ldec_prepretail: // PREPRETAIL
+ ext v11.16b, v11.16b, v11.16b, #8 // PRE 0
+ mov x21, v2.d[0] // AES block 4k+2 - mov low
+ eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 0
+ mov x22, v2.d[1] // AES block 4k+2 - mov high
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 0
+ fmov d2, x10 // CTR block 4k+6
+ fmov v2.d[1], x9 // CTR block 4k+6
+ rev w9, w12 // CTR block 4k+7
+ eor v4.16b, v4.16b, v11.16b // PRE 1
+ rev64 v6.16b, v6.16b // GHASH block 4k+2
+ orr x9, x11, x9, lsl #32 // CTR block 4k+7
+ mov x23, v3.d[0] // AES block 4k+3 - mov low
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 1
+ mov x24, v3.d[1] // AES block 4k+3 - mov high
+ pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low
+ mov d8, v4.d[1] // GHASH block 4k - mid
+ fmov d3, x10 // CTR block 4k+7
+ pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high
+ fmov v3.d[1], x9 // CTR block 4k+7
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 0
+ mov d10, v17.d[1] // GHASH block 4k - mid
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 1
+ eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid
+ pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 1
+ rev64 v7.16b, v7.16b // GHASH block 4k+3
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 0
+ pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high
+ pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 1
+ mov d4, v5.d[1] // GHASH block 4k+1 - mid
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 2
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 2
+ eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 2
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 3
+ mov d8, v6.d[1] // GHASH block 4k+2 - mid
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 2
+ eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid
+ pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 4
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 3
+ eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid
+ pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 5
+ eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 4
+ pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high
+ eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid
+ pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 5
+ ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 3
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 3
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high
+ pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 4
+ mov d6, v7.d[1] // GHASH block 4k+3 - mid
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 4
+ pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 5
+ eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 5
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 6
+ eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 6
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 6
+ movi v8.8b, #0xc2
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 6
+ eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low
+ pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 7
+ cmp x17, #12 // setup flags for AES-128/192/256 check
+ eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 7
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 7
+ eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 8
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 7
+ eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 8
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 8
+ shl d8, d8, #56 // mod_constant
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 8
+ b.lt .Ldec_finish_prepretail // branch if AES-128
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 9
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 9
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 9
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 9
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 10
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 10
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 10
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 10
+ b.eq .Ldec_finish_prepretail // branch if AES-192
+
+ aese v2.16b, v29.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 11
+ aese v0.16b, v29.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 11
+ aese v1.16b, v29.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 11
+ aese v2.16b, v30.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 12
+ aese v3.16b, v29.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 11
+ aese v1.16b, v30.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 12
+ aese v0.16b, v30.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 12
+ aese v3.16b, v30.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 12
+
+.Ldec_finish_prepretail:
+ eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up
+ pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
+ ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
+ eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
+ eor x22, x22, x14 // AES block 4k+2 - round N high
+ eor x23, x23, x13 // AES block 4k+3 - round N low
+ eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid
+ add w12, w12, #1 // CTR block 4k+7
+ eor x21, x21, x13 // AES block 4k+2 - round N low
+ pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
+ eor x24, x24, x14 // AES block 4k+3 - round N high
+ stp x21, x22, [x2], #16 // AES block 4k+2 - store result
+ ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
+ stp x23, x24, [x2], #16 // AES block 4k+3 - store result
+
+ eor v11.16b, v11.16b, v8.16b // MODULO - fold into low
+ aese v1.16b, v31.16b // AES block 4k+5 - round N-1
+ aese v0.16b, v31.16b // AES block 4k+4 - round N-1
+ aese v3.16b, v31.16b // AES block 4k+7 - round N-1
+ aese v2.16b, v31.16b // AES block 4k+6 - round N-1
+ eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
+
+.Ldec_tail: // TAIL
+ sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process
+ ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext
+ eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result
+ mov x6, v0.d[0] // AES block 4k+4 - mov low
+ mov x7, v0.d[1] // AES block 4k+4 - mov high
+ ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag
+ cmp x5, #48
+ eor x6, x6, x13 // AES block 4k+4 - round N low
+ eor x7, x7, x14 // AES block 4k+4 - round N high
+ b.gt .Ldec_blocks_more_than_3
+ sub w12, w12, #1
+ mov v3.16b, v2.16b
+ movi v10.8b, #0
+ movi v11.8b, #0
+ cmp x5, #32
+ movi v9.8b, #0
+ mov v2.16b, v1.16b
+ b.gt .Ldec_blocks_more_than_2
+ sub w12, w12, #1
+ mov v3.16b, v1.16b
+ cmp x5, #16
+ b.gt .Ldec_blocks_more_than_1
+ sub w12, w12, #1
+ b .Ldec_blocks_less_than_1
+.Ldec_blocks_more_than_3: // blocks left > 3
+ rev64 v4.16b, v5.16b // GHASH final-3 block
+ ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext
+ stp x6, x7, [x2], #16 // AES final-3 block - store result
+ mov d10, v17.d[1] // GHASH final-3 block - mid
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ eor v0.16b, v5.16b, v1.16b // AES final-2 block - result
+ mov d22, v4.d[1] // GHASH final-3 block - mid
+ mov x6, v0.d[0] // AES final-2 block - mov low
+ mov x7, v0.d[1] // AES final-2 block - mov high
+ eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid
+ movi v8.8b, #0 // suppress further partial tag feed in
+ pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high
+ pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid
+ eor x6, x6, x13 // AES final-2 block - round N low
+ pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low
+ eor x7, x7, x14 // AES final-2 block - round N high
+.Ldec_blocks_more_than_2: // blocks left > 2
+ rev64 v4.16b, v5.16b // GHASH final-2 block
+ ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ stp x6, x7, [x2], #16 // AES final-2 block - store result
+ eor v0.16b, v5.16b, v2.16b // AES final-1 block - result
+ mov d22, v4.d[1] // GHASH final-2 block - mid
+ pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low
+ pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high
+ eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid
+ mov x6, v0.d[0] // AES final-1 block - mov low
+ mov x7, v0.d[1] // AES final-1 block - mov high
+ eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low
+ movi v8.8b, #0 // suppress further partial tag feed in
+ pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid
+ eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high
+ eor x6, x6, x13 // AES final-1 block - round N low
+ eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid
+ eor x7, x7, x14 // AES final-1 block - round N high
+.Ldec_blocks_more_than_1: // blocks left > 1
+ stp x6, x7, [x2], #16 // AES final-1 block - store result
+ rev64 v4.16b, v5.16b // GHASH final-1 block
+ ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ movi v8.8b, #0 // suppress further partial tag feed in
+ mov d22, v4.d[1] // GHASH final-1 block - mid
+ eor v0.16b, v5.16b, v3.16b // AES final block - result
+ pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high
+ eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid
+ pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low
+ mov x6, v0.d[0] // AES final block - mov low
+ ins v22.d[1], v22.d[0] // GHASH final-1 block - mid
+ mov x7, v0.d[1] // AES final block - mov high
+ pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid
+ eor x6, x6, x13 // AES final block - round N low
+ eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low
+ eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high
+ eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid
+ eor x7, x7, x14 // AES final block - round N high
+.Ldec_blocks_less_than_1: // blocks left <= 1
+ and x1, x1, #127 // bit_length %= 128
+ mvn x14, xzr // rkN_h = 0xffffffffffffffff
+ sub x1, x1, #128 // bit_length -= 128
+ mvn x13, xzr // rkN_l = 0xffffffffffffffff
+ ldp x4, x5, [x2] // load existing bytes we need to not overwrite
+ neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128])
+ and x1, x1, #127 // bit_length %= 128
+ lsr x14, x14, x1 // rkN_h is mask for top 64b of last block
+ cmp x1, #64
+ csel x9, x13, x14, lt
+ csel x10, x14, xzr, lt
+ fmov d0, x9 // ctr0b is mask for last block
+ and x6, x6, x9
+ mov v0.d[1], x10
+ bic x4, x4, x9 // mask out low existing bytes
+ rev w9, w12
+ bic x5, x5, x10 // mask out high existing bytes
+ orr x6, x6, x4
+ and x7, x7, x10
+ orr x7, x7, x5
+ and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits
+ rev64 v4.16b, v5.16b // GHASH final block
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ pmull v21.1q, v4.1d, v12.1d // GHASH final block - low
+ mov d8, v4.d[1] // GHASH final block - mid
+ eor v8.8b, v8.8b, v4.8b // GHASH final block - mid
+ pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high
+ pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid
+ eor v9.16b, v9.16b, v20.16b // GHASH final block - high
+ eor v11.16b, v11.16b, v21.16b // GHASH final block - low
+ eor v10.16b, v10.16b, v8.16b // GHASH final block - mid
+ movi v8.8b, #0xc2
+ eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
+ shl d8, d8, #56 // mod_constant
+ eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up
+ pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
+ ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
+ eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
+ eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid
+ pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
+ ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
+ eor v11.16b, v11.16b, v8.16b // MODULO - fold into low
+ stp x6, x7, [x2]
+ str w9, [x16, #12] // store the updated counter
+ eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
+ ext v11.16b, v11.16b, v11.16b, #8
+ rev64 v11.16b, v11.16b
+ mov x0, x15
+ st1 { v11.16b }, [x3]
+ ldp x19, x20, [sp, #16]
+ ldp x21, x22, [sp, #32]
+ ldp x23, x24, [sp, #48]
+ ldp d8, d9, [sp, #64]
+ ldp d10, d11, [sp, #80]
+ ldp d12, d13, [sp, #96]
+ ldp d14, d15, [sp, #112]
+ ldp x29, x30, [sp], #128
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size aes_gcm_dec_kernel,.-aes_gcm_dec_kernel
+#endif
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/aesv8-gcm-armv8-win.S b/gen/bcm/aesv8-gcm-armv8-win.S
new file mode 100644
index 0000000..1233796
--- /dev/null
+++ b/gen/bcm/aesv8-gcm-armv8-win.S
@@ -0,0 +1,1559 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+#if __ARM_MAX_ARCH__ >= 8
+
+.arch armv8-a+crypto
+.text
+.globl aes_gcm_enc_kernel
+
+.def aes_gcm_enc_kernel
+ .type 32
+.endef
+.align 4
+aes_gcm_enc_kernel:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-128]!
+ mov x29, sp
+ stp x19, x20, [sp, #16]
+ mov x16, x4
+ mov x8, x5
+ stp x21, x22, [sp, #32]
+ stp x23, x24, [sp, #48]
+ stp d8, d9, [sp, #64]
+ stp d10, d11, [sp, #80]
+ stp d12, d13, [sp, #96]
+ stp d14, d15, [sp, #112]
+ ldr w17, [x8, #240]
+ add x19, x8, x17, lsl #4 // borrow input_l1 for last key
+ ldp x13, x14, [x19] // load round N keys
+ ldr q31, [x19, #-16] // load round N-1 keys
+ add x4, x0, x1, lsr #3 // end_input_ptr
+ lsr x5, x1, #3 // byte_len
+ mov x15, x5
+ ldp x10, x11, [x16] // ctr96_b64, ctr96_t32
+ ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible
+ sub x5, x5, #1 // byte_len - 1
+ ldr q18, [x8, #0] // load rk0
+ and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+ ldr q25, [x8, #112] // load rk7
+ add x5, x5, x0
+ lsr x12, x11, #32
+ fmov d2, x10 // CTR block 2
+ orr w11, w11, w11
+ rev w12, w12 // rev_ctr32
+ fmov d1, x10 // CTR block 1
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 0
+ add w12, w12, #1 // increment rev_ctr32
+ rev w9, w12 // CTR block 1
+ fmov d3, x10 // CTR block 3
+ orr x9, x11, x9, lsl #32 // CTR block 1
+ add w12, w12, #1 // CTR block 1
+ ldr q19, [x8, #16] // load rk1
+ fmov v1.d[1], x9 // CTR block 1
+ rev w9, w12 // CTR block 2
+ add w12, w12, #1 // CTR block 2
+ orr x9, x11, x9, lsl #32 // CTR block 2
+ ldr q20, [x8, #32] // load rk2
+ fmov v2.d[1], x9 // CTR block 2
+ rev w9, w12 // CTR block 3
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 1
+ orr x9, x11, x9, lsl #32 // CTR block 3
+ fmov v3.d[1], x9 // CTR block 3
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 0
+ ldr q21, [x8, #48] // load rk3
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 2
+ ldr q24, [x8, #96] // load rk6
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 0
+ ldr q23, [x8, #80] // load rk5
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 1
+ ldr q14, [x6, #48] // load h3l | h3h
+ ext v14.16b, v14.16b, v14.16b, #8
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 0
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 1
+ ldr q22, [x8, #64] // load rk4
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 2
+ ldr q13, [x6, #32] // load h2l | h2h
+ ext v13.16b, v13.16b, v13.16b, #8
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 1
+ ldr q30, [x8, #192] // load rk12
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 2
+ ldr q15, [x6, #80] // load h4l | h4h
+ ext v15.16b, v15.16b, v15.16b, #8
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 3
+ ldr q29, [x8, #176] // load rk11
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 2
+ ldr q26, [x8, #128] // load rk8
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 3
+ add w12, w12, #1 // CTR block 3
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 3
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 3
+ ld1 { v11.16b}, [x3]
+ ext v11.16b, v11.16b, v11.16b, #8
+ rev64 v11.16b, v11.16b
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 4
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 4
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 4
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 4
+ cmp x17, #12 // setup flags for AES-128/192/256 check
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 5
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 5
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 5
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 5
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 6
+ trn2 v17.2d, v14.2d, v15.2d // h4l | h3l
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 6
+ ldr q27, [x8, #144] // load rk9
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 6
+ ldr q12, [x6] // load h1l | h1h
+ ext v12.16b, v12.16b, v12.16b, #8
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 6
+ ldr q28, [x8, #160] // load rk10
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 7
+ trn1 v9.2d, v14.2d, v15.2d // h4h | h3h
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 7
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 7
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 7
+ trn2 v16.2d, v12.2d, v13.2d // h2l | h1l
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 8
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 8
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 8
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 8
+ b.lt Lenc_finish_first_blocks // branch if AES-128
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 9
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 9
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 9
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 9
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 10
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 10
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 10
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 10
+ b.eq Lenc_finish_first_blocks // branch if AES-192
+
+ aese v1.16b, v29.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 11
+ aese v2.16b, v29.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 11
+ aese v0.16b, v29.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 11
+ aese v3.16b, v29.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 11
+ aese v1.16b, v30.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 12
+ aese v2.16b, v30.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 12
+ aese v0.16b, v30.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 12
+ aese v3.16b, v30.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 12
+
+Lenc_finish_first_blocks:
+ cmp x0, x5 // check if we have <= 4 blocks
+ eor v17.16b, v17.16b, v9.16b // h4k | h3k
+ aese v2.16b, v31.16b // AES block 2 - round N-1
+ trn1 v8.2d, v12.2d, v13.2d // h2h | h1h
+ aese v1.16b, v31.16b // AES block 1 - round N-1
+ aese v0.16b, v31.16b // AES block 0 - round N-1
+ aese v3.16b, v31.16b // AES block 3 - round N-1
+ eor v16.16b, v16.16b, v8.16b // h2k | h1k
+ b.ge Lenc_tail // handle tail
+
+ ldp x19, x20, [x0, #16] // AES block 1 - load plaintext
+ rev w9, w12 // CTR block 4
+ ldp x6, x7, [x0, #0] // AES block 0 - load plaintext
+ ldp x23, x24, [x0, #48] // AES block 3 - load plaintext
+ ldp x21, x22, [x0, #32] // AES block 2 - load plaintext
+ add x0, x0, #64 // AES input_ptr update
+ eor x19, x19, x13 // AES block 1 - round N low
+ eor x20, x20, x14 // AES block 1 - round N high
+ fmov d5, x19 // AES block 1 - mov low
+ eor x6, x6, x13 // AES block 0 - round N low
+ eor x7, x7, x14 // AES block 0 - round N high
+ eor x24, x24, x14 // AES block 3 - round N high
+ fmov d4, x6 // AES block 0 - mov low
+ cmp x0, x5 // check if we have <= 8 blocks
+ fmov v4.d[1], x7 // AES block 0 - mov high
+ eor x23, x23, x13 // AES block 3 - round N low
+ eor x21, x21, x13 // AES block 2 - round N low
+ fmov v5.d[1], x20 // AES block 1 - mov high
+ fmov d6, x21 // AES block 2 - mov low
+ add w12, w12, #1 // CTR block 4
+ orr x9, x11, x9, lsl #32 // CTR block 4
+ fmov d7, x23 // AES block 3 - mov low
+ eor x22, x22, x14 // AES block 2 - round N high
+ fmov v6.d[1], x22 // AES block 2 - mov high
+ eor v4.16b, v4.16b, v0.16b // AES block 0 - result
+ fmov d0, x10 // CTR block 4
+ fmov v0.d[1], x9 // CTR block 4
+ rev w9, w12 // CTR block 5
+ add w12, w12, #1 // CTR block 5
+ eor v5.16b, v5.16b, v1.16b // AES block 1 - result
+ fmov d1, x10 // CTR block 5
+ orr x9, x11, x9, lsl #32 // CTR block 5
+ fmov v1.d[1], x9 // CTR block 5
+ rev w9, w12 // CTR block 6
+ st1 { v4.16b}, [x2], #16 // AES block 0 - store result
+ fmov v7.d[1], x24 // AES block 3 - mov high
+ orr x9, x11, x9, lsl #32 // CTR block 6
+ eor v6.16b, v6.16b, v2.16b // AES block 2 - result
+ st1 { v5.16b}, [x2], #16 // AES block 1 - store result
+ add w12, w12, #1 // CTR block 6
+ fmov d2, x10 // CTR block 6
+ fmov v2.d[1], x9 // CTR block 6
+ st1 { v6.16b}, [x2], #16 // AES block 2 - store result
+ rev w9, w12 // CTR block 7
+ orr x9, x11, x9, lsl #32 // CTR block 7
+ eor v7.16b, v7.16b, v3.16b // AES block 3 - result
+ st1 { v7.16b}, [x2], #16 // AES block 3 - store result
+ b.ge Lenc_prepretail // do prepretail
+
+Lenc_main_loop: // main loop start
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 0
+ rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free)
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 0
+ fmov d3, x10 // CTR block 4k+3
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 0
+ ext v11.16b, v11.16b, v11.16b, #8 // PRE 0
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 1
+ fmov v3.d[1], x9 // CTR block 4k+3
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 1
+ ldp x23, x24, [x0, #48] // AES block 4k+7 - load plaintext
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 1
+ ldp x21, x22, [x0, #32] // AES block 4k+6 - load plaintext
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 2
+ eor v4.16b, v4.16b, v11.16b // PRE 1
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 2
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 0
+ eor x23, x23, x13 // AES block 4k+7 - round N low
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 3
+ mov d10, v17.d[1] // GHASH block 4k - mid
+ pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high
+ eor x22, x22, x14 // AES block 4k+6 - round N high
+ mov d8, v4.d[1] // GHASH block 4k - mid
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 1
+ rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free)
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 4
+ pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low
+ eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 2
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 5
+ rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free)
+ pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high
+ pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid
+ rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free)
+ pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high
+ mov d4, v5.d[1] // GHASH block 4k+1 - mid
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 3
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 2
+ eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 3
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 4
+ mov d8, v6.d[1] // GHASH block 4k+2 - mid
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 3
+ eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 4
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 6
+ eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 4
+ pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 7
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 5
+ ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 5
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 8
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 5
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 6
+ eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid
+ pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high
+ pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 7
+ pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 6
+ ldp x19, x20, [x0, #16] // AES block 4k+5 - load plaintext
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 8
+ mov d4, v7.d[1] // GHASH block 4k+3 - mid
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 6
+ eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low
+ pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid
+ pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high
+ eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 7
+ eor x19, x19, x13 // AES block 4k+5 - round N low
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 8
+ eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 7
+ eor x21, x21, x13 // AES block 4k+6 - round N low
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 8
+ movi v8.8b, #0xc2
+ pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid
+ eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high
+ cmp x17, #12 // setup flags for AES-128/192/256 check
+ fmov d5, x19 // AES block 4k+5 - mov low
+ ldp x6, x7, [x0, #0] // AES block 4k+4 - load plaintext
+ b.lt Lenc_main_loop_continue // branch if AES-128
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 9
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 9
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 9
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 9
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 10
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 10
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 10
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 10
+ b.eq Lenc_main_loop_continue // branch if AES-192
+
+ aese v0.16b, v29.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 11
+ aese v1.16b, v29.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 11
+ aese v2.16b, v29.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 11
+ aese v3.16b, v29.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 11
+ aese v1.16b, v30.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 12
+ aese v0.16b, v30.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 12
+ aese v2.16b, v30.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 12
+ aese v3.16b, v30.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 12
+
+Lenc_main_loop_continue:
+ shl d8, d8, #56 // mod_constant
+ eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low
+ eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid
+ add w12, w12, #1 // CTR block 4k+3
+ eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
+ add x0, x0, #64 // AES input_ptr update
+ pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
+ rev w9, w12 // CTR block 4k+8
+ ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
+ eor x6, x6, x13 // AES block 4k+4 - round N low
+ eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up
+ eor x7, x7, x14 // AES block 4k+4 - round N high
+ fmov d4, x6 // AES block 4k+4 - mov low
+ orr x9, x11, x9, lsl #32 // CTR block 4k+8
+ eor v7.16b, v9.16b, v7.16b // MODULO - fold into mid
+ eor x20, x20, x14 // AES block 4k+5 - round N high
+ eor x24, x24, x14 // AES block 4k+7 - round N high
+ add w12, w12, #1 // CTR block 4k+8
+ aese v0.16b, v31.16b // AES block 4k+4 - round N-1
+ fmov v4.d[1], x7 // AES block 4k+4 - mov high
+ eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
+ fmov d7, x23 // AES block 4k+7 - mov low
+ aese v1.16b, v31.16b // AES block 4k+5 - round N-1
+ fmov v5.d[1], x20 // AES block 4k+5 - mov high
+ fmov d6, x21 // AES block 4k+6 - mov low
+ cmp x0, x5 // LOOP CONTROL
+ fmov v6.d[1], x22 // AES block 4k+6 - mov high
+ pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
+ eor v4.16b, v4.16b, v0.16b // AES block 4k+4 - result
+ fmov d0, x10 // CTR block 4k+8
+ fmov v0.d[1], x9 // CTR block 4k+8
+ rev w9, w12 // CTR block 4k+9
+ add w12, w12, #1 // CTR block 4k+9
+ eor v5.16b, v5.16b, v1.16b // AES block 4k+5 - result
+ fmov d1, x10 // CTR block 4k+9
+ orr x9, x11, x9, lsl #32 // CTR block 4k+9
+ fmov v1.d[1], x9 // CTR block 4k+9
+ aese v2.16b, v31.16b // AES block 4k+6 - round N-1
+ rev w9, w12 // CTR block 4k+10
+ st1 { v4.16b}, [x2], #16 // AES block 4k+4 - store result
+ orr x9, x11, x9, lsl #32 // CTR block 4k+10
+ eor v11.16b, v11.16b, v9.16b // MODULO - fold into low
+ fmov v7.d[1], x24 // AES block 4k+7 - mov high
+ ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
+ st1 { v5.16b}, [x2], #16 // AES block 4k+5 - store result
+ add w12, w12, #1 // CTR block 4k+10
+ aese v3.16b, v31.16b // AES block 4k+7 - round N-1
+ eor v6.16b, v6.16b, v2.16b // AES block 4k+6 - result
+ fmov d2, x10 // CTR block 4k+10
+ st1 { v6.16b}, [x2], #16 // AES block 4k+6 - store result
+ fmov v2.d[1], x9 // CTR block 4k+10
+ rev w9, w12 // CTR block 4k+11
+ eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
+ orr x9, x11, x9, lsl #32 // CTR block 4k+11
+ eor v7.16b, v7.16b, v3.16b // AES block 4k+7 - result
+ st1 { v7.16b}, [x2], #16 // AES block 4k+7 - store result
+ b.lt Lenc_main_loop
+
+Lenc_prepretail: // PREPRETAIL
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 0
+ rev64 v6.16b, v6.16b // GHASH block 4k+2 (t0, t1, and t2 free)
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 0
+ fmov d3, x10 // CTR block 4k+3
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 0
+ rev64 v4.16b, v4.16b // GHASH block 4k (only t0 is free)
+ fmov v3.d[1], x9 // CTR block 4k+3
+ ext v11.16b, v11.16b, v11.16b, #8 // PRE 0
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 1
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 1
+ eor v4.16b, v4.16b, v11.16b // PRE 1
+ rev64 v5.16b, v5.16b // GHASH block 4k+1 (t0 and t1 free)
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 2
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 0
+ mov d10, v17.d[1] // GHASH block 4k - mid
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 1
+ pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low
+ mov d8, v4.d[1] // GHASH block 4k - mid
+ pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 3
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 2
+ eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 2
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 1
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 3
+ pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid
+ pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high
+ pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 2
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high
+ mov d4, v5.d[1] // GHASH block 4k+1 - mid
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 3
+ eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 3
+ eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid
+ mov d8, v6.d[1] // GHASH block 4k+2 - mid
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 4
+ rev64 v7.16b, v7.16b // GHASH block 4k+3 (t0, t1, t2 and t3 free)
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 4
+ pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid
+ eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid
+ add w12, w12, #1 // CTR block 4k+3
+ pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 5
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 4
+ eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid
+ pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high
+ eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low
+ ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 5
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high
+ mov d4, v7.d[1] // GHASH block 4k+3 - mid
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 4
+ pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid
+ eor v4.8b, v4.8b, v7.8b // GHASH block 4k+3 - mid
+ pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 5
+ pmull v4.1q, v4.1d, v16.1d // GHASH block 4k+3 - mid
+ eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 5
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 6
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 6
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 6
+ movi v8.8b, #0xc2
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 6
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 7
+ eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 7
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 7
+ shl d8, d8, #56 // mod_constant
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 8
+ eor v10.16b, v10.16b, v4.16b // GHASH block 4k+3 - mid
+ pmull v6.1q, v7.1d, v12.1d // GHASH block 4k+3 - low
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 8
+ cmp x17, #12 // setup flags for AES-128/192/256 check
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 8
+ eor v11.16b, v11.16b, v6.16b // GHASH block 4k+3 - low
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 7
+ eor v10.16b, v10.16b, v9.16b // karatsuba tidy up
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 8
+ pmull v4.1q, v9.1d, v8.1d
+ ext v9.16b, v9.16b, v9.16b, #8
+ eor v10.16b, v10.16b, v11.16b
+ b.lt Lenc_finish_prepretail // branch if AES-128
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 9
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 9
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 9
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 9
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 10
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 10
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 10
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 10
+ b.eq Lenc_finish_prepretail // branch if AES-192
+
+ aese v1.16b, v29.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 11
+ aese v0.16b, v29.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 11
+ aese v3.16b, v29.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 11
+ aese v2.16b, v29.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 11
+ aese v1.16b, v30.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 12
+ aese v0.16b, v30.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 12
+ aese v3.16b, v30.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 12
+ aese v2.16b, v30.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 12
+
+Lenc_finish_prepretail:
+ eor v10.16b, v10.16b, v4.16b
+ eor v10.16b, v10.16b, v9.16b
+ pmull v4.1q, v10.1d, v8.1d
+ ext v10.16b, v10.16b, v10.16b, #8
+ aese v1.16b, v31.16b // AES block 4k+5 - round N-1
+ eor v11.16b, v11.16b, v4.16b
+ aese v3.16b, v31.16b // AES block 4k+7 - round N-1
+ aese v0.16b, v31.16b // AES block 4k+4 - round N-1
+ aese v2.16b, v31.16b // AES block 4k+6 - round N-1
+ eor v11.16b, v11.16b, v10.16b
+
+Lenc_tail: // TAIL
+ ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag
+ sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process
+ ldp x6, x7, [x0], #16 // AES block 4k+4 - load plaintext
+ eor x6, x6, x13 // AES block 4k+4 - round N low
+ eor x7, x7, x14 // AES block 4k+4 - round N high
+ cmp x5, #48
+ fmov d4, x6 // AES block 4k+4 - mov low
+ fmov v4.d[1], x7 // AES block 4k+4 - mov high
+ eor v5.16b, v4.16b, v0.16b // AES block 4k+4 - result
+ b.gt Lenc_blocks_more_than_3
+ cmp x5, #32
+ mov v3.16b, v2.16b
+ movi v11.8b, #0
+ movi v9.8b, #0
+ sub w12, w12, #1
+ mov v2.16b, v1.16b
+ movi v10.8b, #0
+ b.gt Lenc_blocks_more_than_2
+ mov v3.16b, v1.16b
+ sub w12, w12, #1
+ cmp x5, #16
+ b.gt Lenc_blocks_more_than_1
+ sub w12, w12, #1
+ b Lenc_blocks_less_than_1
+Lenc_blocks_more_than_3: // blocks left > 3
+ st1 { v5.16b}, [x2], #16 // AES final-3 block - store result
+ ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high
+ rev64 v4.16b, v5.16b // GHASH final-3 block
+ eor x6, x6, x13 // AES final-2 block - round N low
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ eor x7, x7, x14 // AES final-2 block - round N high
+ mov d22, v4.d[1] // GHASH final-3 block - mid
+ fmov d5, x6 // AES final-2 block - mov low
+ fmov v5.d[1], x7 // AES final-2 block - mov high
+ eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid
+ movi v8.8b, #0 // suppress further partial tag feed in
+ mov d10, v17.d[1] // GHASH final-3 block - mid
+ pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low
+ pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high
+ pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid
+ eor v5.16b, v5.16b, v1.16b // AES final-2 block - result
+Lenc_blocks_more_than_2: // blocks left > 2
+ st1 { v5.16b}, [x2], #16 // AES final-2 block - store result
+ ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high
+ rev64 v4.16b, v5.16b // GHASH final-2 block
+ eor x6, x6, x13 // AES final-1 block - round N low
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ fmov d5, x6 // AES final-1 block - mov low
+ eor x7, x7, x14 // AES final-1 block - round N high
+ fmov v5.d[1], x7 // AES final-1 block - mov high
+ movi v8.8b, #0 // suppress further partial tag feed in
+ pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high
+ mov d22, v4.d[1] // GHASH final-2 block - mid
+ pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low
+ eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid
+ eor v5.16b, v5.16b, v2.16b // AES final-1 block - result
+ eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high
+ pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid
+ eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low
+ eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid
+Lenc_blocks_more_than_1: // blocks left > 1
+ st1 { v5.16b}, [x2], #16 // AES final-1 block - store result
+ rev64 v4.16b, v5.16b // GHASH final-1 block
+ ldp x6, x7, [x0], #16 // AES final block - load input low & high
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ movi v8.8b, #0 // suppress further partial tag feed in
+ eor x6, x6, x13 // AES final block - round N low
+ mov d22, v4.d[1] // GHASH final-1 block - mid
+ pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high
+ eor x7, x7, x14 // AES final block - round N high
+ eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid
+ eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high
+ ins v22.d[1], v22.d[0] // GHASH final-1 block - mid
+ fmov d5, x6 // AES final block - mov low
+ fmov v5.d[1], x7 // AES final block - mov high
+ pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid
+ pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low
+ eor v5.16b, v5.16b, v3.16b // AES final block - result
+ eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid
+ eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low
+Lenc_blocks_less_than_1: // blocks left <= 1
+ and x1, x1, #127 // bit_length %= 128
+ mvn x13, xzr // rkN_l = 0xffffffffffffffff
+ sub x1, x1, #128 // bit_length -= 128
+ neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128])
+ ld1 { v18.16b}, [x2] // load existing bytes where the possibly partial last block is to be stored
+ mvn x14, xzr // rkN_h = 0xffffffffffffffff
+ and x1, x1, #127 // bit_length %= 128
+ lsr x14, x14, x1 // rkN_h is mask for top 64b of last block
+ cmp x1, #64
+ csel x6, x13, x14, lt
+ csel x7, x14, xzr, lt
+ fmov d0, x6 // ctr0b is mask for last block
+ fmov v0.d[1], x7
+ and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits
+ rev64 v4.16b, v5.16b // GHASH final block
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ bif v5.16b, v18.16b, v0.16b // insert existing bytes in top end of result before storing
+ pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high
+ mov d8, v4.d[1] // GHASH final block - mid
+ rev w9, w12
+ pmull v21.1q, v4.1d, v12.1d // GHASH final block - low
+ eor v9.16b, v9.16b, v20.16b // GHASH final block - high
+ eor v8.8b, v8.8b, v4.8b // GHASH final block - mid
+ pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid
+ eor v11.16b, v11.16b, v21.16b // GHASH final block - low
+ eor v10.16b, v10.16b, v8.16b // GHASH final block - mid
+ movi v8.8b, #0xc2
+ eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
+ shl d8, d8, #56 // mod_constant
+ eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up
+ pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
+ ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
+ eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
+ eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid
+ pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
+ ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
+ str w9, [x16, #12] // store the updated counter
+ st1 { v5.16b}, [x2] // store all 16B
+ eor v11.16b, v11.16b, v9.16b // MODULO - fold into low
+ eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
+ ext v11.16b, v11.16b, v11.16b, #8
+ rev64 v11.16b, v11.16b
+ mov x0, x15
+ st1 { v11.16b }, [x3]
+ ldp x19, x20, [sp, #16]
+ ldp x21, x22, [sp, #32]
+ ldp x23, x24, [sp, #48]
+ ldp d8, d9, [sp, #64]
+ ldp d10, d11, [sp, #80]
+ ldp d12, d13, [sp, #96]
+ ldp d14, d15, [sp, #112]
+ ldp x29, x30, [sp], #128
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.globl aes_gcm_dec_kernel
+
+.def aes_gcm_dec_kernel
+ .type 32
+.endef
+.align 4
+aes_gcm_dec_kernel:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-128]!
+ mov x29, sp
+ stp x19, x20, [sp, #16]
+ mov x16, x4
+ mov x8, x5
+ stp x21, x22, [sp, #32]
+ stp x23, x24, [sp, #48]
+ stp d8, d9, [sp, #64]
+ stp d10, d11, [sp, #80]
+ stp d12, d13, [sp, #96]
+ stp d14, d15, [sp, #112]
+ ldr w17, [x8, #240]
+ add x19, x8, x17, lsl #4 // borrow input_l1 for last key
+ ldp x13, x14, [x19] // load round N keys
+ ldr q31, [x19, #-16] // load round N-1 keys
+ lsr x5, x1, #3 // byte_len
+ mov x15, x5
+ ldp x10, x11, [x16] // ctr96_b64, ctr96_t32
+ ldr q26, [x8, #128] // load rk8
+ sub x5, x5, #1 // byte_len - 1
+ ldr q25, [x8, #112] // load rk7
+ and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+ add x4, x0, x1, lsr #3 // end_input_ptr
+ ldr q24, [x8, #96] // load rk6
+ lsr x12, x11, #32
+ ldr q23, [x8, #80] // load rk5
+ orr w11, w11, w11
+ ldr q21, [x8, #48] // load rk3
+ add x5, x5, x0
+ rev w12, w12 // rev_ctr32
+ add w12, w12, #1 // increment rev_ctr32
+ fmov d3, x10 // CTR block 3
+ rev w9, w12 // CTR block 1
+ add w12, w12, #1 // CTR block 1
+ fmov d1, x10 // CTR block 1
+ orr x9, x11, x9, lsl #32 // CTR block 1
+ ld1 { v0.16b}, [x16] // special case vector load initial counter so we can start first AES block as quickly as possible
+ fmov v1.d[1], x9 // CTR block 1
+ rev w9, w12 // CTR block 2
+ add w12, w12, #1 // CTR block 2
+ fmov d2, x10 // CTR block 2
+ orr x9, x11, x9, lsl #32 // CTR block 2
+ fmov v2.d[1], x9 // CTR block 2
+ rev w9, w12 // CTR block 3
+ orr x9, x11, x9, lsl #32 // CTR block 3
+ ldr q18, [x8, #0] // load rk0
+ fmov v3.d[1], x9 // CTR block 3
+ add w12, w12, #1 // CTR block 3
+ ldr q22, [x8, #64] // load rk4
+ ldr q19, [x8, #16] // load rk1
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 0
+ ldr q14, [x6, #48] // load h3l | h3h
+ ext v14.16b, v14.16b, v14.16b, #8
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 0
+ ldr q15, [x6, #80] // load h4l | h4h
+ ext v15.16b, v15.16b, v15.16b, #8
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 0
+ ldr q13, [x6, #32] // load h2l | h2h
+ ext v13.16b, v13.16b, v13.16b, #8
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 0
+ ldr q20, [x8, #32] // load rk2
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 1
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 1
+ ld1 { v11.16b}, [x3]
+ ext v11.16b, v11.16b, v11.16b, #8
+ rev64 v11.16b, v11.16b
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 1
+ ldr q27, [x8, #144] // load rk9
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 1
+ ldr q30, [x8, #192] // load rk12
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 2
+ ldr q12, [x6] // load h1l | h1h
+ ext v12.16b, v12.16b, v12.16b, #8
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 2
+ ldr q28, [x8, #160] // load rk10
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 2
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 3
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 2
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 3
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 4
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 3
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 3
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 4
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 4
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 4
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 5
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 5
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 5
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 5
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 6
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 6
+ cmp x17, #12 // setup flags for AES-128/192/256 check
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 6
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 6
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 7
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 7
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 7
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 8
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 7
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 8
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 8
+ ldr q29, [x8, #176] // load rk11
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 8
+ b.lt Ldec_finish_first_blocks // branch if AES-128
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 9
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 9
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 9
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 9
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 10
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 10
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 10
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 10
+ b.eq Ldec_finish_first_blocks // branch if AES-192
+
+ aese v0.16b, v29.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 11
+ aese v3.16b, v29.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 11
+ aese v1.16b, v29.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 11
+ aese v2.16b, v29.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 11
+ aese v1.16b, v30.16b
+ aesmc v1.16b, v1.16b // AES block 1 - round 12
+ aese v0.16b, v30.16b
+ aesmc v0.16b, v0.16b // AES block 0 - round 12
+ aese v2.16b, v30.16b
+ aesmc v2.16b, v2.16b // AES block 2 - round 12
+ aese v3.16b, v30.16b
+ aesmc v3.16b, v3.16b // AES block 3 - round 12
+
+Ldec_finish_first_blocks:
+ cmp x0, x5 // check if we have <= 4 blocks
+ trn1 v9.2d, v14.2d, v15.2d // h4h | h3h
+ trn2 v17.2d, v14.2d, v15.2d // h4l | h3l
+ trn1 v8.2d, v12.2d, v13.2d // h2h | h1h
+ trn2 v16.2d, v12.2d, v13.2d // h2l | h1l
+ eor v17.16b, v17.16b, v9.16b // h4k | h3k
+ aese v1.16b, v31.16b // AES block 1 - round N-1
+ aese v2.16b, v31.16b // AES block 2 - round N-1
+ eor v16.16b, v16.16b, v8.16b // h2k | h1k
+ aese v3.16b, v31.16b // AES block 3 - round N-1
+ aese v0.16b, v31.16b // AES block 0 - round N-1
+ b.ge Ldec_tail // handle tail
+
+ ldr q4, [x0, #0] // AES block 0 - load ciphertext
+ ldr q5, [x0, #16] // AES block 1 - load ciphertext
+ rev w9, w12 // CTR block 4
+ eor v0.16b, v4.16b, v0.16b // AES block 0 - result
+ eor v1.16b, v5.16b, v1.16b // AES block 1 - result
+ rev64 v5.16b, v5.16b // GHASH block 1
+ ldr q7, [x0, #48] // AES block 3 - load ciphertext
+ mov x7, v0.d[1] // AES block 0 - mov high
+ mov x6, v0.d[0] // AES block 0 - mov low
+ rev64 v4.16b, v4.16b // GHASH block 0
+ add w12, w12, #1 // CTR block 4
+ fmov d0, x10 // CTR block 4
+ orr x9, x11, x9, lsl #32 // CTR block 4
+ fmov v0.d[1], x9 // CTR block 4
+ rev w9, w12 // CTR block 5
+ add w12, w12, #1 // CTR block 5
+ mov x19, v1.d[0] // AES block 1 - mov low
+ orr x9, x11, x9, lsl #32 // CTR block 5
+ mov x20, v1.d[1] // AES block 1 - mov high
+ eor x7, x7, x14 // AES block 0 - round N high
+ eor x6, x6, x13 // AES block 0 - round N low
+ stp x6, x7, [x2], #16 // AES block 0 - store result
+ fmov d1, x10 // CTR block 5
+ ldr q6, [x0, #32] // AES block 2 - load ciphertext
+ add x0, x0, #64 // AES input_ptr update
+ fmov v1.d[1], x9 // CTR block 5
+ rev w9, w12 // CTR block 6
+ add w12, w12, #1 // CTR block 6
+ eor x19, x19, x13 // AES block 1 - round N low
+ orr x9, x11, x9, lsl #32 // CTR block 6
+ eor x20, x20, x14 // AES block 1 - round N high
+ stp x19, x20, [x2], #16 // AES block 1 - store result
+ eor v2.16b, v6.16b, v2.16b // AES block 2 - result
+ cmp x0, x5 // check if we have <= 8 blocks
+ b.ge Ldec_prepretail // do prepretail
+
+Ldec_main_loop: // main loop start
+ mov x21, v2.d[0] // AES block 4k+2 - mov low
+ ext v11.16b, v11.16b, v11.16b, #8 // PRE 0
+ eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 0
+ mov x22, v2.d[1] // AES block 4k+2 - mov high
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 0
+ fmov d2, x10 // CTR block 4k+6
+ fmov v2.d[1], x9 // CTR block 4k+6
+ eor v4.16b, v4.16b, v11.16b // PRE 1
+ rev w9, w12 // CTR block 4k+7
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 1
+ mov x24, v3.d[1] // AES block 4k+3 - mov high
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 1
+ mov x23, v3.d[0] // AES block 4k+3 - mov low
+ pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high
+ mov d8, v4.d[1] // GHASH block 4k - mid
+ fmov d3, x10 // CTR block 4k+7
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 2
+ orr x9, x11, x9, lsl #32 // CTR block 4k+7
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 0
+ fmov v3.d[1], x9 // CTR block 4k+7
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 2
+ eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 3
+ eor x22, x22, x14 // AES block 4k+2 - round N high
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 1
+ mov d10, v17.d[1] // GHASH block 4k - mid
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 3
+ rev64 v6.16b, v6.16b // GHASH block 4k+2
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 0
+ eor x21, x21, x13 // AES block 4k+2 - round N low
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 2
+ stp x21, x22, [x2], #16 // AES block 4k+2 - store result
+ pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low
+ pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 3
+ rev64 v7.16b, v7.16b // GHASH block 4k+3
+ pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid
+ eor x23, x23, x13 // AES block 4k+3 - round N low
+ pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low
+ eor x24, x24, x14 // AES block 4k+3 - round N high
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 4
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 1
+ mov d4, v5.d[1] // GHASH block 4k+1 - mid
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 4
+ eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 5
+ add w12, w12, #1 // CTR block 4k+7
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 2
+ mov d8, v6.d[1] // GHASH block 4k+2 - mid
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 4
+ eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid
+ pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 3
+ eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 5
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 5
+ eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low
+ pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid
+ rev w9, w12 // CTR block 4k+8
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 6
+ ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 6
+ add w12, w12, #1 // CTR block 4k+8
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 4
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 7
+ eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 7
+ pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high
+ mov d6, v7.d[1] // GHASH block 4k+3 - mid
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 5
+ pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 8
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 6
+ pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low
+ orr x9, x11, x9, lsl #32 // CTR block 4k+8
+ eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid
+ pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high
+ cmp x17, #12 // setup flags for AES-128/192/256 check
+ eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 8
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 6
+ eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high
+ pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid
+ movi v8.8b, #0xc2
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 7
+ eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 7
+ shl d8, d8, #56 // mod_constant
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 8
+ eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 8
+ b.lt Ldec_main_loop_continue // branch if AES-128
+
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 9
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 9
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 9
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 9
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 10
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 10
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 10
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 10
+ b.eq Ldec_main_loop_continue // branch if AES-192
+
+ aese v0.16b, v29.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 11
+ aese v1.16b, v29.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 11
+ aese v2.16b, v29.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 11
+ aese v3.16b, v29.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 11
+ aese v0.16b, v30.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 12
+ aese v1.16b, v30.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 12
+ aese v2.16b, v30.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 12
+ aese v3.16b, v30.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 12
+
+Ldec_main_loop_continue:
+ pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
+ eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
+ ldr q4, [x0, #0] // AES block 4k+4 - load ciphertext
+ aese v0.16b, v31.16b // AES block 4k+4 - round N-1
+ ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
+ eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up
+ ldr q5, [x0, #16] // AES block 4k+5 - load ciphertext
+ eor v0.16b, v4.16b, v0.16b // AES block 4k+4 - result
+ stp x23, x24, [x2], #16 // AES block 4k+3 - store result
+ eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
+ ldr q7, [x0, #48] // AES block 4k+7 - load ciphertext
+ ldr q6, [x0, #32] // AES block 4k+6 - load ciphertext
+ mov x7, v0.d[1] // AES block 4k+4 - mov high
+ eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid
+ aese v1.16b, v31.16b // AES block 4k+5 - round N-1
+ add x0, x0, #64 // AES input_ptr update
+ mov x6, v0.d[0] // AES block 4k+4 - mov low
+ fmov d0, x10 // CTR block 4k+8
+ fmov v0.d[1], x9 // CTR block 4k+8
+ pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
+ eor v1.16b, v5.16b, v1.16b // AES block 4k+5 - result
+ rev w9, w12 // CTR block 4k+9
+ aese v2.16b, v31.16b // AES block 4k+6 - round N-1
+ orr x9, x11, x9, lsl #32 // CTR block 4k+9
+ cmp x0, x5 // LOOP CONTROL
+ add w12, w12, #1 // CTR block 4k+9
+ eor x6, x6, x13 // AES block 4k+4 - round N low
+ eor x7, x7, x14 // AES block 4k+4 - round N high
+ mov x20, v1.d[1] // AES block 4k+5 - mov high
+ eor v2.16b, v6.16b, v2.16b // AES block 4k+6 - result
+ eor v11.16b, v11.16b, v8.16b // MODULO - fold into low
+ mov x19, v1.d[0] // AES block 4k+5 - mov low
+ fmov d1, x10 // CTR block 4k+9
+ ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
+ fmov v1.d[1], x9 // CTR block 4k+9
+ rev w9, w12 // CTR block 4k+10
+ add w12, w12, #1 // CTR block 4k+10
+ aese v3.16b, v31.16b // AES block 4k+7 - round N-1
+ orr x9, x11, x9, lsl #32 // CTR block 4k+10
+ rev64 v5.16b, v5.16b // GHASH block 4k+5
+ eor x20, x20, x14 // AES block 4k+5 - round N high
+ stp x6, x7, [x2], #16 // AES block 4k+4 - store result
+ eor x19, x19, x13 // AES block 4k+5 - round N low
+ stp x19, x20, [x2], #16 // AES block 4k+5 - store result
+ rev64 v4.16b, v4.16b // GHASH block 4k+4
+ eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
+ b.lt Ldec_main_loop
+
+Ldec_prepretail: // PREPRETAIL
+ ext v11.16b, v11.16b, v11.16b, #8 // PRE 0
+ mov x21, v2.d[0] // AES block 4k+2 - mov low
+ eor v3.16b, v7.16b, v3.16b // AES block 4k+3 - result
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 0
+ mov x22, v2.d[1] // AES block 4k+2 - mov high
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 0
+ fmov d2, x10 // CTR block 4k+6
+ fmov v2.d[1], x9 // CTR block 4k+6
+ rev w9, w12 // CTR block 4k+7
+ eor v4.16b, v4.16b, v11.16b // PRE 1
+ rev64 v6.16b, v6.16b // GHASH block 4k+2
+ orr x9, x11, x9, lsl #32 // CTR block 4k+7
+ mov x23, v3.d[0] // AES block 4k+3 - mov low
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 1
+ mov x24, v3.d[1] // AES block 4k+3 - mov high
+ pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low
+ mov d8, v4.d[1] // GHASH block 4k - mid
+ fmov d3, x10 // CTR block 4k+7
+ pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high
+ fmov v3.d[1], x9 // CTR block 4k+7
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 0
+ mov d10, v17.d[1] // GHASH block 4k - mid
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 1
+ eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid
+ pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 1
+ rev64 v7.16b, v7.16b // GHASH block 4k+3
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 0
+ pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high
+ pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 1
+ mov d4, v5.d[1] // GHASH block 4k+1 - mid
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 2
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 2
+ eor v11.16b, v11.16b, v8.16b // GHASH block 4k+1 - low
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 2
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 3
+ mov d8, v6.d[1] // GHASH block 4k+2 - mid
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 2
+ eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid
+ pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 4
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 3
+ eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid
+ pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 5
+ eor v11.16b, v11.16b, v5.16b // GHASH block 4k+2 - low
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 4
+ pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high
+ eor v10.16b, v10.16b, v4.16b // GHASH block 4k+1 - mid
+ pmull2 v4.1q, v6.2d, v13.2d // GHASH block 4k+2 - high
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 5
+ ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 3
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 3
+ eor v9.16b, v9.16b, v4.16b // GHASH block 4k+2 - high
+ pmull v4.1q, v7.1d, v12.1d // GHASH block 4k+3 - low
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 4
+ mov d6, v7.d[1] // GHASH block 4k+3 - mid
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 4
+ pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 5
+ eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 5
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 6
+ eor v10.16b, v10.16b, v8.16b // GHASH block 4k+2 - mid
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 6
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 6
+ movi v8.8b, #0xc2
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 6
+ eor v11.16b, v11.16b, v4.16b // GHASH block 4k+3 - low
+ pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 7
+ cmp x17, #12 // setup flags for AES-128/192/256 check
+ eor v9.16b, v9.16b, v5.16b // GHASH block 4k+3 - high
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 7
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 7
+ eor v10.16b, v10.16b, v6.16b // GHASH block 4k+3 - mid
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 8
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 7
+ eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 8
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 8
+ shl d8, d8, #56 // mod_constant
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 8
+ b.lt Ldec_finish_prepretail // branch if AES-128
+
+ aese v1.16b, v27.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 9
+ aese v2.16b, v27.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 9
+ aese v3.16b, v27.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 9
+ aese v0.16b, v27.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 9
+ aese v2.16b, v28.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 10
+ aese v3.16b, v28.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 10
+ aese v0.16b, v28.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 10
+ aese v1.16b, v28.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 10
+ b.eq Ldec_finish_prepretail // branch if AES-192
+
+ aese v2.16b, v29.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 11
+ aese v0.16b, v29.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 11
+ aese v1.16b, v29.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 11
+ aese v2.16b, v30.16b
+ aesmc v2.16b, v2.16b // AES block 4k+6 - round 12
+ aese v3.16b, v29.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 11
+ aese v1.16b, v30.16b
+ aesmc v1.16b, v1.16b // AES block 4k+5 - round 12
+ aese v0.16b, v30.16b
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 12
+ aese v3.16b, v30.16b
+ aesmc v3.16b, v3.16b // AES block 4k+7 - round 12
+
+Ldec_finish_prepretail:
+ eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up
+ pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
+ ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
+ eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
+ eor x22, x22, x14 // AES block 4k+2 - round N high
+ eor x23, x23, x13 // AES block 4k+3 - round N low
+ eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid
+ add w12, w12, #1 // CTR block 4k+7
+ eor x21, x21, x13 // AES block 4k+2 - round N low
+ pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
+ eor x24, x24, x14 // AES block 4k+3 - round N high
+ stp x21, x22, [x2], #16 // AES block 4k+2 - store result
+ ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
+ stp x23, x24, [x2], #16 // AES block 4k+3 - store result
+
+ eor v11.16b, v11.16b, v8.16b // MODULO - fold into low
+ aese v1.16b, v31.16b // AES block 4k+5 - round N-1
+ aese v0.16b, v31.16b // AES block 4k+4 - round N-1
+ aese v3.16b, v31.16b // AES block 4k+7 - round N-1
+ aese v2.16b, v31.16b // AES block 4k+6 - round N-1
+ eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
+
+Ldec_tail: // TAIL
+ sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process
+ ld1 { v5.16b}, [x0], #16 // AES block 4k+4 - load ciphertext
+ eor v0.16b, v5.16b, v0.16b // AES block 4k+4 - result
+ mov x6, v0.d[0] // AES block 4k+4 - mov low
+ mov x7, v0.d[1] // AES block 4k+4 - mov high
+ ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag
+ cmp x5, #48
+ eor x6, x6, x13 // AES block 4k+4 - round N low
+ eor x7, x7, x14 // AES block 4k+4 - round N high
+ b.gt Ldec_blocks_more_than_3
+ sub w12, w12, #1
+ mov v3.16b, v2.16b
+ movi v10.8b, #0
+ movi v11.8b, #0
+ cmp x5, #32
+ movi v9.8b, #0
+ mov v2.16b, v1.16b
+ b.gt Ldec_blocks_more_than_2
+ sub w12, w12, #1
+ mov v3.16b, v1.16b
+ cmp x5, #16
+ b.gt Ldec_blocks_more_than_1
+ sub w12, w12, #1
+ b Ldec_blocks_less_than_1
+Ldec_blocks_more_than_3: // blocks left > 3
+ rev64 v4.16b, v5.16b // GHASH final-3 block
+ ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext
+ stp x6, x7, [x2], #16 // AES final-3 block - store result
+ mov d10, v17.d[1] // GHASH final-3 block - mid
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ eor v0.16b, v5.16b, v1.16b // AES final-2 block - result
+ mov d22, v4.d[1] // GHASH final-3 block - mid
+ mov x6, v0.d[0] // AES final-2 block - mov low
+ mov x7, v0.d[1] // AES final-2 block - mov high
+ eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid
+ movi v8.8b, #0 // suppress further partial tag feed in
+ pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high
+ pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid
+ eor x6, x6, x13 // AES final-2 block - round N low
+ pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low
+ eor x7, x7, x14 // AES final-2 block - round N high
+Ldec_blocks_more_than_2: // blocks left > 2
+ rev64 v4.16b, v5.16b // GHASH final-2 block
+ ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ stp x6, x7, [x2], #16 // AES final-2 block - store result
+ eor v0.16b, v5.16b, v2.16b // AES final-1 block - result
+ mov d22, v4.d[1] // GHASH final-2 block - mid
+ pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low
+ pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high
+ eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid
+ mov x6, v0.d[0] // AES final-1 block - mov low
+ mov x7, v0.d[1] // AES final-1 block - mov high
+ eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low
+ movi v8.8b, #0 // suppress further partial tag feed in
+ pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid
+ eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high
+ eor x6, x6, x13 // AES final-1 block - round N low
+ eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid
+ eor x7, x7, x14 // AES final-1 block - round N high
+Ldec_blocks_more_than_1: // blocks left > 1
+ stp x6, x7, [x2], #16 // AES final-1 block - store result
+ rev64 v4.16b, v5.16b // GHASH final-1 block
+ ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ movi v8.8b, #0 // suppress further partial tag feed in
+ mov d22, v4.d[1] // GHASH final-1 block - mid
+ eor v0.16b, v5.16b, v3.16b // AES final block - result
+ pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high
+ eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid
+ pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low
+ mov x6, v0.d[0] // AES final block - mov low
+ ins v22.d[1], v22.d[0] // GHASH final-1 block - mid
+ mov x7, v0.d[1] // AES final block - mov high
+ pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid
+ eor x6, x6, x13 // AES final block - round N low
+ eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low
+ eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high
+ eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid
+ eor x7, x7, x14 // AES final block - round N high
+Ldec_blocks_less_than_1: // blocks left <= 1
+ and x1, x1, #127 // bit_length %= 128
+ mvn x14, xzr // rkN_h = 0xffffffffffffffff
+ sub x1, x1, #128 // bit_length -= 128
+ mvn x13, xzr // rkN_l = 0xffffffffffffffff
+ ldp x4, x5, [x2] // load existing bytes we need to not overwrite
+ neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128])
+ and x1, x1, #127 // bit_length %= 128
+ lsr x14, x14, x1 // rkN_h is mask for top 64b of last block
+ cmp x1, #64
+ csel x9, x13, x14, lt
+ csel x10, x14, xzr, lt
+ fmov d0, x9 // ctr0b is mask for last block
+ and x6, x6, x9
+ mov v0.d[1], x10
+ bic x4, x4, x9 // mask out low existing bytes
+ rev w9, w12
+ bic x5, x5, x10 // mask out high existing bytes
+ orr x6, x6, x4
+ and x7, x7, x10
+ orr x7, x7, x5
+ and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits
+ rev64 v4.16b, v5.16b // GHASH final block
+ eor v4.16b, v4.16b, v8.16b // feed in partial tag
+ pmull v21.1q, v4.1d, v12.1d // GHASH final block - low
+ mov d8, v4.d[1] // GHASH final block - mid
+ eor v8.8b, v8.8b, v4.8b // GHASH final block - mid
+ pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high
+ pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid
+ eor v9.16b, v9.16b, v20.16b // GHASH final block - high
+ eor v11.16b, v11.16b, v21.16b // GHASH final block - low
+ eor v10.16b, v10.16b, v8.16b // GHASH final block - mid
+ movi v8.8b, #0xc2
+ eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up
+ shl d8, d8, #56 // mod_constant
+ eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up
+ pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid
+ ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment
+ eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid
+ eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid
+ pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low
+ ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment
+ eor v11.16b, v11.16b, v8.16b // MODULO - fold into low
+ stp x6, x7, [x2]
+ str w9, [x16, #12] // store the updated counter
+ eor v11.16b, v11.16b, v10.16b // MODULO - fold into low
+ ext v11.16b, v11.16b, v11.16b, #8
+ rev64 v11.16b, v11.16b
+ mov x0, x15
+ st1 { v11.16b }, [x3]
+ ldp x19, x20, [sp, #16]
+ ldp x21, x22, [sp, #32]
+ ldp x23, x24, [sp, #48]
+ ldp d8, d9, [sp, #64]
+ ldp d10, d11, [sp, #80]
+ ldp d12, d13, [sp, #96]
+ ldp d14, d15, [sp, #112]
+ ldp x29, x30, [sp], #128
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+#endif
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/armv4-mont-linux.S b/gen/bcm/armv4-mont-linux.S
new file mode 100644
index 0000000..0b845b6
--- /dev/null
+++ b/gen/bcm/armv4-mont-linux.S
@@ -0,0 +1,939 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+.arch armv7-a
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+.globl bn_mul_mont_nohw
+.hidden bn_mul_mont_nohw
+.type bn_mul_mont_nohw,%function
+
+.align 5
+bn_mul_mont_nohw:
+ ldr ip,[sp,#4] @ load num
+ stmdb sp!,{r0,r2} @ sp points at argument block
+ cmp ip,#2
+ mov r0,ip @ load num
+#ifdef __thumb2__
+ ittt lt
+#endif
+ movlt r0,#0
+ addlt sp,sp,#2*4
+ blt .Labrt
+
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ save 10 registers
+
+ mov r0,r0,lsl#2 @ rescale r0 for byte count
+ sub sp,sp,r0 @ alloca(4*num)
+ sub sp,sp,#4 @ +extra dword
+ sub r0,r0,#4 @ "num=num-1"
+ add r4,r2,r0 @ &bp[num-1]
+
+ add r0,sp,r0 @ r0 to point at &tp[num-1]
+ ldr r8,[r0,#14*4] @ &n0
+ ldr r2,[r2] @ bp[0]
+ ldr r5,[r1],#4 @ ap[0],ap++
+ ldr r6,[r3],#4 @ np[0],np++
+ ldr r8,[r8] @ *n0
+ str r4,[r0,#15*4] @ save &bp[num]
+
+ umull r10,r11,r5,r2 @ ap[0]*bp[0]
+ str r8,[r0,#14*4] @ save n0 value
+ mul r8,r10,r8 @ "tp[0]"*n0
+ mov r12,#0
+ umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]"
+ mov r4,sp
+
+.L1st:
+ ldr r5,[r1],#4 @ ap[j],ap++
+ mov r10,r11
+ ldr r6,[r3],#4 @ np[j],np++
+ mov r11,#0
+ umlal r10,r11,r5,r2 @ ap[j]*bp[0]
+ mov r14,#0
+ umlal r12,r14,r6,r8 @ np[j]*n0
+ adds r12,r12,r10
+ str r12,[r4],#4 @ tp[j-1]=,tp++
+ adc r12,r14,#0
+ cmp r4,r0
+ bne .L1st
+
+ adds r12,r12,r11
+ ldr r4,[r0,#13*4] @ restore bp
+ mov r14,#0
+ ldr r8,[r0,#14*4] @ restore n0
+ adc r14,r14,#0
+ str r12,[r0] @ tp[num-1]=
+ mov r7,sp
+ str r14,[r0,#4] @ tp[num]=
+
+.Louter:
+ sub r7,r0,r7 @ "original" r0-1 value
+ sub r1,r1,r7 @ "rewind" ap to &ap[1]
+ ldr r2,[r4,#4]! @ *(++bp)
+ sub r3,r3,r7 @ "rewind" np to &np[1]
+ ldr r5,[r1,#-4] @ ap[0]
+ ldr r10,[sp] @ tp[0]
+ ldr r6,[r3,#-4] @ np[0]
+ ldr r7,[sp,#4] @ tp[1]
+
+ mov r11,#0
+ umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0]
+ str r4,[r0,#13*4] @ save bp
+ mul r8,r10,r8
+ mov r12,#0
+ umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]"
+ mov r4,sp
+
+.Linner:
+ ldr r5,[r1],#4 @ ap[j],ap++
+ adds r10,r11,r7 @ +=tp[j]
+ ldr r6,[r3],#4 @ np[j],np++
+ mov r11,#0
+ umlal r10,r11,r5,r2 @ ap[j]*bp[i]
+ mov r14,#0
+ umlal r12,r14,r6,r8 @ np[j]*n0
+ adc r11,r11,#0
+ ldr r7,[r4,#8] @ tp[j+1]
+ adds r12,r12,r10
+ str r12,[r4],#4 @ tp[j-1]=,tp++
+ adc r12,r14,#0
+ cmp r4,r0
+ bne .Linner
+
+ adds r12,r12,r11
+ mov r14,#0
+ ldr r4,[r0,#13*4] @ restore bp
+ adc r14,r14,#0
+ ldr r8,[r0,#14*4] @ restore n0
+ adds r12,r12,r7
+ ldr r7,[r0,#15*4] @ restore &bp[num]
+ adc r14,r14,#0
+ str r12,[r0] @ tp[num-1]=
+ str r14,[r0,#4] @ tp[num]=
+
+ cmp r4,r7
+#ifdef __thumb2__
+ itt ne
+#endif
+ movne r7,sp
+ bne .Louter
+
+ ldr r2,[r0,#12*4] @ pull rp
+ mov r5,sp
+ add r0,r0,#4 @ r0 to point at &tp[num]
+ sub r5,r0,r5 @ "original" num value
+ mov r4,sp @ "rewind" r4
+ mov r1,r4 @ "borrow" r1
+ sub r3,r3,r5 @ "rewind" r3 to &np[0]
+
+ subs r7,r7,r7 @ "clear" carry flag
+.Lsub: ldr r7,[r4],#4
+ ldr r6,[r3],#4
+ sbcs r7,r7,r6 @ tp[j]-np[j]
+ str r7,[r2],#4 @ rp[j]=
+ teq r4,r0 @ preserve carry
+ bne .Lsub
+ sbcs r14,r14,#0 @ upmost carry
+ mov r4,sp @ "rewind" r4
+ sub r2,r2,r5 @ "rewind" r2
+
+.Lcopy: ldr r7,[r4] @ conditional copy
+ ldr r5,[r2]
+ str sp,[r4],#4 @ zap tp
+#ifdef __thumb2__
+ it cc
+#endif
+ movcc r5,r7
+ str r5,[r2],#4
+ teq r4,r0 @ preserve carry
+ bne .Lcopy
+
+ mov sp,r0
+ add sp,sp,#4 @ skip over tp[num+1]
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ restore registers
+ add sp,sp,#2*4 @ skip over {r0,r2}
+ mov r0,#1
+.Labrt:
+#if __ARM_ARCH>=5
+ bx lr @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size bn_mul_mont_nohw,.-bn_mul_mont_nohw
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.globl bn_mul8x_mont_neon
+.hidden bn_mul8x_mont_neon
+.type bn_mul8x_mont_neon,%function
+.align 5
+bn_mul8x_mont_neon:
+ mov ip,sp
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
+ vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
+ ldmia ip,{r4,r5} @ load rest of parameter block
+ mov ip,sp
+
+ cmp r5,#8
+ bhi .LNEON_8n
+
+ @ special case for r5==8, everything is in register bank...
+
+ vld1.32 {d28[0]}, [r2,:32]!
+ veor d8,d8,d8
+ sub r7,sp,r5,lsl#4
+ vld1.32 {d0,d1,d2,d3}, [r1]! @ can't specify :32 :-(
+ and r7,r7,#-64
+ vld1.32 {d30[0]}, [r4,:32]
+ mov sp,r7 @ alloca
+ vzip.16 d28,d8
+
+ vmull.u32 q6,d28,d0[0]
+ vmull.u32 q7,d28,d0[1]
+ vmull.u32 q8,d28,d1[0]
+ vshl.i64 d29,d13,#16
+ vmull.u32 q9,d28,d1[1]
+
+ vadd.u64 d29,d29,d12
+ veor d8,d8,d8
+ vmul.u32 d29,d29,d30
+
+ vmull.u32 q10,d28,d2[0]
+ vld1.32 {d4,d5,d6,d7}, [r3]!
+ vmull.u32 q11,d28,d2[1]
+ vmull.u32 q12,d28,d3[0]
+ vzip.16 d29,d8
+ vmull.u32 q13,d28,d3[1]
+
+ vmlal.u32 q6,d29,d4[0]
+ sub r9,r5,#1
+ vmlal.u32 q7,d29,d4[1]
+ vmlal.u32 q8,d29,d5[0]
+ vmlal.u32 q9,d29,d5[1]
+
+ vmlal.u32 q10,d29,d6[0]
+ vmov q5,q6
+ vmlal.u32 q11,d29,d6[1]
+ vmov q6,q7
+ vmlal.u32 q12,d29,d7[0]
+ vmov q7,q8
+ vmlal.u32 q13,d29,d7[1]
+ vmov q8,q9
+ vmov q9,q10
+ vshr.u64 d10,d10,#16
+ vmov q10,q11
+ vmov q11,q12
+ vadd.u64 d10,d10,d11
+ vmov q12,q13
+ veor q13,q13
+ vshr.u64 d10,d10,#16
+
+ b .LNEON_outer8
+
+.align 4
+.LNEON_outer8:
+ vld1.32 {d28[0]}, [r2,:32]!
+ veor d8,d8,d8
+ vzip.16 d28,d8
+ vadd.u64 d12,d12,d10
+
+ vmlal.u32 q6,d28,d0[0]
+ vmlal.u32 q7,d28,d0[1]
+ vmlal.u32 q8,d28,d1[0]
+ vshl.i64 d29,d13,#16
+ vmlal.u32 q9,d28,d1[1]
+
+ vadd.u64 d29,d29,d12
+ veor d8,d8,d8
+ subs r9,r9,#1
+ vmul.u32 d29,d29,d30
+
+ vmlal.u32 q10,d28,d2[0]
+ vmlal.u32 q11,d28,d2[1]
+ vmlal.u32 q12,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q13,d28,d3[1]
+
+ vmlal.u32 q6,d29,d4[0]
+ vmlal.u32 q7,d29,d4[1]
+ vmlal.u32 q8,d29,d5[0]
+ vmlal.u32 q9,d29,d5[1]
+
+ vmlal.u32 q10,d29,d6[0]
+ vmov q5,q6
+ vmlal.u32 q11,d29,d6[1]
+ vmov q6,q7
+ vmlal.u32 q12,d29,d7[0]
+ vmov q7,q8
+ vmlal.u32 q13,d29,d7[1]
+ vmov q8,q9
+ vmov q9,q10
+ vshr.u64 d10,d10,#16
+ vmov q10,q11
+ vmov q11,q12
+ vadd.u64 d10,d10,d11
+ vmov q12,q13
+ veor q13,q13
+ vshr.u64 d10,d10,#16
+
+ bne .LNEON_outer8
+
+ vadd.u64 d12,d12,d10
+ mov r7,sp
+ vshr.u64 d10,d12,#16
+ mov r8,r5
+ vadd.u64 d13,d13,d10
+ add r6,sp,#96
+ vshr.u64 d10,d13,#16
+ vzip.16 d12,d13
+
+ b .LNEON_tail_entry
+
+.align 4
+.LNEON_8n:
+ veor q6,q6,q6
+ sub r7,sp,#128
+ veor q7,q7,q7
+ sub r7,r7,r5,lsl#4
+ veor q8,q8,q8
+ and r7,r7,#-64
+ veor q9,q9,q9
+ mov sp,r7 @ alloca
+ veor q10,q10,q10
+ add r7,r7,#256
+ veor q11,q11,q11
+ sub r8,r5,#8
+ veor q12,q12,q12
+ veor q13,q13,q13
+
+.LNEON_8n_init:
+ vst1.64 {q6,q7},[r7,:256]!
+ subs r8,r8,#8
+ vst1.64 {q8,q9},[r7,:256]!
+ vst1.64 {q10,q11},[r7,:256]!
+ vst1.64 {q12,q13},[r7,:256]!
+ bne .LNEON_8n_init
+
+ add r6,sp,#256
+ vld1.32 {d0,d1,d2,d3},[r1]!
+ add r10,sp,#8
+ vld1.32 {d30[0]},[r4,:32]
+ mov r9,r5
+ b .LNEON_8n_outer
+
+.align 4
+.LNEON_8n_outer:
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ veor d8,d8,d8
+ vzip.16 d28,d8
+ add r7,sp,#128
+ vld1.32 {d4,d5,d6,d7},[r3]!
+
+ vmlal.u32 q6,d28,d0[0]
+ vmlal.u32 q7,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q8,d28,d1[0]
+ vshl.i64 d29,d13,#16
+ vmlal.u32 q9,d28,d1[1]
+ vadd.u64 d29,d29,d12
+ vmlal.u32 q10,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q11,d28,d2[1]
+ vst1.32 {d28},[sp,:64] @ put aside smashed b[8*i+0]
+ vmlal.u32 q12,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q13,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q6,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q7,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q8,d29,d5[0]
+ vshr.u64 d12,d12,#16
+ vmlal.u32 q9,d29,d5[1]
+ vmlal.u32 q10,d29,d6[0]
+ vadd.u64 d12,d12,d13
+ vmlal.u32 q11,d29,d6[1]
+ vshr.u64 d12,d12,#16
+ vmlal.u32 q12,d29,d7[0]
+ vmlal.u32 q13,d29,d7[1]
+ vadd.u64 d14,d14,d12
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+0]
+ vmlal.u32 q7,d28,d0[0]
+ vld1.64 {q6},[r6,:128]!
+ vmlal.u32 q8,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q9,d28,d1[0]
+ vshl.i64 d29,d15,#16
+ vmlal.u32 q10,d28,d1[1]
+ vadd.u64 d29,d29,d14
+ vmlal.u32 q11,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q12,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+1]
+ vmlal.u32 q13,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q6,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q7,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q8,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q9,d29,d5[0]
+ vshr.u64 d14,d14,#16
+ vmlal.u32 q10,d29,d5[1]
+ vmlal.u32 q11,d29,d6[0]
+ vadd.u64 d14,d14,d15
+ vmlal.u32 q12,d29,d6[1]
+ vshr.u64 d14,d14,#16
+ vmlal.u32 q13,d29,d7[0]
+ vmlal.u32 q6,d29,d7[1]
+ vadd.u64 d16,d16,d14
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+1]
+ vmlal.u32 q8,d28,d0[0]
+ vld1.64 {q7},[r6,:128]!
+ vmlal.u32 q9,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q10,d28,d1[0]
+ vshl.i64 d29,d17,#16
+ vmlal.u32 q11,d28,d1[1]
+ vadd.u64 d29,d29,d16
+ vmlal.u32 q12,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q13,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+2]
+ vmlal.u32 q6,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q7,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q8,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q9,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q10,d29,d5[0]
+ vshr.u64 d16,d16,#16
+ vmlal.u32 q11,d29,d5[1]
+ vmlal.u32 q12,d29,d6[0]
+ vadd.u64 d16,d16,d17
+ vmlal.u32 q13,d29,d6[1]
+ vshr.u64 d16,d16,#16
+ vmlal.u32 q6,d29,d7[0]
+ vmlal.u32 q7,d29,d7[1]
+ vadd.u64 d18,d18,d16
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+2]
+ vmlal.u32 q9,d28,d0[0]
+ vld1.64 {q8},[r6,:128]!
+ vmlal.u32 q10,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q11,d28,d1[0]
+ vshl.i64 d29,d19,#16
+ vmlal.u32 q12,d28,d1[1]
+ vadd.u64 d29,d29,d18
+ vmlal.u32 q13,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q6,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+3]
+ vmlal.u32 q7,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q8,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q9,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q10,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q11,d29,d5[0]
+ vshr.u64 d18,d18,#16
+ vmlal.u32 q12,d29,d5[1]
+ vmlal.u32 q13,d29,d6[0]
+ vadd.u64 d18,d18,d19
+ vmlal.u32 q6,d29,d6[1]
+ vshr.u64 d18,d18,#16
+ vmlal.u32 q7,d29,d7[0]
+ vmlal.u32 q8,d29,d7[1]
+ vadd.u64 d20,d20,d18
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+3]
+ vmlal.u32 q10,d28,d0[0]
+ vld1.64 {q9},[r6,:128]!
+ vmlal.u32 q11,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q12,d28,d1[0]
+ vshl.i64 d29,d21,#16
+ vmlal.u32 q13,d28,d1[1]
+ vadd.u64 d29,d29,d20
+ vmlal.u32 q6,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q7,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+4]
+ vmlal.u32 q8,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q9,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q10,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q11,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q12,d29,d5[0]
+ vshr.u64 d20,d20,#16
+ vmlal.u32 q13,d29,d5[1]
+ vmlal.u32 q6,d29,d6[0]
+ vadd.u64 d20,d20,d21
+ vmlal.u32 q7,d29,d6[1]
+ vshr.u64 d20,d20,#16
+ vmlal.u32 q8,d29,d7[0]
+ vmlal.u32 q9,d29,d7[1]
+ vadd.u64 d22,d22,d20
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+4]
+ vmlal.u32 q11,d28,d0[0]
+ vld1.64 {q10},[r6,:128]!
+ vmlal.u32 q12,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q13,d28,d1[0]
+ vshl.i64 d29,d23,#16
+ vmlal.u32 q6,d28,d1[1]
+ vadd.u64 d29,d29,d22
+ vmlal.u32 q7,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q8,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+5]
+ vmlal.u32 q9,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q10,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q11,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q12,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q13,d29,d5[0]
+ vshr.u64 d22,d22,#16
+ vmlal.u32 q6,d29,d5[1]
+ vmlal.u32 q7,d29,d6[0]
+ vadd.u64 d22,d22,d23
+ vmlal.u32 q8,d29,d6[1]
+ vshr.u64 d22,d22,#16
+ vmlal.u32 q9,d29,d7[0]
+ vmlal.u32 q10,d29,d7[1]
+ vadd.u64 d24,d24,d22
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+5]
+ vmlal.u32 q12,d28,d0[0]
+ vld1.64 {q11},[r6,:128]!
+ vmlal.u32 q13,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q6,d28,d1[0]
+ vshl.i64 d29,d25,#16
+ vmlal.u32 q7,d28,d1[1]
+ vadd.u64 d29,d29,d24
+ vmlal.u32 q8,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q9,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+6]
+ vmlal.u32 q10,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q11,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q12,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q13,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q6,d29,d5[0]
+ vshr.u64 d24,d24,#16
+ vmlal.u32 q7,d29,d5[1]
+ vmlal.u32 q8,d29,d6[0]
+ vadd.u64 d24,d24,d25
+ vmlal.u32 q9,d29,d6[1]
+ vshr.u64 d24,d24,#16
+ vmlal.u32 q10,d29,d7[0]
+ vmlal.u32 q11,d29,d7[1]
+ vadd.u64 d26,d26,d24
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+6]
+ vmlal.u32 q13,d28,d0[0]
+ vld1.64 {q12},[r6,:128]!
+ vmlal.u32 q6,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q7,d28,d1[0]
+ vshl.i64 d29,d27,#16
+ vmlal.u32 q8,d28,d1[1]
+ vadd.u64 d29,d29,d26
+ vmlal.u32 q9,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q10,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+7]
+ vmlal.u32 q11,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q12,d28,d3[1]
+ vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0]
+ vmlal.u32 q13,d29,d4[0]
+ vld1.32 {d0,d1,d2,d3},[r1]!
+ vmlal.u32 q6,d29,d4[1]
+ vmlal.u32 q7,d29,d5[0]
+ vshr.u64 d26,d26,#16
+ vmlal.u32 q8,d29,d5[1]
+ vmlal.u32 q9,d29,d6[0]
+ vadd.u64 d26,d26,d27
+ vmlal.u32 q10,d29,d6[1]
+ vshr.u64 d26,d26,#16
+ vmlal.u32 q11,d29,d7[0]
+ vmlal.u32 q12,d29,d7[1]
+ vadd.u64 d12,d12,d26
+ vst1.32 {d29},[r10,:64] @ put aside smashed m[8*i+7]
+ add r10,sp,#8 @ rewind
+ sub r8,r5,#8
+ b .LNEON_8n_inner
+
+.align 4
+.LNEON_8n_inner:
+ subs r8,r8,#8
+ vmlal.u32 q6,d28,d0[0]
+ vld1.64 {q13},[r6,:128]
+ vmlal.u32 q7,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+0]
+ vmlal.u32 q8,d28,d1[0]
+ vld1.32 {d4,d5,d6,d7},[r3]!
+ vmlal.u32 q9,d28,d1[1]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q10,d28,d2[0]
+ vmlal.u32 q11,d28,d2[1]
+ vmlal.u32 q12,d28,d3[0]
+ vmlal.u32 q13,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+1]
+ vmlal.u32 q6,d29,d4[0]
+ vmlal.u32 q7,d29,d4[1]
+ vmlal.u32 q8,d29,d5[0]
+ vmlal.u32 q9,d29,d5[1]
+ vmlal.u32 q10,d29,d6[0]
+ vmlal.u32 q11,d29,d6[1]
+ vmlal.u32 q12,d29,d7[0]
+ vmlal.u32 q13,d29,d7[1]
+ vst1.64 {q6},[r7,:128]!
+ vmlal.u32 q7,d28,d0[0]
+ vld1.64 {q6},[r6,:128]
+ vmlal.u32 q8,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+1]
+ vmlal.u32 q9,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q10,d28,d1[1]
+ vmlal.u32 q11,d28,d2[0]
+ vmlal.u32 q12,d28,d2[1]
+ vmlal.u32 q13,d28,d3[0]
+ vmlal.u32 q6,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+2]
+ vmlal.u32 q7,d29,d4[0]
+ vmlal.u32 q8,d29,d4[1]
+ vmlal.u32 q9,d29,d5[0]
+ vmlal.u32 q10,d29,d5[1]
+ vmlal.u32 q11,d29,d6[0]
+ vmlal.u32 q12,d29,d6[1]
+ vmlal.u32 q13,d29,d7[0]
+ vmlal.u32 q6,d29,d7[1]
+ vst1.64 {q7},[r7,:128]!
+ vmlal.u32 q8,d28,d0[0]
+ vld1.64 {q7},[r6,:128]
+ vmlal.u32 q9,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+2]
+ vmlal.u32 q10,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q11,d28,d1[1]
+ vmlal.u32 q12,d28,d2[0]
+ vmlal.u32 q13,d28,d2[1]
+ vmlal.u32 q6,d28,d3[0]
+ vmlal.u32 q7,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+3]
+ vmlal.u32 q8,d29,d4[0]
+ vmlal.u32 q9,d29,d4[1]
+ vmlal.u32 q10,d29,d5[0]
+ vmlal.u32 q11,d29,d5[1]
+ vmlal.u32 q12,d29,d6[0]
+ vmlal.u32 q13,d29,d6[1]
+ vmlal.u32 q6,d29,d7[0]
+ vmlal.u32 q7,d29,d7[1]
+ vst1.64 {q8},[r7,:128]!
+ vmlal.u32 q9,d28,d0[0]
+ vld1.64 {q8},[r6,:128]
+ vmlal.u32 q10,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+3]
+ vmlal.u32 q11,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q12,d28,d1[1]
+ vmlal.u32 q13,d28,d2[0]
+ vmlal.u32 q6,d28,d2[1]
+ vmlal.u32 q7,d28,d3[0]
+ vmlal.u32 q8,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+4]
+ vmlal.u32 q9,d29,d4[0]
+ vmlal.u32 q10,d29,d4[1]
+ vmlal.u32 q11,d29,d5[0]
+ vmlal.u32 q12,d29,d5[1]
+ vmlal.u32 q13,d29,d6[0]
+ vmlal.u32 q6,d29,d6[1]
+ vmlal.u32 q7,d29,d7[0]
+ vmlal.u32 q8,d29,d7[1]
+ vst1.64 {q9},[r7,:128]!
+ vmlal.u32 q10,d28,d0[0]
+ vld1.64 {q9},[r6,:128]
+ vmlal.u32 q11,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+4]
+ vmlal.u32 q12,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q13,d28,d1[1]
+ vmlal.u32 q6,d28,d2[0]
+ vmlal.u32 q7,d28,d2[1]
+ vmlal.u32 q8,d28,d3[0]
+ vmlal.u32 q9,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+5]
+ vmlal.u32 q10,d29,d4[0]
+ vmlal.u32 q11,d29,d4[1]
+ vmlal.u32 q12,d29,d5[0]
+ vmlal.u32 q13,d29,d5[1]
+ vmlal.u32 q6,d29,d6[0]
+ vmlal.u32 q7,d29,d6[1]
+ vmlal.u32 q8,d29,d7[0]
+ vmlal.u32 q9,d29,d7[1]
+ vst1.64 {q10},[r7,:128]!
+ vmlal.u32 q11,d28,d0[0]
+ vld1.64 {q10},[r6,:128]
+ vmlal.u32 q12,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+5]
+ vmlal.u32 q13,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q6,d28,d1[1]
+ vmlal.u32 q7,d28,d2[0]
+ vmlal.u32 q8,d28,d2[1]
+ vmlal.u32 q9,d28,d3[0]
+ vmlal.u32 q10,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+6]
+ vmlal.u32 q11,d29,d4[0]
+ vmlal.u32 q12,d29,d4[1]
+ vmlal.u32 q13,d29,d5[0]
+ vmlal.u32 q6,d29,d5[1]
+ vmlal.u32 q7,d29,d6[0]
+ vmlal.u32 q8,d29,d6[1]
+ vmlal.u32 q9,d29,d7[0]
+ vmlal.u32 q10,d29,d7[1]
+ vst1.64 {q11},[r7,:128]!
+ vmlal.u32 q12,d28,d0[0]
+ vld1.64 {q11},[r6,:128]
+ vmlal.u32 q13,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+6]
+ vmlal.u32 q6,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q7,d28,d1[1]
+ vmlal.u32 q8,d28,d2[0]
+ vmlal.u32 q9,d28,d2[1]
+ vmlal.u32 q10,d28,d3[0]
+ vmlal.u32 q11,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+7]
+ vmlal.u32 q12,d29,d4[0]
+ vmlal.u32 q13,d29,d4[1]
+ vmlal.u32 q6,d29,d5[0]
+ vmlal.u32 q7,d29,d5[1]
+ vmlal.u32 q8,d29,d6[0]
+ vmlal.u32 q9,d29,d6[1]
+ vmlal.u32 q10,d29,d7[0]
+ vmlal.u32 q11,d29,d7[1]
+ vst1.64 {q12},[r7,:128]!
+ vmlal.u32 q13,d28,d0[0]
+ vld1.64 {q12},[r6,:128]
+ vmlal.u32 q6,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+7]
+ vmlal.u32 q7,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q8,d28,d1[1]
+ vmlal.u32 q9,d28,d2[0]
+ vmlal.u32 q10,d28,d2[1]
+ vmlal.u32 q11,d28,d3[0]
+ vmlal.u32 q12,d28,d3[1]
+ it eq
+ subeq r1,r1,r5,lsl#2 @ rewind
+ vmlal.u32 q13,d29,d4[0]
+ vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0]
+ vmlal.u32 q6,d29,d4[1]
+ vld1.32 {d0,d1,d2,d3},[r1]!
+ vmlal.u32 q7,d29,d5[0]
+ add r10,sp,#8 @ rewind
+ vmlal.u32 q8,d29,d5[1]
+ vmlal.u32 q9,d29,d6[0]
+ vmlal.u32 q10,d29,d6[1]
+ vmlal.u32 q11,d29,d7[0]
+ vst1.64 {q13},[r7,:128]!
+ vmlal.u32 q12,d29,d7[1]
+
+ bne .LNEON_8n_inner
+ add r6,sp,#128
+ vst1.64 {q6,q7},[r7,:256]!
+ veor q2,q2,q2 @ d4-d5
+ vst1.64 {q8,q9},[r7,:256]!
+ veor q3,q3,q3 @ d6-d7
+ vst1.64 {q10,q11},[r7,:256]!
+ vst1.64 {q12},[r7,:128]
+
+ subs r9,r9,#8
+ vld1.64 {q6,q7},[r6,:256]!
+ vld1.64 {q8,q9},[r6,:256]!
+ vld1.64 {q10,q11},[r6,:256]!
+ vld1.64 {q12,q13},[r6,:256]!
+
+ itt ne
+ subne r3,r3,r5,lsl#2 @ rewind
+ bne .LNEON_8n_outer
+
+ add r7,sp,#128
+ vst1.64 {q2,q3}, [sp,:256]! @ start wiping stack frame
+ vshr.u64 d10,d12,#16
+ vst1.64 {q2,q3},[sp,:256]!
+ vadd.u64 d13,d13,d10
+ vst1.64 {q2,q3}, [sp,:256]!
+ vshr.u64 d10,d13,#16
+ vst1.64 {q2,q3}, [sp,:256]!
+ vzip.16 d12,d13
+
+ mov r8,r5
+ b .LNEON_tail_entry
+
+.align 4
+.LNEON_tail:
+ vadd.u64 d12,d12,d10
+ vshr.u64 d10,d12,#16
+ vld1.64 {q8,q9}, [r6, :256]!
+ vadd.u64 d13,d13,d10
+ vld1.64 {q10,q11}, [r6, :256]!
+ vshr.u64 d10,d13,#16
+ vld1.64 {q12,q13}, [r6, :256]!
+ vzip.16 d12,d13
+
+.LNEON_tail_entry:
+ vadd.u64 d14,d14,d10
+ vst1.32 {d12[0]}, [r7, :32]!
+ vshr.u64 d10,d14,#16
+ vadd.u64 d15,d15,d10
+ vshr.u64 d10,d15,#16
+ vzip.16 d14,d15
+ vadd.u64 d16,d16,d10
+ vst1.32 {d14[0]}, [r7, :32]!
+ vshr.u64 d10,d16,#16
+ vadd.u64 d17,d17,d10
+ vshr.u64 d10,d17,#16
+ vzip.16 d16,d17
+ vadd.u64 d18,d18,d10
+ vst1.32 {d16[0]}, [r7, :32]!
+ vshr.u64 d10,d18,#16
+ vadd.u64 d19,d19,d10
+ vshr.u64 d10,d19,#16
+ vzip.16 d18,d19
+ vadd.u64 d20,d20,d10
+ vst1.32 {d18[0]}, [r7, :32]!
+ vshr.u64 d10,d20,#16
+ vadd.u64 d21,d21,d10
+ vshr.u64 d10,d21,#16
+ vzip.16 d20,d21
+ vadd.u64 d22,d22,d10
+ vst1.32 {d20[0]}, [r7, :32]!
+ vshr.u64 d10,d22,#16
+ vadd.u64 d23,d23,d10
+ vshr.u64 d10,d23,#16
+ vzip.16 d22,d23
+ vadd.u64 d24,d24,d10
+ vst1.32 {d22[0]}, [r7, :32]!
+ vshr.u64 d10,d24,#16
+ vadd.u64 d25,d25,d10
+ vshr.u64 d10,d25,#16
+ vzip.16 d24,d25
+ vadd.u64 d26,d26,d10
+ vst1.32 {d24[0]}, [r7, :32]!
+ vshr.u64 d10,d26,#16
+ vadd.u64 d27,d27,d10
+ vshr.u64 d10,d27,#16
+ vzip.16 d26,d27
+ vld1.64 {q6,q7}, [r6, :256]!
+ subs r8,r8,#8
+ vst1.32 {d26[0]}, [r7, :32]!
+ bne .LNEON_tail
+
+ vst1.32 {d10[0]}, [r7, :32] @ top-most bit
+ sub r3,r3,r5,lsl#2 @ rewind r3
+ subs r1,sp,#0 @ clear carry flag
+ add r2,sp,r5,lsl#2
+
+.LNEON_sub:
+ ldmia r1!, {r4,r5,r6,r7}
+ ldmia r3!, {r8,r9,r10,r11}
+ sbcs r8, r4,r8
+ sbcs r9, r5,r9
+ sbcs r10,r6,r10
+ sbcs r11,r7,r11
+ teq r1,r2 @ preserves carry
+ stmia r0!, {r8,r9,r10,r11}
+ bne .LNEON_sub
+
+ ldr r10, [r1] @ load top-most bit
+ mov r11,sp
+ veor q0,q0,q0
+ sub r11,r2,r11 @ this is num*4
+ veor q1,q1,q1
+ mov r1,sp
+ sub r0,r0,r11 @ rewind r0
+ mov r3,r2 @ second 3/4th of frame
+ sbcs r10,r10,#0 @ result is carry flag
+
+.LNEON_copy_n_zap:
+ ldmia r1!, {r4,r5,r6,r7}
+ ldmia r0, {r8,r9,r10,r11}
+ it cc
+ movcc r8, r4
+ vst1.64 {q0,q1}, [r3,:256]! @ wipe
+ itt cc
+ movcc r9, r5
+ movcc r10,r6
+ vst1.64 {q0,q1}, [r3,:256]! @ wipe
+ it cc
+ movcc r11,r7
+ ldmia r1, {r4,r5,r6,r7}
+ stmia r0!, {r8,r9,r10,r11}
+ sub r1,r1,#16
+ ldmia r0, {r8,r9,r10,r11}
+ it cc
+ movcc r8, r4
+ vst1.64 {q0,q1}, [r1,:256]! @ wipe
+ itt cc
+ movcc r9, r5
+ movcc r10,r6
+ vst1.64 {q0,q1}, [r3,:256]! @ wipe
+ it cc
+ movcc r11,r7
+ teq r1,r2 @ preserves carry
+ stmia r0!, {r8,r9,r10,r11}
+ bne .LNEON_copy_n_zap
+
+ mov sp,ip
+ vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
+ bx lr @ bx lr
+.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
+#endif
+.byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/bcm/armv8-mont-apple.S b/gen/bcm/armv8-mont-apple.S
new file mode 100644
index 0000000..cf798a3
--- /dev/null
+++ b/gen/bcm/armv8-mont-apple.S
@@ -0,0 +1,1425 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl _bn_mul_mont
+.private_extern _bn_mul_mont
+
+.align 5
+_bn_mul_mont:
+ AARCH64_SIGN_LINK_REGISTER
+ tst x5,#7
+ b.eq __bn_sqr8x_mont
+ tst x5,#3
+ b.eq __bn_mul4x_mont
+Lmul_mont:
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+
+ ldr x9,[x2],#8 // bp[0]
+ sub x22,sp,x5,lsl#3
+ ldp x7,x8,[x1],#16 // ap[0..1]
+ lsl x5,x5,#3
+ ldr x4,[x4] // *n0
+ and x22,x22,#-16 // ABI says so
+ ldp x13,x14,[x3],#16 // np[0..1]
+
+ mul x6,x7,x9 // ap[0]*bp[0]
+ sub x21,x5,#16 // j=num-2
+ umulh x7,x7,x9
+ mul x10,x8,x9 // ap[1]*bp[0]
+ umulh x11,x8,x9
+
+ mul x15,x6,x4 // "tp[0]"*n0
+ mov sp,x22 // alloca
+
+ // (*) mul x12,x13,x15 // np[0]*m1
+ umulh x13,x13,x15
+ mul x16,x14,x15 // np[1]*m1
+ // (*) adds x12,x12,x6 // discarded
+ // (*) As for removal of first multiplication and addition
+ // instructions. The outcome of first addition is
+ // guaranteed to be zero, which leaves two computationally
+ // significant outcomes: it either carries or not. Then
+ // question is when does it carry? Is there alternative
+ // way to deduce it? If you follow operations, you can
+ // observe that condition for carry is quite simple:
+ // x6 being non-zero. So that carry can be calculated
+ // by adding -1 to x6. That's what next instruction does.
+ subs xzr,x6,#1 // (*)
+ umulh x17,x14,x15
+ adc x13,x13,xzr
+ cbz x21,L1st_skip
+
+L1st:
+ ldr x8,[x1],#8
+ adds x6,x10,x7
+ sub x21,x21,#8 // j--
+ adc x7,x11,xzr
+
+ ldr x14,[x3],#8
+ adds x12,x16,x13
+ mul x10,x8,x9 // ap[j]*bp[0]
+ adc x13,x17,xzr
+ umulh x11,x8,x9
+
+ adds x12,x12,x6
+ mul x16,x14,x15 // np[j]*m1
+ adc x13,x13,xzr
+ umulh x17,x14,x15
+ str x12,[x22],#8 // tp[j-1]
+ cbnz x21,L1st
+
+L1st_skip:
+ adds x6,x10,x7
+ sub x1,x1,x5 // rewind x1
+ adc x7,x11,xzr
+
+ adds x12,x16,x13
+ sub x3,x3,x5 // rewind x3
+ adc x13,x17,xzr
+
+ adds x12,x12,x6
+ sub x20,x5,#8 // i=num-1
+ adcs x13,x13,x7
+
+ adc x19,xzr,xzr // upmost overflow bit
+ stp x12,x13,[x22]
+
+Louter:
+ ldr x9,[x2],#8 // bp[i]
+ ldp x7,x8,[x1],#16
+ ldr x23,[sp] // tp[0]
+ add x22,sp,#8
+
+ mul x6,x7,x9 // ap[0]*bp[i]
+ sub x21,x5,#16 // j=num-2
+ umulh x7,x7,x9
+ ldp x13,x14,[x3],#16
+ mul x10,x8,x9 // ap[1]*bp[i]
+ adds x6,x6,x23
+ umulh x11,x8,x9
+ adc x7,x7,xzr
+
+ mul x15,x6,x4
+ sub x20,x20,#8 // i--
+
+ // (*) mul x12,x13,x15 // np[0]*m1
+ umulh x13,x13,x15
+ mul x16,x14,x15 // np[1]*m1
+ // (*) adds x12,x12,x6
+ subs xzr,x6,#1 // (*)
+ umulh x17,x14,x15
+ cbz x21,Linner_skip
+
+Linner:
+ ldr x8,[x1],#8
+ adc x13,x13,xzr
+ ldr x23,[x22],#8 // tp[j]
+ adds x6,x10,x7
+ sub x21,x21,#8 // j--
+ adc x7,x11,xzr
+
+ adds x12,x16,x13
+ ldr x14,[x3],#8
+ adc x13,x17,xzr
+
+ mul x10,x8,x9 // ap[j]*bp[i]
+ adds x6,x6,x23
+ umulh x11,x8,x9
+ adc x7,x7,xzr
+
+ mul x16,x14,x15 // np[j]*m1
+ adds x12,x12,x6
+ umulh x17,x14,x15
+ str x12,[x22,#-16] // tp[j-1]
+ cbnz x21,Linner
+
+Linner_skip:
+ ldr x23,[x22],#8 // tp[j]
+ adc x13,x13,xzr
+ adds x6,x10,x7
+ sub x1,x1,x5 // rewind x1
+ adc x7,x11,xzr
+
+ adds x12,x16,x13
+ sub x3,x3,x5 // rewind x3
+ adcs x13,x17,x19
+ adc x19,xzr,xzr
+
+ adds x6,x6,x23
+ adc x7,x7,xzr
+
+ adds x12,x12,x6
+ adcs x13,x13,x7
+ adc x19,x19,xzr // upmost overflow bit
+ stp x12,x13,[x22,#-16]
+
+ cbnz x20,Louter
+
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ ldr x23,[sp] // tp[0]
+ add x22,sp,#8
+ ldr x14,[x3],#8 // np[0]
+ subs x21,x5,#8 // j=num-1 and clear borrow
+ mov x1,x0
+Lsub:
+ sbcs x8,x23,x14 // tp[j]-np[j]
+ ldr x23,[x22],#8
+ sub x21,x21,#8 // j--
+ ldr x14,[x3],#8
+ str x8,[x1],#8 // rp[j]=tp[j]-np[j]
+ cbnz x21,Lsub
+
+ sbcs x8,x23,x14
+ sbcs x19,x19,xzr // did it borrow?
+ str x8,[x1],#8 // rp[num-1]
+
+ ldr x23,[sp] // tp[0]
+ add x22,sp,#8
+ ldr x8,[x0],#8 // rp[0]
+ sub x5,x5,#8 // num--
+ nop
+Lcond_copy:
+ sub x5,x5,#8 // num--
+ csel x14,x23,x8,lo // did it borrow?
+ ldr x23,[x22],#8
+ ldr x8,[x0],#8
+ str xzr,[x22,#-16] // wipe tp
+ str x14,[x0,#-16]
+ cbnz x5,Lcond_copy
+
+ csel x14,x23,x8,lo
+ str xzr,[x22,#-8] // wipe tp
+ str x14,[x0,#-8]
+
+ ldp x19,x20,[x29,#16]
+ mov sp,x29
+ ldp x21,x22,[x29,#32]
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldr x29,[sp],#64
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.align 5
+__bn_sqr8x_mont:
+ // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
+ // only from bn_mul_mont which has already signed the return address.
+ cmp x1,x2
+ b.ne __bn_mul4x_mont
+Lsqr8x_mont:
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ stp x0,x3,[sp,#96] // offload rp and np
+
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ ldp x10,x11,[x1,#8*4]
+ ldp x12,x13,[x1,#8*6]
+
+ sub x2,sp,x5,lsl#4
+ lsl x5,x5,#3
+ ldr x4,[x4] // *n0
+ mov sp,x2 // alloca
+ sub x27,x5,#8*8
+ b Lsqr8x_zero_start
+
+Lsqr8x_zero:
+ sub x27,x27,#8*8
+ stp xzr,xzr,[x2,#8*0]
+ stp xzr,xzr,[x2,#8*2]
+ stp xzr,xzr,[x2,#8*4]
+ stp xzr,xzr,[x2,#8*6]
+Lsqr8x_zero_start:
+ stp xzr,xzr,[x2,#8*8]
+ stp xzr,xzr,[x2,#8*10]
+ stp xzr,xzr,[x2,#8*12]
+ stp xzr,xzr,[x2,#8*14]
+ add x2,x2,#8*16
+ cbnz x27,Lsqr8x_zero
+
+ add x3,x1,x5
+ add x1,x1,#8*8
+ mov x19,xzr
+ mov x20,xzr
+ mov x21,xzr
+ mov x22,xzr
+ mov x23,xzr
+ mov x24,xzr
+ mov x25,xzr
+ mov x26,xzr
+ mov x2,sp
+ str x4,[x29,#112] // offload n0
+
+ // Multiply everything but a[i]*a[i]
+.align 4
+Lsqr8x_outer_loop:
+ // a[1]a[0] (i)
+ // a[2]a[0]
+ // a[3]a[0]
+ // a[4]a[0]
+ // a[5]a[0]
+ // a[6]a[0]
+ // a[7]a[0]
+ // a[2]a[1] (ii)
+ // a[3]a[1]
+ // a[4]a[1]
+ // a[5]a[1]
+ // a[6]a[1]
+ // a[7]a[1]
+ // a[3]a[2] (iii)
+ // a[4]a[2]
+ // a[5]a[2]
+ // a[6]a[2]
+ // a[7]a[2]
+ // a[4]a[3] (iv)
+ // a[5]a[3]
+ // a[6]a[3]
+ // a[7]a[3]
+ // a[5]a[4] (v)
+ // a[6]a[4]
+ // a[7]a[4]
+ // a[6]a[5] (vi)
+ // a[7]a[5]
+ // a[7]a[6] (vii)
+
+ mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
+ mul x15,x8,x6
+ mul x16,x9,x6
+ mul x17,x10,x6
+ adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
+ mul x14,x11,x6
+ adcs x21,x21,x15
+ mul x15,x12,x6
+ adcs x22,x22,x16
+ mul x16,x13,x6
+ adcs x23,x23,x17
+ umulh x17,x7,x6 // hi(a[1..7]*a[0])
+ adcs x24,x24,x14
+ umulh x14,x8,x6
+ adcs x25,x25,x15
+ umulh x15,x9,x6
+ adcs x26,x26,x16
+ umulh x16,x10,x6
+ stp x19,x20,[x2],#8*2 // t[0..1]
+ adc x19,xzr,xzr // t[8]
+ adds x21,x21,x17 // t[2]+lo(a[1]*a[0])
+ umulh x17,x11,x6
+ adcs x22,x22,x14
+ umulh x14,x12,x6
+ adcs x23,x23,x15
+ umulh x15,x13,x6
+ adcs x24,x24,x16
+ mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
+ adcs x25,x25,x17
+ mul x17,x9,x7
+ adcs x26,x26,x14
+ mul x14,x10,x7
+ adc x19,x19,x15
+
+ mul x15,x11,x7
+ adds x22,x22,x16
+ mul x16,x12,x7
+ adcs x23,x23,x17
+ mul x17,x13,x7
+ adcs x24,x24,x14
+ umulh x14,x8,x7 // hi(a[2..7]*a[1])
+ adcs x25,x25,x15
+ umulh x15,x9,x7
+ adcs x26,x26,x16
+ umulh x16,x10,x7
+ adcs x19,x19,x17
+ umulh x17,x11,x7
+ stp x21,x22,[x2],#8*2 // t[2..3]
+ adc x20,xzr,xzr // t[9]
+ adds x23,x23,x14
+ umulh x14,x12,x7
+ adcs x24,x24,x15
+ umulh x15,x13,x7
+ adcs x25,x25,x16
+ mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
+ adcs x26,x26,x17
+ mul x17,x10,x8
+ adcs x19,x19,x14
+ mul x14,x11,x8
+ adc x20,x20,x15
+
+ mul x15,x12,x8
+ adds x24,x24,x16
+ mul x16,x13,x8
+ adcs x25,x25,x17
+ umulh x17,x9,x8 // hi(a[3..7]*a[2])
+ adcs x26,x26,x14
+ umulh x14,x10,x8
+ adcs x19,x19,x15
+ umulh x15,x11,x8
+ adcs x20,x20,x16
+ umulh x16,x12,x8
+ stp x23,x24,[x2],#8*2 // t[4..5]
+ adc x21,xzr,xzr // t[10]
+ adds x25,x25,x17
+ umulh x17,x13,x8
+ adcs x26,x26,x14
+ mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
+ adcs x19,x19,x15
+ mul x15,x11,x9
+ adcs x20,x20,x16
+ mul x16,x12,x9
+ adc x21,x21,x17
+
+ mul x17,x13,x9
+ adds x26,x26,x14
+ umulh x14,x10,x9 // hi(a[4..7]*a[3])
+ adcs x19,x19,x15
+ umulh x15,x11,x9
+ adcs x20,x20,x16
+ umulh x16,x12,x9
+ adcs x21,x21,x17
+ umulh x17,x13,x9
+ stp x25,x26,[x2],#8*2 // t[6..7]
+ adc x22,xzr,xzr // t[11]
+ adds x19,x19,x14
+ mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
+ adcs x20,x20,x15
+ mul x15,x12,x10
+ adcs x21,x21,x16
+ mul x16,x13,x10
+ adc x22,x22,x17
+
+ umulh x17,x11,x10 // hi(a[5..7]*a[4])
+ adds x20,x20,x14
+ umulh x14,x12,x10
+ adcs x21,x21,x15
+ umulh x15,x13,x10
+ adcs x22,x22,x16
+ mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
+ adc x23,xzr,xzr // t[12]
+ adds x21,x21,x17
+ mul x17,x13,x11
+ adcs x22,x22,x14
+ umulh x14,x12,x11 // hi(a[6..7]*a[5])
+ adc x23,x23,x15
+
+ umulh x15,x13,x11
+ adds x22,x22,x16
+ mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
+ adcs x23,x23,x17
+ umulh x17,x13,x12 // hi(a[7]*a[6])
+ adc x24,xzr,xzr // t[13]
+ adds x23,x23,x14
+ sub x27,x3,x1 // done yet?
+ adc x24,x24,x15
+
+ adds x24,x24,x16
+ sub x14,x3,x5 // rewinded ap
+ adc x25,xzr,xzr // t[14]
+ add x25,x25,x17
+
+ cbz x27,Lsqr8x_outer_break
+
+ mov x4,x6
+ ldp x6,x7,[x2,#8*0]
+ ldp x8,x9,[x2,#8*2]
+ ldp x10,x11,[x2,#8*4]
+ ldp x12,x13,[x2,#8*6]
+ adds x19,x19,x6
+ adcs x20,x20,x7
+ ldp x6,x7,[x1,#8*0]
+ adcs x21,x21,x8
+ adcs x22,x22,x9
+ ldp x8,x9,[x1,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x1,#8*4]
+ adcs x25,x25,x12
+ mov x0,x1
+ adcs x26,xzr,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ //adc x28,xzr,xzr // moved below
+ mov x27,#-8*8
+
+ // a[8]a[0]
+ // a[9]a[0]
+ // a[a]a[0]
+ // a[b]a[0]
+ // a[c]a[0]
+ // a[d]a[0]
+ // a[e]a[0]
+ // a[f]a[0]
+ // a[8]a[1]
+ // a[f]a[1]........................
+ // a[8]a[2]
+ // a[f]a[2]........................
+ // a[8]a[3]
+ // a[f]a[3]........................
+ // a[8]a[4]
+ // a[f]a[4]........................
+ // a[8]a[5]
+ // a[f]a[5]........................
+ // a[8]a[6]
+ // a[f]a[6]........................
+ // a[8]a[7]
+ // a[f]a[7]........................
+Lsqr8x_mul:
+ mul x14,x6,x4
+ adc x28,xzr,xzr // carry bit, modulo-scheduled
+ mul x15,x7,x4
+ add x27,x27,#8
+ mul x16,x8,x4
+ mul x17,x9,x4
+ adds x19,x19,x14
+ mul x14,x10,x4
+ adcs x20,x20,x15
+ mul x15,x11,x4
+ adcs x21,x21,x16
+ mul x16,x12,x4
+ adcs x22,x22,x17
+ mul x17,x13,x4
+ adcs x23,x23,x14
+ umulh x14,x6,x4
+ adcs x24,x24,x15
+ umulh x15,x7,x4
+ adcs x25,x25,x16
+ umulh x16,x8,x4
+ adcs x26,x26,x17
+ umulh x17,x9,x4
+ adc x28,x28,xzr
+ str x19,[x2],#8
+ adds x19,x20,x14
+ umulh x14,x10,x4
+ adcs x20,x21,x15
+ umulh x15,x11,x4
+ adcs x21,x22,x16
+ umulh x16,x12,x4
+ adcs x22,x23,x17
+ umulh x17,x13,x4
+ ldr x4,[x0,x27]
+ adcs x23,x24,x14
+ adcs x24,x25,x15
+ adcs x25,x26,x16
+ adcs x26,x28,x17
+ //adc x28,xzr,xzr // moved above
+ cbnz x27,Lsqr8x_mul
+ // note that carry flag is guaranteed
+ // to be zero at this point
+ cmp x1,x3 // done yet?
+ b.eq Lsqr8x_break
+
+ ldp x6,x7,[x2,#8*0]
+ ldp x8,x9,[x2,#8*2]
+ ldp x10,x11,[x2,#8*4]
+ ldp x12,x13,[x2,#8*6]
+ adds x19,x19,x6
+ ldr x4,[x0,#-8*8]
+ adcs x20,x20,x7
+ ldp x6,x7,[x1,#8*0]
+ adcs x21,x21,x8
+ adcs x22,x22,x9
+ ldp x8,x9,[x1,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x1,#8*4]
+ adcs x25,x25,x12
+ mov x27,#-8*8
+ adcs x26,x26,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ //adc x28,xzr,xzr // moved above
+ b Lsqr8x_mul
+
+.align 4
+Lsqr8x_break:
+ ldp x6,x7,[x0,#8*0]
+ add x1,x0,#8*8
+ ldp x8,x9,[x0,#8*2]
+ sub x14,x3,x1 // is it last iteration?
+ ldp x10,x11,[x0,#8*4]
+ sub x15,x2,x14
+ ldp x12,x13,[x0,#8*6]
+ cbz x14,Lsqr8x_outer_loop
+
+ stp x19,x20,[x2,#8*0]
+ ldp x19,x20,[x15,#8*0]
+ stp x21,x22,[x2,#8*2]
+ ldp x21,x22,[x15,#8*2]
+ stp x23,x24,[x2,#8*4]
+ ldp x23,x24,[x15,#8*4]
+ stp x25,x26,[x2,#8*6]
+ mov x2,x15
+ ldp x25,x26,[x15,#8*6]
+ b Lsqr8x_outer_loop
+
+.align 4
+Lsqr8x_outer_break:
+ // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
+ ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
+ ldp x15,x16,[sp,#8*1]
+ ldp x11,x13,[x14,#8*2]
+ add x1,x14,#8*4
+ ldp x17,x14,[sp,#8*3]
+
+ stp x19,x20,[x2,#8*0]
+ mul x19,x7,x7
+ stp x21,x22,[x2,#8*2]
+ umulh x7,x7,x7
+ stp x23,x24,[x2,#8*4]
+ mul x8,x9,x9
+ stp x25,x26,[x2,#8*6]
+ mov x2,sp
+ umulh x9,x9,x9
+ adds x20,x7,x15,lsl#1
+ extr x15,x16,x15,#63
+ sub x27,x5,#8*4
+
+Lsqr4x_shift_n_add:
+ adcs x21,x8,x15
+ extr x16,x17,x16,#63
+ sub x27,x27,#8*4
+ adcs x22,x9,x16
+ ldp x15,x16,[x2,#8*5]
+ mul x10,x11,x11
+ ldp x7,x9,[x1],#8*2
+ umulh x11,x11,x11
+ mul x12,x13,x13
+ umulh x13,x13,x13
+ extr x17,x14,x17,#63
+ stp x19,x20,[x2,#8*0]
+ adcs x23,x10,x17
+ extr x14,x15,x14,#63
+ stp x21,x22,[x2,#8*2]
+ adcs x24,x11,x14
+ ldp x17,x14,[x2,#8*7]
+ extr x15,x16,x15,#63
+ adcs x25,x12,x15
+ extr x16,x17,x16,#63
+ adcs x26,x13,x16
+ ldp x15,x16,[x2,#8*9]
+ mul x6,x7,x7
+ ldp x11,x13,[x1],#8*2
+ umulh x7,x7,x7
+ mul x8,x9,x9
+ umulh x9,x9,x9
+ stp x23,x24,[x2,#8*4]
+ extr x17,x14,x17,#63
+ stp x25,x26,[x2,#8*6]
+ add x2,x2,#8*8
+ adcs x19,x6,x17
+ extr x14,x15,x14,#63
+ adcs x20,x7,x14
+ ldp x17,x14,[x2,#8*3]
+ extr x15,x16,x15,#63
+ cbnz x27,Lsqr4x_shift_n_add
+ ldp x1,x4,[x29,#104] // pull np and n0
+
+ adcs x21,x8,x15
+ extr x16,x17,x16,#63
+ adcs x22,x9,x16
+ ldp x15,x16,[x2,#8*5]
+ mul x10,x11,x11
+ umulh x11,x11,x11
+ stp x19,x20,[x2,#8*0]
+ mul x12,x13,x13
+ umulh x13,x13,x13
+ stp x21,x22,[x2,#8*2]
+ extr x17,x14,x17,#63
+ adcs x23,x10,x17
+ extr x14,x15,x14,#63
+ ldp x19,x20,[sp,#8*0]
+ adcs x24,x11,x14
+ extr x15,x16,x15,#63
+ ldp x6,x7,[x1,#8*0]
+ adcs x25,x12,x15
+ extr x16,xzr,x16,#63
+ ldp x8,x9,[x1,#8*2]
+ adc x26,x13,x16
+ ldp x10,x11,[x1,#8*4]
+
+ // Reduce by 512 bits per iteration
+ mul x28,x4,x19 // t[0]*n0
+ ldp x12,x13,[x1,#8*6]
+ add x3,x1,x5
+ ldp x21,x22,[sp,#8*2]
+ stp x23,x24,[x2,#8*4]
+ ldp x23,x24,[sp,#8*4]
+ stp x25,x26,[x2,#8*6]
+ ldp x25,x26,[sp,#8*6]
+ add x1,x1,#8*8
+ mov x30,xzr // initial top-most carry
+ mov x2,sp
+ mov x27,#8
+
+Lsqr8x_reduction:
+ // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
+ mul x15,x7,x28
+ sub x27,x27,#1
+ mul x16,x8,x28
+ str x28,[x2],#8 // put aside t[0]*n0 for tail processing
+ mul x17,x9,x28
+ // (*) adds xzr,x19,x14
+ subs xzr,x19,#1 // (*)
+ mul x14,x10,x28
+ adcs x19,x20,x15
+ mul x15,x11,x28
+ adcs x20,x21,x16
+ mul x16,x12,x28
+ adcs x21,x22,x17
+ mul x17,x13,x28
+ adcs x22,x23,x14
+ umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
+ adcs x23,x24,x15
+ umulh x15,x7,x28
+ adcs x24,x25,x16
+ umulh x16,x8,x28
+ adcs x25,x26,x17
+ umulh x17,x9,x28
+ adc x26,xzr,xzr
+ adds x19,x19,x14
+ umulh x14,x10,x28
+ adcs x20,x20,x15
+ umulh x15,x11,x28
+ adcs x21,x21,x16
+ umulh x16,x12,x28
+ adcs x22,x22,x17
+ umulh x17,x13,x28
+ mul x28,x4,x19 // next t[0]*n0
+ adcs x23,x23,x14
+ adcs x24,x24,x15
+ adcs x25,x25,x16
+ adc x26,x26,x17
+ cbnz x27,Lsqr8x_reduction
+
+ ldp x14,x15,[x2,#8*0]
+ ldp x16,x17,[x2,#8*2]
+ mov x0,x2
+ sub x27,x3,x1 // done yet?
+ adds x19,x19,x14
+ adcs x20,x20,x15
+ ldp x14,x15,[x2,#8*4]
+ adcs x21,x21,x16
+ adcs x22,x22,x17
+ ldp x16,x17,[x2,#8*6]
+ adcs x23,x23,x14
+ adcs x24,x24,x15
+ adcs x25,x25,x16
+ adcs x26,x26,x17
+ //adc x28,xzr,xzr // moved below
+ cbz x27,Lsqr8x8_post_condition
+
+ ldr x4,[x2,#-8*8]
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ ldp x10,x11,[x1,#8*4]
+ mov x27,#-8*8
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+
+Lsqr8x_tail:
+ mul x14,x6,x4
+ adc x28,xzr,xzr // carry bit, modulo-scheduled
+ mul x15,x7,x4
+ add x27,x27,#8
+ mul x16,x8,x4
+ mul x17,x9,x4
+ adds x19,x19,x14
+ mul x14,x10,x4
+ adcs x20,x20,x15
+ mul x15,x11,x4
+ adcs x21,x21,x16
+ mul x16,x12,x4
+ adcs x22,x22,x17
+ mul x17,x13,x4
+ adcs x23,x23,x14
+ umulh x14,x6,x4
+ adcs x24,x24,x15
+ umulh x15,x7,x4
+ adcs x25,x25,x16
+ umulh x16,x8,x4
+ adcs x26,x26,x17
+ umulh x17,x9,x4
+ adc x28,x28,xzr
+ str x19,[x2],#8
+ adds x19,x20,x14
+ umulh x14,x10,x4
+ adcs x20,x21,x15
+ umulh x15,x11,x4
+ adcs x21,x22,x16
+ umulh x16,x12,x4
+ adcs x22,x23,x17
+ umulh x17,x13,x4
+ ldr x4,[x0,x27]
+ adcs x23,x24,x14
+ adcs x24,x25,x15
+ adcs x25,x26,x16
+ adcs x26,x28,x17
+ //adc x28,xzr,xzr // moved above
+ cbnz x27,Lsqr8x_tail
+ // note that carry flag is guaranteed
+ // to be zero at this point
+ ldp x6,x7,[x2,#8*0]
+ sub x27,x3,x1 // done yet?
+ sub x16,x3,x5 // rewinded np
+ ldp x8,x9,[x2,#8*2]
+ ldp x10,x11,[x2,#8*4]
+ ldp x12,x13,[x2,#8*6]
+ cbz x27,Lsqr8x_tail_break
+
+ ldr x4,[x0,#-8*8]
+ adds x19,x19,x6
+ adcs x20,x20,x7
+ ldp x6,x7,[x1,#8*0]
+ adcs x21,x21,x8
+ adcs x22,x22,x9
+ ldp x8,x9,[x1,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x1,#8*4]
+ adcs x25,x25,x12
+ mov x27,#-8*8
+ adcs x26,x26,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ //adc x28,xzr,xzr // moved above
+ b Lsqr8x_tail
+
+.align 4
+Lsqr8x_tail_break:
+ ldr x4,[x29,#112] // pull n0
+ add x27,x2,#8*8 // end of current t[num] window
+
+ subs xzr,x30,#1 // "move" top-most carry to carry bit
+ adcs x14,x19,x6
+ adcs x15,x20,x7
+ ldp x19,x20,[x0,#8*0]
+ adcs x21,x21,x8
+ ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
+ adcs x22,x22,x9
+ ldp x8,x9,[x16,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x16,#8*4]
+ adcs x25,x25,x12
+ adcs x26,x26,x13
+ ldp x12,x13,[x16,#8*6]
+ add x1,x16,#8*8
+ adc x30,xzr,xzr // top-most carry
+ mul x28,x4,x19
+ stp x14,x15,[x2,#8*0]
+ stp x21,x22,[x2,#8*2]
+ ldp x21,x22,[x0,#8*2]
+ stp x23,x24,[x2,#8*4]
+ ldp x23,x24,[x0,#8*4]
+ cmp x27,x29 // did we hit the bottom?
+ stp x25,x26,[x2,#8*6]
+ mov x2,x0 // slide the window
+ ldp x25,x26,[x0,#8*6]
+ mov x27,#8
+ b.ne Lsqr8x_reduction
+
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ ldr x0,[x29,#96] // pull rp
+ add x2,x2,#8*8
+ subs x14,x19,x6
+ sbcs x15,x20,x7
+ sub x27,x5,#8*8
+ mov x3,x0 // x0 copy
+
+Lsqr8x_sub:
+ sbcs x16,x21,x8
+ ldp x6,x7,[x1,#8*0]
+ sbcs x17,x22,x9
+ stp x14,x15,[x0,#8*0]
+ sbcs x14,x23,x10
+ ldp x8,x9,[x1,#8*2]
+ sbcs x15,x24,x11
+ stp x16,x17,[x0,#8*2]
+ sbcs x16,x25,x12
+ ldp x10,x11,[x1,#8*4]
+ sbcs x17,x26,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ ldp x19,x20,[x2,#8*0]
+ sub x27,x27,#8*8
+ ldp x21,x22,[x2,#8*2]
+ ldp x23,x24,[x2,#8*4]
+ ldp x25,x26,[x2,#8*6]
+ add x2,x2,#8*8
+ stp x14,x15,[x0,#8*4]
+ sbcs x14,x19,x6
+ stp x16,x17,[x0,#8*6]
+ add x0,x0,#8*8
+ sbcs x15,x20,x7
+ cbnz x27,Lsqr8x_sub
+
+ sbcs x16,x21,x8
+ mov x2,sp
+ add x1,sp,x5
+ ldp x6,x7,[x3,#8*0]
+ sbcs x17,x22,x9
+ stp x14,x15,[x0,#8*0]
+ sbcs x14,x23,x10
+ ldp x8,x9,[x3,#8*2]
+ sbcs x15,x24,x11
+ stp x16,x17,[x0,#8*2]
+ sbcs x16,x25,x12
+ ldp x19,x20,[x1,#8*0]
+ sbcs x17,x26,x13
+ ldp x21,x22,[x1,#8*2]
+ sbcs xzr,x30,xzr // did it borrow?
+ ldr x30,[x29,#8] // pull return address
+ stp x14,x15,[x0,#8*4]
+ stp x16,x17,[x0,#8*6]
+
+ sub x27,x5,#8*4
+Lsqr4x_cond_copy:
+ sub x27,x27,#8*4
+ csel x14,x19,x6,lo
+ stp xzr,xzr,[x2,#8*0]
+ csel x15,x20,x7,lo
+ ldp x6,x7,[x3,#8*4]
+ ldp x19,x20,[x1,#8*4]
+ csel x16,x21,x8,lo
+ stp xzr,xzr,[x2,#8*2]
+ add x2,x2,#8*4
+ csel x17,x22,x9,lo
+ ldp x8,x9,[x3,#8*6]
+ ldp x21,x22,[x1,#8*6]
+ add x1,x1,#8*4
+ stp x14,x15,[x3,#8*0]
+ stp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ stp xzr,xzr,[x1,#8*0]
+ stp xzr,xzr,[x1,#8*2]
+ cbnz x27,Lsqr4x_cond_copy
+
+ csel x14,x19,x6,lo
+ stp xzr,xzr,[x2,#8*0]
+ csel x15,x20,x7,lo
+ stp xzr,xzr,[x2,#8*2]
+ csel x16,x21,x8,lo
+ csel x17,x22,x9,lo
+ stp x14,x15,[x3,#8*0]
+ stp x16,x17,[x3,#8*2]
+
+ b Lsqr8x_done
+
+.align 4
+Lsqr8x8_post_condition:
+ adc x28,xzr,xzr
+ ldr x30,[x29,#8] // pull return address
+ // x19-7,x28 hold result, x6-7 hold modulus
+ subs x6,x19,x6
+ ldr x1,[x29,#96] // pull rp
+ sbcs x7,x20,x7
+ stp xzr,xzr,[sp,#8*0]
+ sbcs x8,x21,x8
+ stp xzr,xzr,[sp,#8*2]
+ sbcs x9,x22,x9
+ stp xzr,xzr,[sp,#8*4]
+ sbcs x10,x23,x10
+ stp xzr,xzr,[sp,#8*6]
+ sbcs x11,x24,x11
+ stp xzr,xzr,[sp,#8*8]
+ sbcs x12,x25,x12
+ stp xzr,xzr,[sp,#8*10]
+ sbcs x13,x26,x13
+ stp xzr,xzr,[sp,#8*12]
+ sbcs x28,x28,xzr // did it borrow?
+ stp xzr,xzr,[sp,#8*14]
+
+ // x6-7 hold result-modulus
+ csel x6,x19,x6,lo
+ csel x7,x20,x7,lo
+ csel x8,x21,x8,lo
+ csel x9,x22,x9,lo
+ stp x6,x7,[x1,#8*0]
+ csel x10,x23,x10,lo
+ csel x11,x24,x11,lo
+ stp x8,x9,[x1,#8*2]
+ csel x12,x25,x12,lo
+ csel x13,x26,x13,lo
+ stp x10,x11,[x1,#8*4]
+ stp x12,x13,[x1,#8*6]
+
+Lsqr8x_done:
+ ldp x19,x20,[x29,#16]
+ mov sp,x29
+ ldp x21,x22,[x29,#32]
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldr x29,[sp],#128
+ // x30 is popped earlier
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.align 5
+__bn_mul4x_mont:
+ // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
+ // only from bn_mul_mont or __bn_mul8x_mont which have already signed the
+ // return address.
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+ sub x26,sp,x5,lsl#3
+ lsl x5,x5,#3
+ ldr x4,[x4] // *n0
+ sub sp,x26,#8*4 // alloca
+
+ add x10,x2,x5
+ add x27,x1,x5
+ stp x0,x10,[x29,#96] // offload rp and &b[num]
+
+ ldr x24,[x2,#8*0] // b[0]
+ ldp x6,x7,[x1,#8*0] // a[0..3]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ mov x19,xzr
+ mov x20,xzr
+ mov x21,xzr
+ mov x22,xzr
+ ldp x14,x15,[x3,#8*0] // n[0..3]
+ ldp x16,x17,[x3,#8*2]
+ adds x3,x3,#8*4 // clear carry bit
+ mov x0,xzr
+ mov x28,#0
+ mov x26,sp
+
+Loop_mul4x_1st_reduction:
+ mul x10,x6,x24 // lo(a[0..3]*b[0])
+ adc x0,x0,xzr // modulo-scheduled
+ mul x11,x7,x24
+ add x28,x28,#8
+ mul x12,x8,x24
+ and x28,x28,#31
+ mul x13,x9,x24
+ adds x19,x19,x10
+ umulh x10,x6,x24 // hi(a[0..3]*b[0])
+ adcs x20,x20,x11
+ mul x25,x19,x4 // t[0]*n0
+ adcs x21,x21,x12
+ umulh x11,x7,x24
+ adcs x22,x22,x13
+ umulh x12,x8,x24
+ adc x23,xzr,xzr
+ umulh x13,x9,x24
+ ldr x24,[x2,x28] // next b[i] (or b[0])
+ adds x20,x20,x10
+ // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
+ str x25,[x26],#8 // put aside t[0]*n0 for tail processing
+ adcs x21,x21,x11
+ mul x11,x15,x25
+ adcs x22,x22,x12
+ mul x12,x16,x25
+ adc x23,x23,x13 // can't overflow
+ mul x13,x17,x25
+ // (*) adds xzr,x19,x10
+ subs xzr,x19,#1 // (*)
+ umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
+ adcs x19,x20,x11
+ umulh x11,x15,x25
+ adcs x20,x21,x12
+ umulh x12,x16,x25
+ adcs x21,x22,x13
+ umulh x13,x17,x25
+ adcs x22,x23,x0
+ adc x0,xzr,xzr
+ adds x19,x19,x10
+ sub x10,x27,x1
+ adcs x20,x20,x11
+ adcs x21,x21,x12
+ adcs x22,x22,x13
+ //adc x0,x0,xzr
+ cbnz x28,Loop_mul4x_1st_reduction
+
+ cbz x10,Lmul4x4_post_condition
+
+ ldp x6,x7,[x1,#8*0] // a[4..7]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ ldr x25,[sp] // a[0]*n0
+ ldp x14,x15,[x3,#8*0] // n[4..7]
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+
+Loop_mul4x_1st_tail:
+ mul x10,x6,x24 // lo(a[4..7]*b[i])
+ adc x0,x0,xzr // modulo-scheduled
+ mul x11,x7,x24
+ add x28,x28,#8
+ mul x12,x8,x24
+ and x28,x28,#31
+ mul x13,x9,x24
+ adds x19,x19,x10
+ umulh x10,x6,x24 // hi(a[4..7]*b[i])
+ adcs x20,x20,x11
+ umulh x11,x7,x24
+ adcs x21,x21,x12
+ umulh x12,x8,x24
+ adcs x22,x22,x13
+ umulh x13,x9,x24
+ adc x23,xzr,xzr
+ ldr x24,[x2,x28] // next b[i] (or b[0])
+ adds x20,x20,x10
+ mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
+ adcs x21,x21,x11
+ mul x11,x15,x25
+ adcs x22,x22,x12
+ mul x12,x16,x25
+ adc x23,x23,x13 // can't overflow
+ mul x13,x17,x25
+ adds x19,x19,x10
+ umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
+ adcs x20,x20,x11
+ umulh x11,x15,x25
+ adcs x21,x21,x12
+ umulh x12,x16,x25
+ adcs x22,x22,x13
+ adcs x23,x23,x0
+ umulh x13,x17,x25
+ adc x0,xzr,xzr
+ ldr x25,[sp,x28] // next t[0]*n0
+ str x19,[x26],#8 // result!!!
+ adds x19,x20,x10
+ sub x10,x27,x1 // done yet?
+ adcs x20,x21,x11
+ adcs x21,x22,x12
+ adcs x22,x23,x13
+ //adc x0,x0,xzr
+ cbnz x28,Loop_mul4x_1st_tail
+
+ sub x11,x27,x5 // rewinded x1
+ cbz x10,Lmul4x_proceed
+
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ ldp x14,x15,[x3,#8*0]
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ b Loop_mul4x_1st_tail
+
+.align 5
+Lmul4x_proceed:
+ ldr x24,[x2,#8*4]! // *++b
+ adc x30,x0,xzr
+ ldp x6,x7,[x11,#8*0] // a[0..3]
+ sub x3,x3,x5 // rewind np
+ ldp x8,x9,[x11,#8*2]
+ add x1,x11,#8*4
+
+ stp x19,x20,[x26,#8*0] // result!!!
+ ldp x19,x20,[sp,#8*4] // t[0..3]
+ stp x21,x22,[x26,#8*2] // result!!!
+ ldp x21,x22,[sp,#8*6]
+
+ ldp x14,x15,[x3,#8*0] // n[0..3]
+ mov x26,sp
+ ldp x16,x17,[x3,#8*2]
+ adds x3,x3,#8*4 // clear carry bit
+ mov x0,xzr
+
+.align 4
+Loop_mul4x_reduction:
+ mul x10,x6,x24 // lo(a[0..3]*b[4])
+ adc x0,x0,xzr // modulo-scheduled
+ mul x11,x7,x24
+ add x28,x28,#8
+ mul x12,x8,x24
+ and x28,x28,#31
+ mul x13,x9,x24
+ adds x19,x19,x10
+ umulh x10,x6,x24 // hi(a[0..3]*b[4])
+ adcs x20,x20,x11
+ mul x25,x19,x4 // t[0]*n0
+ adcs x21,x21,x12
+ umulh x11,x7,x24
+ adcs x22,x22,x13
+ umulh x12,x8,x24
+ adc x23,xzr,xzr
+ umulh x13,x9,x24
+ ldr x24,[x2,x28] // next b[i]
+ adds x20,x20,x10
+ // (*) mul x10,x14,x25
+ str x25,[x26],#8 // put aside t[0]*n0 for tail processing
+ adcs x21,x21,x11
+ mul x11,x15,x25 // lo(n[0..3]*t[0]*n0
+ adcs x22,x22,x12
+ mul x12,x16,x25
+ adc x23,x23,x13 // can't overflow
+ mul x13,x17,x25
+ // (*) adds xzr,x19,x10
+ subs xzr,x19,#1 // (*)
+ umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0
+ adcs x19,x20,x11
+ umulh x11,x15,x25
+ adcs x20,x21,x12
+ umulh x12,x16,x25
+ adcs x21,x22,x13
+ umulh x13,x17,x25
+ adcs x22,x23,x0
+ adc x0,xzr,xzr
+ adds x19,x19,x10
+ adcs x20,x20,x11
+ adcs x21,x21,x12
+ adcs x22,x22,x13
+ //adc x0,x0,xzr
+ cbnz x28,Loop_mul4x_reduction
+
+ adc x0,x0,xzr
+ ldp x10,x11,[x26,#8*4] // t[4..7]
+ ldp x12,x13,[x26,#8*6]
+ ldp x6,x7,[x1,#8*0] // a[4..7]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ adds x19,x19,x10
+ adcs x20,x20,x11
+ adcs x21,x21,x12
+ adcs x22,x22,x13
+ //adc x0,x0,xzr
+
+ ldr x25,[sp] // t[0]*n0
+ ldp x14,x15,[x3,#8*0] // n[4..7]
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+
+.align 4
+Loop_mul4x_tail:
+ mul x10,x6,x24 // lo(a[4..7]*b[4])
+ adc x0,x0,xzr // modulo-scheduled
+ mul x11,x7,x24
+ add x28,x28,#8
+ mul x12,x8,x24
+ and x28,x28,#31
+ mul x13,x9,x24
+ adds x19,x19,x10
+ umulh x10,x6,x24 // hi(a[4..7]*b[4])
+ adcs x20,x20,x11
+ umulh x11,x7,x24
+ adcs x21,x21,x12
+ umulh x12,x8,x24
+ adcs x22,x22,x13
+ umulh x13,x9,x24
+ adc x23,xzr,xzr
+ ldr x24,[x2,x28] // next b[i]
+ adds x20,x20,x10
+ mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
+ adcs x21,x21,x11
+ mul x11,x15,x25
+ adcs x22,x22,x12
+ mul x12,x16,x25
+ adc x23,x23,x13 // can't overflow
+ mul x13,x17,x25
+ adds x19,x19,x10
+ umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
+ adcs x20,x20,x11
+ umulh x11,x15,x25
+ adcs x21,x21,x12
+ umulh x12,x16,x25
+ adcs x22,x22,x13
+ umulh x13,x17,x25
+ adcs x23,x23,x0
+ ldr x25,[sp,x28] // next a[0]*n0
+ adc x0,xzr,xzr
+ str x19,[x26],#8 // result!!!
+ adds x19,x20,x10
+ sub x10,x27,x1 // done yet?
+ adcs x20,x21,x11
+ adcs x21,x22,x12
+ adcs x22,x23,x13
+ //adc x0,x0,xzr
+ cbnz x28,Loop_mul4x_tail
+
+ sub x11,x3,x5 // rewinded np?
+ adc x0,x0,xzr
+ cbz x10,Loop_mul4x_break
+
+ ldp x10,x11,[x26,#8*4]
+ ldp x12,x13,[x26,#8*6]
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ adds x19,x19,x10
+ adcs x20,x20,x11
+ adcs x21,x21,x12
+ adcs x22,x22,x13
+ //adc x0,x0,xzr
+ ldp x14,x15,[x3,#8*0]
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ b Loop_mul4x_tail
+
+.align 4
+Loop_mul4x_break:
+ ldp x12,x13,[x29,#96] // pull rp and &b[num]
+ adds x19,x19,x30
+ add x2,x2,#8*4 // bp++
+ adcs x20,x20,xzr
+ sub x1,x1,x5 // rewind ap
+ adcs x21,x21,xzr
+ stp x19,x20,[x26,#8*0] // result!!!
+ adcs x22,x22,xzr
+ ldp x19,x20,[sp,#8*4] // t[0..3]
+ adc x30,x0,xzr
+ stp x21,x22,[x26,#8*2] // result!!!
+ cmp x2,x13 // done yet?
+ ldp x21,x22,[sp,#8*6]
+ ldp x14,x15,[x11,#8*0] // n[0..3]
+ ldp x16,x17,[x11,#8*2]
+ add x3,x11,#8*4
+ b.eq Lmul4x_post
+
+ ldr x24,[x2]
+ ldp x6,x7,[x1,#8*0] // a[0..3]
+ ldp x8,x9,[x1,#8*2]
+ adds x1,x1,#8*4 // clear carry bit
+ mov x0,xzr
+ mov x26,sp
+ b Loop_mul4x_reduction
+
+.align 4
+Lmul4x_post:
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ mov x0,x12
+ mov x27,x12 // x0 copy
+ subs x10,x19,x14
+ add x26,sp,#8*8
+ sbcs x11,x20,x15
+ sub x28,x5,#8*4
+
+Lmul4x_sub:
+ sbcs x12,x21,x16
+ ldp x14,x15,[x3,#8*0]
+ sub x28,x28,#8*4
+ ldp x19,x20,[x26,#8*0]
+ sbcs x13,x22,x17
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ ldp x21,x22,[x26,#8*2]
+ add x26,x26,#8*4
+ stp x10,x11,[x0,#8*0]
+ sbcs x10,x19,x14
+ stp x12,x13,[x0,#8*2]
+ add x0,x0,#8*4
+ sbcs x11,x20,x15
+ cbnz x28,Lmul4x_sub
+
+ sbcs x12,x21,x16
+ mov x26,sp
+ add x1,sp,#8*4
+ ldp x6,x7,[x27,#8*0]
+ sbcs x13,x22,x17
+ stp x10,x11,[x0,#8*0]
+ ldp x8,x9,[x27,#8*2]
+ stp x12,x13,[x0,#8*2]
+ ldp x19,x20,[x1,#8*0]
+ ldp x21,x22,[x1,#8*2]
+ sbcs xzr,x30,xzr // did it borrow?
+ ldr x30,[x29,#8] // pull return address
+
+ sub x28,x5,#8*4
+Lmul4x_cond_copy:
+ sub x28,x28,#8*4
+ csel x10,x19,x6,lo
+ stp xzr,xzr,[x26,#8*0]
+ csel x11,x20,x7,lo
+ ldp x6,x7,[x27,#8*4]
+ ldp x19,x20,[x1,#8*4]
+ csel x12,x21,x8,lo
+ stp xzr,xzr,[x26,#8*2]
+ add x26,x26,#8*4
+ csel x13,x22,x9,lo
+ ldp x8,x9,[x27,#8*6]
+ ldp x21,x22,[x1,#8*6]
+ add x1,x1,#8*4
+ stp x10,x11,[x27,#8*0]
+ stp x12,x13,[x27,#8*2]
+ add x27,x27,#8*4
+ cbnz x28,Lmul4x_cond_copy
+
+ csel x10,x19,x6,lo
+ stp xzr,xzr,[x26,#8*0]
+ csel x11,x20,x7,lo
+ stp xzr,xzr,[x26,#8*2]
+ csel x12,x21,x8,lo
+ stp xzr,xzr,[x26,#8*3]
+ csel x13,x22,x9,lo
+ stp xzr,xzr,[x26,#8*4]
+ stp x10,x11,[x27,#8*0]
+ stp x12,x13,[x27,#8*2]
+
+ b Lmul4x_done
+
+.align 4
+Lmul4x4_post_condition:
+ adc x0,x0,xzr
+ ldr x1,[x29,#96] // pull rp
+ // x19-3,x0 hold result, x14-7 hold modulus
+ subs x6,x19,x14
+ ldr x30,[x29,#8] // pull return address
+ sbcs x7,x20,x15
+ stp xzr,xzr,[sp,#8*0]
+ sbcs x8,x21,x16
+ stp xzr,xzr,[sp,#8*2]
+ sbcs x9,x22,x17
+ stp xzr,xzr,[sp,#8*4]
+ sbcs xzr,x0,xzr // did it borrow?
+ stp xzr,xzr,[sp,#8*6]
+
+ // x6-3 hold result-modulus
+ csel x6,x19,x6,lo
+ csel x7,x20,x7,lo
+ csel x8,x21,x8,lo
+ csel x9,x22,x9,lo
+ stp x6,x7,[x1,#8*0]
+ stp x8,x9,[x1,#8*2]
+
+Lmul4x_done:
+ ldp x19,x20,[x29,#16]
+ mov sp,x29
+ ldp x21,x22,[x29,#32]
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldr x29,[sp],#128
+ // x30 is popped earlier
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 4
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/bcm/armv8-mont-linux.S b/gen/bcm/armv8-mont-linux.S
new file mode 100644
index 0000000..13f045c
--- /dev/null
+++ b/gen/bcm/armv8-mont-linux.S
@@ -0,0 +1,1425 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl bn_mul_mont
+.hidden bn_mul_mont
+.type bn_mul_mont,%function
+.align 5
+bn_mul_mont:
+ AARCH64_SIGN_LINK_REGISTER
+ tst x5,#7
+ b.eq __bn_sqr8x_mont
+ tst x5,#3
+ b.eq __bn_mul4x_mont
+.Lmul_mont:
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+
+ ldr x9,[x2],#8 // bp[0]
+ sub x22,sp,x5,lsl#3
+ ldp x7,x8,[x1],#16 // ap[0..1]
+ lsl x5,x5,#3
+ ldr x4,[x4] // *n0
+ and x22,x22,#-16 // ABI says so
+ ldp x13,x14,[x3],#16 // np[0..1]
+
+ mul x6,x7,x9 // ap[0]*bp[0]
+ sub x21,x5,#16 // j=num-2
+ umulh x7,x7,x9
+ mul x10,x8,x9 // ap[1]*bp[0]
+ umulh x11,x8,x9
+
+ mul x15,x6,x4 // "tp[0]"*n0
+ mov sp,x22 // alloca
+
+ // (*) mul x12,x13,x15 // np[0]*m1
+ umulh x13,x13,x15
+ mul x16,x14,x15 // np[1]*m1
+ // (*) adds x12,x12,x6 // discarded
+ // (*) As for removal of first multiplication and addition
+ // instructions. The outcome of first addition is
+ // guaranteed to be zero, which leaves two computationally
+ // significant outcomes: it either carries or not. Then
+ // question is when does it carry? Is there alternative
+ // way to deduce it? If you follow operations, you can
+ // observe that condition for carry is quite simple:
+ // x6 being non-zero. So that carry can be calculated
+ // by adding -1 to x6. That's what next instruction does.
+ subs xzr,x6,#1 // (*)
+ umulh x17,x14,x15
+ adc x13,x13,xzr
+ cbz x21,.L1st_skip
+
+.L1st:
+ ldr x8,[x1],#8
+ adds x6,x10,x7
+ sub x21,x21,#8 // j--
+ adc x7,x11,xzr
+
+ ldr x14,[x3],#8
+ adds x12,x16,x13
+ mul x10,x8,x9 // ap[j]*bp[0]
+ adc x13,x17,xzr
+ umulh x11,x8,x9
+
+ adds x12,x12,x6
+ mul x16,x14,x15 // np[j]*m1
+ adc x13,x13,xzr
+ umulh x17,x14,x15
+ str x12,[x22],#8 // tp[j-1]
+ cbnz x21,.L1st
+
+.L1st_skip:
+ adds x6,x10,x7
+ sub x1,x1,x5 // rewind x1
+ adc x7,x11,xzr
+
+ adds x12,x16,x13
+ sub x3,x3,x5 // rewind x3
+ adc x13,x17,xzr
+
+ adds x12,x12,x6
+ sub x20,x5,#8 // i=num-1
+ adcs x13,x13,x7
+
+ adc x19,xzr,xzr // upmost overflow bit
+ stp x12,x13,[x22]
+
+.Louter:
+ ldr x9,[x2],#8 // bp[i]
+ ldp x7,x8,[x1],#16
+ ldr x23,[sp] // tp[0]
+ add x22,sp,#8
+
+ mul x6,x7,x9 // ap[0]*bp[i]
+ sub x21,x5,#16 // j=num-2
+ umulh x7,x7,x9
+ ldp x13,x14,[x3],#16
+ mul x10,x8,x9 // ap[1]*bp[i]
+ adds x6,x6,x23
+ umulh x11,x8,x9
+ adc x7,x7,xzr
+
+ mul x15,x6,x4
+ sub x20,x20,#8 // i--
+
+ // (*) mul x12,x13,x15 // np[0]*m1
+ umulh x13,x13,x15
+ mul x16,x14,x15 // np[1]*m1
+ // (*) adds x12,x12,x6
+ subs xzr,x6,#1 // (*)
+ umulh x17,x14,x15
+ cbz x21,.Linner_skip
+
+.Linner:
+ ldr x8,[x1],#8
+ adc x13,x13,xzr
+ ldr x23,[x22],#8 // tp[j]
+ adds x6,x10,x7
+ sub x21,x21,#8 // j--
+ adc x7,x11,xzr
+
+ adds x12,x16,x13
+ ldr x14,[x3],#8
+ adc x13,x17,xzr
+
+ mul x10,x8,x9 // ap[j]*bp[i]
+ adds x6,x6,x23
+ umulh x11,x8,x9
+ adc x7,x7,xzr
+
+ mul x16,x14,x15 // np[j]*m1
+ adds x12,x12,x6
+ umulh x17,x14,x15
+ str x12,[x22,#-16] // tp[j-1]
+ cbnz x21,.Linner
+
+.Linner_skip:
+ ldr x23,[x22],#8 // tp[j]
+ adc x13,x13,xzr
+ adds x6,x10,x7
+ sub x1,x1,x5 // rewind x1
+ adc x7,x11,xzr
+
+ adds x12,x16,x13
+ sub x3,x3,x5 // rewind x3
+ adcs x13,x17,x19
+ adc x19,xzr,xzr
+
+ adds x6,x6,x23
+ adc x7,x7,xzr
+
+ adds x12,x12,x6
+ adcs x13,x13,x7
+ adc x19,x19,xzr // upmost overflow bit
+ stp x12,x13,[x22,#-16]
+
+ cbnz x20,.Louter
+
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ ldr x23,[sp] // tp[0]
+ add x22,sp,#8
+ ldr x14,[x3],#8 // np[0]
+ subs x21,x5,#8 // j=num-1 and clear borrow
+ mov x1,x0
+.Lsub:
+ sbcs x8,x23,x14 // tp[j]-np[j]
+ ldr x23,[x22],#8
+ sub x21,x21,#8 // j--
+ ldr x14,[x3],#8
+ str x8,[x1],#8 // rp[j]=tp[j]-np[j]
+ cbnz x21,.Lsub
+
+ sbcs x8,x23,x14
+ sbcs x19,x19,xzr // did it borrow?
+ str x8,[x1],#8 // rp[num-1]
+
+ ldr x23,[sp] // tp[0]
+ add x22,sp,#8
+ ldr x8,[x0],#8 // rp[0]
+ sub x5,x5,#8 // num--
+ nop
+.Lcond_copy:
+ sub x5,x5,#8 // num--
+ csel x14,x23,x8,lo // did it borrow?
+ ldr x23,[x22],#8
+ ldr x8,[x0],#8
+ str xzr,[x22,#-16] // wipe tp
+ str x14,[x0,#-16]
+ cbnz x5,.Lcond_copy
+
+ csel x14,x23,x8,lo
+ str xzr,[x22,#-8] // wipe tp
+ str x14,[x0,#-8]
+
+ ldp x19,x20,[x29,#16]
+ mov sp,x29
+ ldp x21,x22,[x29,#32]
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldr x29,[sp],#64
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size bn_mul_mont,.-bn_mul_mont
+.type __bn_sqr8x_mont,%function
+.align 5
+__bn_sqr8x_mont:
+ // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
+ // only from bn_mul_mont which has already signed the return address.
+ cmp x1,x2
+ b.ne __bn_mul4x_mont
+.Lsqr8x_mont:
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ stp x0,x3,[sp,#96] // offload rp and np
+
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ ldp x10,x11,[x1,#8*4]
+ ldp x12,x13,[x1,#8*6]
+
+ sub x2,sp,x5,lsl#4
+ lsl x5,x5,#3
+ ldr x4,[x4] // *n0
+ mov sp,x2 // alloca
+ sub x27,x5,#8*8
+ b .Lsqr8x_zero_start
+
+.Lsqr8x_zero:
+ sub x27,x27,#8*8
+ stp xzr,xzr,[x2,#8*0]
+ stp xzr,xzr,[x2,#8*2]
+ stp xzr,xzr,[x2,#8*4]
+ stp xzr,xzr,[x2,#8*6]
+.Lsqr8x_zero_start:
+ stp xzr,xzr,[x2,#8*8]
+ stp xzr,xzr,[x2,#8*10]
+ stp xzr,xzr,[x2,#8*12]
+ stp xzr,xzr,[x2,#8*14]
+ add x2,x2,#8*16
+ cbnz x27,.Lsqr8x_zero
+
+ add x3,x1,x5
+ add x1,x1,#8*8
+ mov x19,xzr
+ mov x20,xzr
+ mov x21,xzr
+ mov x22,xzr
+ mov x23,xzr
+ mov x24,xzr
+ mov x25,xzr
+ mov x26,xzr
+ mov x2,sp
+ str x4,[x29,#112] // offload n0
+
+ // Multiply everything but a[i]*a[i]
+.align 4
+.Lsqr8x_outer_loop:
+ // a[1]a[0] (i)
+ // a[2]a[0]
+ // a[3]a[0]
+ // a[4]a[0]
+ // a[5]a[0]
+ // a[6]a[0]
+ // a[7]a[0]
+ // a[2]a[1] (ii)
+ // a[3]a[1]
+ // a[4]a[1]
+ // a[5]a[1]
+ // a[6]a[1]
+ // a[7]a[1]
+ // a[3]a[2] (iii)
+ // a[4]a[2]
+ // a[5]a[2]
+ // a[6]a[2]
+ // a[7]a[2]
+ // a[4]a[3] (iv)
+ // a[5]a[3]
+ // a[6]a[3]
+ // a[7]a[3]
+ // a[5]a[4] (v)
+ // a[6]a[4]
+ // a[7]a[4]
+ // a[6]a[5] (vi)
+ // a[7]a[5]
+ // a[7]a[6] (vii)
+
+ mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
+ mul x15,x8,x6
+ mul x16,x9,x6
+ mul x17,x10,x6
+ adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
+ mul x14,x11,x6
+ adcs x21,x21,x15
+ mul x15,x12,x6
+ adcs x22,x22,x16
+ mul x16,x13,x6
+ adcs x23,x23,x17
+ umulh x17,x7,x6 // hi(a[1..7]*a[0])
+ adcs x24,x24,x14
+ umulh x14,x8,x6
+ adcs x25,x25,x15
+ umulh x15,x9,x6
+ adcs x26,x26,x16
+ umulh x16,x10,x6
+ stp x19,x20,[x2],#8*2 // t[0..1]
+ adc x19,xzr,xzr // t[8]
+ adds x21,x21,x17 // t[2]+lo(a[1]*a[0])
+ umulh x17,x11,x6
+ adcs x22,x22,x14
+ umulh x14,x12,x6
+ adcs x23,x23,x15
+ umulh x15,x13,x6
+ adcs x24,x24,x16
+ mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
+ adcs x25,x25,x17
+ mul x17,x9,x7
+ adcs x26,x26,x14
+ mul x14,x10,x7
+ adc x19,x19,x15
+
+ mul x15,x11,x7
+ adds x22,x22,x16
+ mul x16,x12,x7
+ adcs x23,x23,x17
+ mul x17,x13,x7
+ adcs x24,x24,x14
+ umulh x14,x8,x7 // hi(a[2..7]*a[1])
+ adcs x25,x25,x15
+ umulh x15,x9,x7
+ adcs x26,x26,x16
+ umulh x16,x10,x7
+ adcs x19,x19,x17
+ umulh x17,x11,x7
+ stp x21,x22,[x2],#8*2 // t[2..3]
+ adc x20,xzr,xzr // t[9]
+ adds x23,x23,x14
+ umulh x14,x12,x7
+ adcs x24,x24,x15
+ umulh x15,x13,x7
+ adcs x25,x25,x16
+ mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
+ adcs x26,x26,x17
+ mul x17,x10,x8
+ adcs x19,x19,x14
+ mul x14,x11,x8
+ adc x20,x20,x15
+
+ mul x15,x12,x8
+ adds x24,x24,x16
+ mul x16,x13,x8
+ adcs x25,x25,x17
+ umulh x17,x9,x8 // hi(a[3..7]*a[2])
+ adcs x26,x26,x14
+ umulh x14,x10,x8
+ adcs x19,x19,x15
+ umulh x15,x11,x8
+ adcs x20,x20,x16
+ umulh x16,x12,x8
+ stp x23,x24,[x2],#8*2 // t[4..5]
+ adc x21,xzr,xzr // t[10]
+ adds x25,x25,x17
+ umulh x17,x13,x8
+ adcs x26,x26,x14
+ mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
+ adcs x19,x19,x15
+ mul x15,x11,x9
+ adcs x20,x20,x16
+ mul x16,x12,x9
+ adc x21,x21,x17
+
+ mul x17,x13,x9
+ adds x26,x26,x14
+ umulh x14,x10,x9 // hi(a[4..7]*a[3])
+ adcs x19,x19,x15
+ umulh x15,x11,x9
+ adcs x20,x20,x16
+ umulh x16,x12,x9
+ adcs x21,x21,x17
+ umulh x17,x13,x9
+ stp x25,x26,[x2],#8*2 // t[6..7]
+ adc x22,xzr,xzr // t[11]
+ adds x19,x19,x14
+ mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
+ adcs x20,x20,x15
+ mul x15,x12,x10
+ adcs x21,x21,x16
+ mul x16,x13,x10
+ adc x22,x22,x17
+
+ umulh x17,x11,x10 // hi(a[5..7]*a[4])
+ adds x20,x20,x14
+ umulh x14,x12,x10
+ adcs x21,x21,x15
+ umulh x15,x13,x10
+ adcs x22,x22,x16
+ mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
+ adc x23,xzr,xzr // t[12]
+ adds x21,x21,x17
+ mul x17,x13,x11
+ adcs x22,x22,x14
+ umulh x14,x12,x11 // hi(a[6..7]*a[5])
+ adc x23,x23,x15
+
+ umulh x15,x13,x11
+ adds x22,x22,x16
+ mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
+ adcs x23,x23,x17
+ umulh x17,x13,x12 // hi(a[7]*a[6])
+ adc x24,xzr,xzr // t[13]
+ adds x23,x23,x14
+ sub x27,x3,x1 // done yet?
+ adc x24,x24,x15
+
+ adds x24,x24,x16
+ sub x14,x3,x5 // rewinded ap
+ adc x25,xzr,xzr // t[14]
+ add x25,x25,x17
+
+ cbz x27,.Lsqr8x_outer_break
+
+ mov x4,x6
+ ldp x6,x7,[x2,#8*0]
+ ldp x8,x9,[x2,#8*2]
+ ldp x10,x11,[x2,#8*4]
+ ldp x12,x13,[x2,#8*6]
+ adds x19,x19,x6
+ adcs x20,x20,x7
+ ldp x6,x7,[x1,#8*0]
+ adcs x21,x21,x8
+ adcs x22,x22,x9
+ ldp x8,x9,[x1,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x1,#8*4]
+ adcs x25,x25,x12
+ mov x0,x1
+ adcs x26,xzr,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ //adc x28,xzr,xzr // moved below
+ mov x27,#-8*8
+
+ // a[8]a[0]
+ // a[9]a[0]
+ // a[a]a[0]
+ // a[b]a[0]
+ // a[c]a[0]
+ // a[d]a[0]
+ // a[e]a[0]
+ // a[f]a[0]
+ // a[8]a[1]
+ // a[f]a[1]........................
+ // a[8]a[2]
+ // a[f]a[2]........................
+ // a[8]a[3]
+ // a[f]a[3]........................
+ // a[8]a[4]
+ // a[f]a[4]........................
+ // a[8]a[5]
+ // a[f]a[5]........................
+ // a[8]a[6]
+ // a[f]a[6]........................
+ // a[8]a[7]
+ // a[f]a[7]........................
+.Lsqr8x_mul:
+ mul x14,x6,x4
+ adc x28,xzr,xzr // carry bit, modulo-scheduled
+ mul x15,x7,x4
+ add x27,x27,#8
+ mul x16,x8,x4
+ mul x17,x9,x4
+ adds x19,x19,x14
+ mul x14,x10,x4
+ adcs x20,x20,x15
+ mul x15,x11,x4
+ adcs x21,x21,x16
+ mul x16,x12,x4
+ adcs x22,x22,x17
+ mul x17,x13,x4
+ adcs x23,x23,x14
+ umulh x14,x6,x4
+ adcs x24,x24,x15
+ umulh x15,x7,x4
+ adcs x25,x25,x16
+ umulh x16,x8,x4
+ adcs x26,x26,x17
+ umulh x17,x9,x4
+ adc x28,x28,xzr
+ str x19,[x2],#8
+ adds x19,x20,x14
+ umulh x14,x10,x4
+ adcs x20,x21,x15
+ umulh x15,x11,x4
+ adcs x21,x22,x16
+ umulh x16,x12,x4
+ adcs x22,x23,x17
+ umulh x17,x13,x4
+ ldr x4,[x0,x27]
+ adcs x23,x24,x14
+ adcs x24,x25,x15
+ adcs x25,x26,x16
+ adcs x26,x28,x17
+ //adc x28,xzr,xzr // moved above
+ cbnz x27,.Lsqr8x_mul
+ // note that carry flag is guaranteed
+ // to be zero at this point
+ cmp x1,x3 // done yet?
+ b.eq .Lsqr8x_break
+
+ ldp x6,x7,[x2,#8*0]
+ ldp x8,x9,[x2,#8*2]
+ ldp x10,x11,[x2,#8*4]
+ ldp x12,x13,[x2,#8*6]
+ adds x19,x19,x6
+ ldr x4,[x0,#-8*8]
+ adcs x20,x20,x7
+ ldp x6,x7,[x1,#8*0]
+ adcs x21,x21,x8
+ adcs x22,x22,x9
+ ldp x8,x9,[x1,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x1,#8*4]
+ adcs x25,x25,x12
+ mov x27,#-8*8
+ adcs x26,x26,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ //adc x28,xzr,xzr // moved above
+ b .Lsqr8x_mul
+
+.align 4
+.Lsqr8x_break:
+ ldp x6,x7,[x0,#8*0]
+ add x1,x0,#8*8
+ ldp x8,x9,[x0,#8*2]
+ sub x14,x3,x1 // is it last iteration?
+ ldp x10,x11,[x0,#8*4]
+ sub x15,x2,x14
+ ldp x12,x13,[x0,#8*6]
+ cbz x14,.Lsqr8x_outer_loop
+
+ stp x19,x20,[x2,#8*0]
+ ldp x19,x20,[x15,#8*0]
+ stp x21,x22,[x2,#8*2]
+ ldp x21,x22,[x15,#8*2]
+ stp x23,x24,[x2,#8*4]
+ ldp x23,x24,[x15,#8*4]
+ stp x25,x26,[x2,#8*6]
+ mov x2,x15
+ ldp x25,x26,[x15,#8*6]
+ b .Lsqr8x_outer_loop
+
+.align 4
+.Lsqr8x_outer_break:
+ // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
+ ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
+ ldp x15,x16,[sp,#8*1]
+ ldp x11,x13,[x14,#8*2]
+ add x1,x14,#8*4
+ ldp x17,x14,[sp,#8*3]
+
+ stp x19,x20,[x2,#8*0]
+ mul x19,x7,x7
+ stp x21,x22,[x2,#8*2]
+ umulh x7,x7,x7
+ stp x23,x24,[x2,#8*4]
+ mul x8,x9,x9
+ stp x25,x26,[x2,#8*6]
+ mov x2,sp
+ umulh x9,x9,x9
+ adds x20,x7,x15,lsl#1
+ extr x15,x16,x15,#63
+ sub x27,x5,#8*4
+
+.Lsqr4x_shift_n_add:
+ adcs x21,x8,x15
+ extr x16,x17,x16,#63
+ sub x27,x27,#8*4
+ adcs x22,x9,x16
+ ldp x15,x16,[x2,#8*5]
+ mul x10,x11,x11
+ ldp x7,x9,[x1],#8*2
+ umulh x11,x11,x11
+ mul x12,x13,x13
+ umulh x13,x13,x13
+ extr x17,x14,x17,#63
+ stp x19,x20,[x2,#8*0]
+ adcs x23,x10,x17
+ extr x14,x15,x14,#63
+ stp x21,x22,[x2,#8*2]
+ adcs x24,x11,x14
+ ldp x17,x14,[x2,#8*7]
+ extr x15,x16,x15,#63
+ adcs x25,x12,x15
+ extr x16,x17,x16,#63
+ adcs x26,x13,x16
+ ldp x15,x16,[x2,#8*9]
+ mul x6,x7,x7
+ ldp x11,x13,[x1],#8*2
+ umulh x7,x7,x7
+ mul x8,x9,x9
+ umulh x9,x9,x9
+ stp x23,x24,[x2,#8*4]
+ extr x17,x14,x17,#63
+ stp x25,x26,[x2,#8*6]
+ add x2,x2,#8*8
+ adcs x19,x6,x17
+ extr x14,x15,x14,#63
+ adcs x20,x7,x14
+ ldp x17,x14,[x2,#8*3]
+ extr x15,x16,x15,#63
+ cbnz x27,.Lsqr4x_shift_n_add
+ ldp x1,x4,[x29,#104] // pull np and n0
+
+ adcs x21,x8,x15
+ extr x16,x17,x16,#63
+ adcs x22,x9,x16
+ ldp x15,x16,[x2,#8*5]
+ mul x10,x11,x11
+ umulh x11,x11,x11
+ stp x19,x20,[x2,#8*0]
+ mul x12,x13,x13
+ umulh x13,x13,x13
+ stp x21,x22,[x2,#8*2]
+ extr x17,x14,x17,#63
+ adcs x23,x10,x17
+ extr x14,x15,x14,#63
+ ldp x19,x20,[sp,#8*0]
+ adcs x24,x11,x14
+ extr x15,x16,x15,#63
+ ldp x6,x7,[x1,#8*0]
+ adcs x25,x12,x15
+ extr x16,xzr,x16,#63
+ ldp x8,x9,[x1,#8*2]
+ adc x26,x13,x16
+ ldp x10,x11,[x1,#8*4]
+
+ // Reduce by 512 bits per iteration
+ mul x28,x4,x19 // t[0]*n0
+ ldp x12,x13,[x1,#8*6]
+ add x3,x1,x5
+ ldp x21,x22,[sp,#8*2]
+ stp x23,x24,[x2,#8*4]
+ ldp x23,x24,[sp,#8*4]
+ stp x25,x26,[x2,#8*6]
+ ldp x25,x26,[sp,#8*6]
+ add x1,x1,#8*8
+ mov x30,xzr // initial top-most carry
+ mov x2,sp
+ mov x27,#8
+
+.Lsqr8x_reduction:
+ // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
+ mul x15,x7,x28
+ sub x27,x27,#1
+ mul x16,x8,x28
+ str x28,[x2],#8 // put aside t[0]*n0 for tail processing
+ mul x17,x9,x28
+ // (*) adds xzr,x19,x14
+ subs xzr,x19,#1 // (*)
+ mul x14,x10,x28
+ adcs x19,x20,x15
+ mul x15,x11,x28
+ adcs x20,x21,x16
+ mul x16,x12,x28
+ adcs x21,x22,x17
+ mul x17,x13,x28
+ adcs x22,x23,x14
+ umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
+ adcs x23,x24,x15
+ umulh x15,x7,x28
+ adcs x24,x25,x16
+ umulh x16,x8,x28
+ adcs x25,x26,x17
+ umulh x17,x9,x28
+ adc x26,xzr,xzr
+ adds x19,x19,x14
+ umulh x14,x10,x28
+ adcs x20,x20,x15
+ umulh x15,x11,x28
+ adcs x21,x21,x16
+ umulh x16,x12,x28
+ adcs x22,x22,x17
+ umulh x17,x13,x28
+ mul x28,x4,x19 // next t[0]*n0
+ adcs x23,x23,x14
+ adcs x24,x24,x15
+ adcs x25,x25,x16
+ adc x26,x26,x17
+ cbnz x27,.Lsqr8x_reduction
+
+ ldp x14,x15,[x2,#8*0]
+ ldp x16,x17,[x2,#8*2]
+ mov x0,x2
+ sub x27,x3,x1 // done yet?
+ adds x19,x19,x14
+ adcs x20,x20,x15
+ ldp x14,x15,[x2,#8*4]
+ adcs x21,x21,x16
+ adcs x22,x22,x17
+ ldp x16,x17,[x2,#8*6]
+ adcs x23,x23,x14
+ adcs x24,x24,x15
+ adcs x25,x25,x16
+ adcs x26,x26,x17
+ //adc x28,xzr,xzr // moved below
+ cbz x27,.Lsqr8x8_post_condition
+
+ ldr x4,[x2,#-8*8]
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ ldp x10,x11,[x1,#8*4]
+ mov x27,#-8*8
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+
+.Lsqr8x_tail:
+ mul x14,x6,x4
+ adc x28,xzr,xzr // carry bit, modulo-scheduled
+ mul x15,x7,x4
+ add x27,x27,#8
+ mul x16,x8,x4
+ mul x17,x9,x4
+ adds x19,x19,x14
+ mul x14,x10,x4
+ adcs x20,x20,x15
+ mul x15,x11,x4
+ adcs x21,x21,x16
+ mul x16,x12,x4
+ adcs x22,x22,x17
+ mul x17,x13,x4
+ adcs x23,x23,x14
+ umulh x14,x6,x4
+ adcs x24,x24,x15
+ umulh x15,x7,x4
+ adcs x25,x25,x16
+ umulh x16,x8,x4
+ adcs x26,x26,x17
+ umulh x17,x9,x4
+ adc x28,x28,xzr
+ str x19,[x2],#8
+ adds x19,x20,x14
+ umulh x14,x10,x4
+ adcs x20,x21,x15
+ umulh x15,x11,x4
+ adcs x21,x22,x16
+ umulh x16,x12,x4
+ adcs x22,x23,x17
+ umulh x17,x13,x4
+ ldr x4,[x0,x27]
+ adcs x23,x24,x14
+ adcs x24,x25,x15
+ adcs x25,x26,x16
+ adcs x26,x28,x17
+ //adc x28,xzr,xzr // moved above
+ cbnz x27,.Lsqr8x_tail
+ // note that carry flag is guaranteed
+ // to be zero at this point
+ ldp x6,x7,[x2,#8*0]
+ sub x27,x3,x1 // done yet?
+ sub x16,x3,x5 // rewinded np
+ ldp x8,x9,[x2,#8*2]
+ ldp x10,x11,[x2,#8*4]
+ ldp x12,x13,[x2,#8*6]
+ cbz x27,.Lsqr8x_tail_break
+
+ ldr x4,[x0,#-8*8]
+ adds x19,x19,x6
+ adcs x20,x20,x7
+ ldp x6,x7,[x1,#8*0]
+ adcs x21,x21,x8
+ adcs x22,x22,x9
+ ldp x8,x9,[x1,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x1,#8*4]
+ adcs x25,x25,x12
+ mov x27,#-8*8
+ adcs x26,x26,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ //adc x28,xzr,xzr // moved above
+ b .Lsqr8x_tail
+
+.align 4
+.Lsqr8x_tail_break:
+ ldr x4,[x29,#112] // pull n0
+ add x27,x2,#8*8 // end of current t[num] window
+
+ subs xzr,x30,#1 // "move" top-most carry to carry bit
+ adcs x14,x19,x6
+ adcs x15,x20,x7
+ ldp x19,x20,[x0,#8*0]
+ adcs x21,x21,x8
+ ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
+ adcs x22,x22,x9
+ ldp x8,x9,[x16,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x16,#8*4]
+ adcs x25,x25,x12
+ adcs x26,x26,x13
+ ldp x12,x13,[x16,#8*6]
+ add x1,x16,#8*8
+ adc x30,xzr,xzr // top-most carry
+ mul x28,x4,x19
+ stp x14,x15,[x2,#8*0]
+ stp x21,x22,[x2,#8*2]
+ ldp x21,x22,[x0,#8*2]
+ stp x23,x24,[x2,#8*4]
+ ldp x23,x24,[x0,#8*4]
+ cmp x27,x29 // did we hit the bottom?
+ stp x25,x26,[x2,#8*6]
+ mov x2,x0 // slide the window
+ ldp x25,x26,[x0,#8*6]
+ mov x27,#8
+ b.ne .Lsqr8x_reduction
+
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ ldr x0,[x29,#96] // pull rp
+ add x2,x2,#8*8
+ subs x14,x19,x6
+ sbcs x15,x20,x7
+ sub x27,x5,#8*8
+ mov x3,x0 // x0 copy
+
+.Lsqr8x_sub:
+ sbcs x16,x21,x8
+ ldp x6,x7,[x1,#8*0]
+ sbcs x17,x22,x9
+ stp x14,x15,[x0,#8*0]
+ sbcs x14,x23,x10
+ ldp x8,x9,[x1,#8*2]
+ sbcs x15,x24,x11
+ stp x16,x17,[x0,#8*2]
+ sbcs x16,x25,x12
+ ldp x10,x11,[x1,#8*4]
+ sbcs x17,x26,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ ldp x19,x20,[x2,#8*0]
+ sub x27,x27,#8*8
+ ldp x21,x22,[x2,#8*2]
+ ldp x23,x24,[x2,#8*4]
+ ldp x25,x26,[x2,#8*6]
+ add x2,x2,#8*8
+ stp x14,x15,[x0,#8*4]
+ sbcs x14,x19,x6
+ stp x16,x17,[x0,#8*6]
+ add x0,x0,#8*8
+ sbcs x15,x20,x7
+ cbnz x27,.Lsqr8x_sub
+
+ sbcs x16,x21,x8
+ mov x2,sp
+ add x1,sp,x5
+ ldp x6,x7,[x3,#8*0]
+ sbcs x17,x22,x9
+ stp x14,x15,[x0,#8*0]
+ sbcs x14,x23,x10
+ ldp x8,x9,[x3,#8*2]
+ sbcs x15,x24,x11
+ stp x16,x17,[x0,#8*2]
+ sbcs x16,x25,x12
+ ldp x19,x20,[x1,#8*0]
+ sbcs x17,x26,x13
+ ldp x21,x22,[x1,#8*2]
+ sbcs xzr,x30,xzr // did it borrow?
+ ldr x30,[x29,#8] // pull return address
+ stp x14,x15,[x0,#8*4]
+ stp x16,x17,[x0,#8*6]
+
+ sub x27,x5,#8*4
+.Lsqr4x_cond_copy:
+ sub x27,x27,#8*4
+ csel x14,x19,x6,lo
+ stp xzr,xzr,[x2,#8*0]
+ csel x15,x20,x7,lo
+ ldp x6,x7,[x3,#8*4]
+ ldp x19,x20,[x1,#8*4]
+ csel x16,x21,x8,lo
+ stp xzr,xzr,[x2,#8*2]
+ add x2,x2,#8*4
+ csel x17,x22,x9,lo
+ ldp x8,x9,[x3,#8*6]
+ ldp x21,x22,[x1,#8*6]
+ add x1,x1,#8*4
+ stp x14,x15,[x3,#8*0]
+ stp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ stp xzr,xzr,[x1,#8*0]
+ stp xzr,xzr,[x1,#8*2]
+ cbnz x27,.Lsqr4x_cond_copy
+
+ csel x14,x19,x6,lo
+ stp xzr,xzr,[x2,#8*0]
+ csel x15,x20,x7,lo
+ stp xzr,xzr,[x2,#8*2]
+ csel x16,x21,x8,lo
+ csel x17,x22,x9,lo
+ stp x14,x15,[x3,#8*0]
+ stp x16,x17,[x3,#8*2]
+
+ b .Lsqr8x_done
+
+.align 4
+.Lsqr8x8_post_condition:
+ adc x28,xzr,xzr
+ ldr x30,[x29,#8] // pull return address
+ // x19-7,x28 hold result, x6-7 hold modulus
+ subs x6,x19,x6
+ ldr x1,[x29,#96] // pull rp
+ sbcs x7,x20,x7
+ stp xzr,xzr,[sp,#8*0]
+ sbcs x8,x21,x8
+ stp xzr,xzr,[sp,#8*2]
+ sbcs x9,x22,x9
+ stp xzr,xzr,[sp,#8*4]
+ sbcs x10,x23,x10
+ stp xzr,xzr,[sp,#8*6]
+ sbcs x11,x24,x11
+ stp xzr,xzr,[sp,#8*8]
+ sbcs x12,x25,x12
+ stp xzr,xzr,[sp,#8*10]
+ sbcs x13,x26,x13
+ stp xzr,xzr,[sp,#8*12]
+ sbcs x28,x28,xzr // did it borrow?
+ stp xzr,xzr,[sp,#8*14]
+
+ // x6-7 hold result-modulus
+ csel x6,x19,x6,lo
+ csel x7,x20,x7,lo
+ csel x8,x21,x8,lo
+ csel x9,x22,x9,lo
+ stp x6,x7,[x1,#8*0]
+ csel x10,x23,x10,lo
+ csel x11,x24,x11,lo
+ stp x8,x9,[x1,#8*2]
+ csel x12,x25,x12,lo
+ csel x13,x26,x13,lo
+ stp x10,x11,[x1,#8*4]
+ stp x12,x13,[x1,#8*6]
+
+.Lsqr8x_done:
+ ldp x19,x20,[x29,#16]
+ mov sp,x29
+ ldp x21,x22,[x29,#32]
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldr x29,[sp],#128
+ // x30 is popped earlier
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
+.type __bn_mul4x_mont,%function
+.align 5
+__bn_mul4x_mont:
+ // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
+ // only from bn_mul_mont or __bn_mul8x_mont which have already signed the
+ // return address.
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+ sub x26,sp,x5,lsl#3
+ lsl x5,x5,#3
+ ldr x4,[x4] // *n0
+ sub sp,x26,#8*4 // alloca
+
+ add x10,x2,x5
+ add x27,x1,x5
+ stp x0,x10,[x29,#96] // offload rp and &b[num]
+
+ ldr x24,[x2,#8*0] // b[0]
+ ldp x6,x7,[x1,#8*0] // a[0..3]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ mov x19,xzr
+ mov x20,xzr
+ mov x21,xzr
+ mov x22,xzr
+ ldp x14,x15,[x3,#8*0] // n[0..3]
+ ldp x16,x17,[x3,#8*2]
+ adds x3,x3,#8*4 // clear carry bit
+ mov x0,xzr
+ mov x28,#0
+ mov x26,sp
+
+.Loop_mul4x_1st_reduction:
+ mul x10,x6,x24 // lo(a[0..3]*b[0])
+ adc x0,x0,xzr // modulo-scheduled
+ mul x11,x7,x24
+ add x28,x28,#8
+ mul x12,x8,x24
+ and x28,x28,#31
+ mul x13,x9,x24
+ adds x19,x19,x10
+ umulh x10,x6,x24 // hi(a[0..3]*b[0])
+ adcs x20,x20,x11
+ mul x25,x19,x4 // t[0]*n0
+ adcs x21,x21,x12
+ umulh x11,x7,x24
+ adcs x22,x22,x13
+ umulh x12,x8,x24
+ adc x23,xzr,xzr
+ umulh x13,x9,x24
+ ldr x24,[x2,x28] // next b[i] (or b[0])
+ adds x20,x20,x10
+ // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
+ str x25,[x26],#8 // put aside t[0]*n0 for tail processing
+ adcs x21,x21,x11
+ mul x11,x15,x25
+ adcs x22,x22,x12
+ mul x12,x16,x25
+ adc x23,x23,x13 // can't overflow
+ mul x13,x17,x25
+ // (*) adds xzr,x19,x10
+ subs xzr,x19,#1 // (*)
+ umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
+ adcs x19,x20,x11
+ umulh x11,x15,x25
+ adcs x20,x21,x12
+ umulh x12,x16,x25
+ adcs x21,x22,x13
+ umulh x13,x17,x25
+ adcs x22,x23,x0
+ adc x0,xzr,xzr
+ adds x19,x19,x10
+ sub x10,x27,x1
+ adcs x20,x20,x11
+ adcs x21,x21,x12
+ adcs x22,x22,x13
+ //adc x0,x0,xzr
+ cbnz x28,.Loop_mul4x_1st_reduction
+
+ cbz x10,.Lmul4x4_post_condition
+
+ ldp x6,x7,[x1,#8*0] // a[4..7]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ ldr x25,[sp] // a[0]*n0
+ ldp x14,x15,[x3,#8*0] // n[4..7]
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+
+.Loop_mul4x_1st_tail:
+ mul x10,x6,x24 // lo(a[4..7]*b[i])
+ adc x0,x0,xzr // modulo-scheduled
+ mul x11,x7,x24
+ add x28,x28,#8
+ mul x12,x8,x24
+ and x28,x28,#31
+ mul x13,x9,x24
+ adds x19,x19,x10
+ umulh x10,x6,x24 // hi(a[4..7]*b[i])
+ adcs x20,x20,x11
+ umulh x11,x7,x24
+ adcs x21,x21,x12
+ umulh x12,x8,x24
+ adcs x22,x22,x13
+ umulh x13,x9,x24
+ adc x23,xzr,xzr
+ ldr x24,[x2,x28] // next b[i] (or b[0])
+ adds x20,x20,x10
+ mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
+ adcs x21,x21,x11
+ mul x11,x15,x25
+ adcs x22,x22,x12
+ mul x12,x16,x25
+ adc x23,x23,x13 // can't overflow
+ mul x13,x17,x25
+ adds x19,x19,x10
+ umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
+ adcs x20,x20,x11
+ umulh x11,x15,x25
+ adcs x21,x21,x12
+ umulh x12,x16,x25
+ adcs x22,x22,x13
+ adcs x23,x23,x0
+ umulh x13,x17,x25
+ adc x0,xzr,xzr
+ ldr x25,[sp,x28] // next t[0]*n0
+ str x19,[x26],#8 // result!!!
+ adds x19,x20,x10
+ sub x10,x27,x1 // done yet?
+ adcs x20,x21,x11
+ adcs x21,x22,x12
+ adcs x22,x23,x13
+ //adc x0,x0,xzr
+ cbnz x28,.Loop_mul4x_1st_tail
+
+ sub x11,x27,x5 // rewinded x1
+ cbz x10,.Lmul4x_proceed
+
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ ldp x14,x15,[x3,#8*0]
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ b .Loop_mul4x_1st_tail
+
+.align 5
+.Lmul4x_proceed:
+ ldr x24,[x2,#8*4]! // *++b
+ adc x30,x0,xzr
+ ldp x6,x7,[x11,#8*0] // a[0..3]
+ sub x3,x3,x5 // rewind np
+ ldp x8,x9,[x11,#8*2]
+ add x1,x11,#8*4
+
+ stp x19,x20,[x26,#8*0] // result!!!
+ ldp x19,x20,[sp,#8*4] // t[0..3]
+ stp x21,x22,[x26,#8*2] // result!!!
+ ldp x21,x22,[sp,#8*6]
+
+ ldp x14,x15,[x3,#8*0] // n[0..3]
+ mov x26,sp
+ ldp x16,x17,[x3,#8*2]
+ adds x3,x3,#8*4 // clear carry bit
+ mov x0,xzr
+
+.align 4
+.Loop_mul4x_reduction:
+ mul x10,x6,x24 // lo(a[0..3]*b[4])
+ adc x0,x0,xzr // modulo-scheduled
+ mul x11,x7,x24
+ add x28,x28,#8
+ mul x12,x8,x24
+ and x28,x28,#31
+ mul x13,x9,x24
+ adds x19,x19,x10
+ umulh x10,x6,x24 // hi(a[0..3]*b[4])
+ adcs x20,x20,x11
+ mul x25,x19,x4 // t[0]*n0
+ adcs x21,x21,x12
+ umulh x11,x7,x24
+ adcs x22,x22,x13
+ umulh x12,x8,x24
+ adc x23,xzr,xzr
+ umulh x13,x9,x24
+ ldr x24,[x2,x28] // next b[i]
+ adds x20,x20,x10
+ // (*) mul x10,x14,x25
+ str x25,[x26],#8 // put aside t[0]*n0 for tail processing
+ adcs x21,x21,x11
+ mul x11,x15,x25 // lo(n[0..3]*t[0]*n0
+ adcs x22,x22,x12
+ mul x12,x16,x25
+ adc x23,x23,x13 // can't overflow
+ mul x13,x17,x25
+ // (*) adds xzr,x19,x10
+ subs xzr,x19,#1 // (*)
+ umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0
+ adcs x19,x20,x11
+ umulh x11,x15,x25
+ adcs x20,x21,x12
+ umulh x12,x16,x25
+ adcs x21,x22,x13
+ umulh x13,x17,x25
+ adcs x22,x23,x0
+ adc x0,xzr,xzr
+ adds x19,x19,x10
+ adcs x20,x20,x11
+ adcs x21,x21,x12
+ adcs x22,x22,x13
+ //adc x0,x0,xzr
+ cbnz x28,.Loop_mul4x_reduction
+
+ adc x0,x0,xzr
+ ldp x10,x11,[x26,#8*4] // t[4..7]
+ ldp x12,x13,[x26,#8*6]
+ ldp x6,x7,[x1,#8*0] // a[4..7]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ adds x19,x19,x10
+ adcs x20,x20,x11
+ adcs x21,x21,x12
+ adcs x22,x22,x13
+ //adc x0,x0,xzr
+
+ ldr x25,[sp] // t[0]*n0
+ ldp x14,x15,[x3,#8*0] // n[4..7]
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+
+.align 4
+.Loop_mul4x_tail:
+ mul x10,x6,x24 // lo(a[4..7]*b[4])
+ adc x0,x0,xzr // modulo-scheduled
+ mul x11,x7,x24
+ add x28,x28,#8
+ mul x12,x8,x24
+ and x28,x28,#31
+ mul x13,x9,x24
+ adds x19,x19,x10
+ umulh x10,x6,x24 // hi(a[4..7]*b[4])
+ adcs x20,x20,x11
+ umulh x11,x7,x24
+ adcs x21,x21,x12
+ umulh x12,x8,x24
+ adcs x22,x22,x13
+ umulh x13,x9,x24
+ adc x23,xzr,xzr
+ ldr x24,[x2,x28] // next b[i]
+ adds x20,x20,x10
+ mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
+ adcs x21,x21,x11
+ mul x11,x15,x25
+ adcs x22,x22,x12
+ mul x12,x16,x25
+ adc x23,x23,x13 // can't overflow
+ mul x13,x17,x25
+ adds x19,x19,x10
+ umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
+ adcs x20,x20,x11
+ umulh x11,x15,x25
+ adcs x21,x21,x12
+ umulh x12,x16,x25
+ adcs x22,x22,x13
+ umulh x13,x17,x25
+ adcs x23,x23,x0
+ ldr x25,[sp,x28] // next a[0]*n0
+ adc x0,xzr,xzr
+ str x19,[x26],#8 // result!!!
+ adds x19,x20,x10
+ sub x10,x27,x1 // done yet?
+ adcs x20,x21,x11
+ adcs x21,x22,x12
+ adcs x22,x23,x13
+ //adc x0,x0,xzr
+ cbnz x28,.Loop_mul4x_tail
+
+ sub x11,x3,x5 // rewinded np?
+ adc x0,x0,xzr
+ cbz x10,.Loop_mul4x_break
+
+ ldp x10,x11,[x26,#8*4]
+ ldp x12,x13,[x26,#8*6]
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ adds x19,x19,x10
+ adcs x20,x20,x11
+ adcs x21,x21,x12
+ adcs x22,x22,x13
+ //adc x0,x0,xzr
+ ldp x14,x15,[x3,#8*0]
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ b .Loop_mul4x_tail
+
+.align 4
+.Loop_mul4x_break:
+ ldp x12,x13,[x29,#96] // pull rp and &b[num]
+ adds x19,x19,x30
+ add x2,x2,#8*4 // bp++
+ adcs x20,x20,xzr
+ sub x1,x1,x5 // rewind ap
+ adcs x21,x21,xzr
+ stp x19,x20,[x26,#8*0] // result!!!
+ adcs x22,x22,xzr
+ ldp x19,x20,[sp,#8*4] // t[0..3]
+ adc x30,x0,xzr
+ stp x21,x22,[x26,#8*2] // result!!!
+ cmp x2,x13 // done yet?
+ ldp x21,x22,[sp,#8*6]
+ ldp x14,x15,[x11,#8*0] // n[0..3]
+ ldp x16,x17,[x11,#8*2]
+ add x3,x11,#8*4
+ b.eq .Lmul4x_post
+
+ ldr x24,[x2]
+ ldp x6,x7,[x1,#8*0] // a[0..3]
+ ldp x8,x9,[x1,#8*2]
+ adds x1,x1,#8*4 // clear carry bit
+ mov x0,xzr
+ mov x26,sp
+ b .Loop_mul4x_reduction
+
+.align 4
+.Lmul4x_post:
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ mov x0,x12
+ mov x27,x12 // x0 copy
+ subs x10,x19,x14
+ add x26,sp,#8*8
+ sbcs x11,x20,x15
+ sub x28,x5,#8*4
+
+.Lmul4x_sub:
+ sbcs x12,x21,x16
+ ldp x14,x15,[x3,#8*0]
+ sub x28,x28,#8*4
+ ldp x19,x20,[x26,#8*0]
+ sbcs x13,x22,x17
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ ldp x21,x22,[x26,#8*2]
+ add x26,x26,#8*4
+ stp x10,x11,[x0,#8*0]
+ sbcs x10,x19,x14
+ stp x12,x13,[x0,#8*2]
+ add x0,x0,#8*4
+ sbcs x11,x20,x15
+ cbnz x28,.Lmul4x_sub
+
+ sbcs x12,x21,x16
+ mov x26,sp
+ add x1,sp,#8*4
+ ldp x6,x7,[x27,#8*0]
+ sbcs x13,x22,x17
+ stp x10,x11,[x0,#8*0]
+ ldp x8,x9,[x27,#8*2]
+ stp x12,x13,[x0,#8*2]
+ ldp x19,x20,[x1,#8*0]
+ ldp x21,x22,[x1,#8*2]
+ sbcs xzr,x30,xzr // did it borrow?
+ ldr x30,[x29,#8] // pull return address
+
+ sub x28,x5,#8*4
+.Lmul4x_cond_copy:
+ sub x28,x28,#8*4
+ csel x10,x19,x6,lo
+ stp xzr,xzr,[x26,#8*0]
+ csel x11,x20,x7,lo
+ ldp x6,x7,[x27,#8*4]
+ ldp x19,x20,[x1,#8*4]
+ csel x12,x21,x8,lo
+ stp xzr,xzr,[x26,#8*2]
+ add x26,x26,#8*4
+ csel x13,x22,x9,lo
+ ldp x8,x9,[x27,#8*6]
+ ldp x21,x22,[x1,#8*6]
+ add x1,x1,#8*4
+ stp x10,x11,[x27,#8*0]
+ stp x12,x13,[x27,#8*2]
+ add x27,x27,#8*4
+ cbnz x28,.Lmul4x_cond_copy
+
+ csel x10,x19,x6,lo
+ stp xzr,xzr,[x26,#8*0]
+ csel x11,x20,x7,lo
+ stp xzr,xzr,[x26,#8*2]
+ csel x12,x21,x8,lo
+ stp xzr,xzr,[x26,#8*3]
+ csel x13,x22,x9,lo
+ stp xzr,xzr,[x26,#8*4]
+ stp x10,x11,[x27,#8*0]
+ stp x12,x13,[x27,#8*2]
+
+ b .Lmul4x_done
+
+.align 4
+.Lmul4x4_post_condition:
+ adc x0,x0,xzr
+ ldr x1,[x29,#96] // pull rp
+ // x19-3,x0 hold result, x14-7 hold modulus
+ subs x6,x19,x14
+ ldr x30,[x29,#8] // pull return address
+ sbcs x7,x20,x15
+ stp xzr,xzr,[sp,#8*0]
+ sbcs x8,x21,x16
+ stp xzr,xzr,[sp,#8*2]
+ sbcs x9,x22,x17
+ stp xzr,xzr,[sp,#8*4]
+ sbcs xzr,x0,xzr // did it borrow?
+ stp xzr,xzr,[sp,#8*6]
+
+ // x6-3 hold result-modulus
+ csel x6,x19,x6,lo
+ csel x7,x20,x7,lo
+ csel x8,x21,x8,lo
+ csel x9,x22,x9,lo
+ stp x6,x7,[x1,#8*0]
+ stp x8,x9,[x1,#8*2]
+
+.Lmul4x_done:
+ ldp x19,x20,[x29,#16]
+ mov sp,x29
+ ldp x21,x22,[x29,#32]
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldr x29,[sp],#128
+ // x30 is popped earlier
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size __bn_mul4x_mont,.-__bn_mul4x_mont
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 4
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/armv8-mont-win.S b/gen/bcm/armv8-mont-win.S
new file mode 100644
index 0000000..dcce02c
--- /dev/null
+++ b/gen/bcm/armv8-mont-win.S
@@ -0,0 +1,1431 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl bn_mul_mont
+
+.def bn_mul_mont
+ .type 32
+.endef
+.align 5
+bn_mul_mont:
+ AARCH64_SIGN_LINK_REGISTER
+ tst x5,#7
+ b.eq __bn_sqr8x_mont
+ tst x5,#3
+ b.eq __bn_mul4x_mont
+Lmul_mont:
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+
+ ldr x9,[x2],#8 // bp[0]
+ sub x22,sp,x5,lsl#3
+ ldp x7,x8,[x1],#16 // ap[0..1]
+ lsl x5,x5,#3
+ ldr x4,[x4] // *n0
+ and x22,x22,#-16 // ABI says so
+ ldp x13,x14,[x3],#16 // np[0..1]
+
+ mul x6,x7,x9 // ap[0]*bp[0]
+ sub x21,x5,#16 // j=num-2
+ umulh x7,x7,x9
+ mul x10,x8,x9 // ap[1]*bp[0]
+ umulh x11,x8,x9
+
+ mul x15,x6,x4 // "tp[0]"*n0
+ mov sp,x22 // alloca
+
+ // (*) mul x12,x13,x15 // np[0]*m1
+ umulh x13,x13,x15
+ mul x16,x14,x15 // np[1]*m1
+ // (*) adds x12,x12,x6 // discarded
+ // (*) As for removal of first multiplication and addition
+ // instructions. The outcome of first addition is
+ // guaranteed to be zero, which leaves two computationally
+ // significant outcomes: it either carries or not. Then
+ // question is when does it carry? Is there alternative
+ // way to deduce it? If you follow operations, you can
+ // observe that condition for carry is quite simple:
+ // x6 being non-zero. So that carry can be calculated
+ // by adding -1 to x6. That's what next instruction does.
+ subs xzr,x6,#1 // (*)
+ umulh x17,x14,x15
+ adc x13,x13,xzr
+ cbz x21,L1st_skip
+
+L1st:
+ ldr x8,[x1],#8
+ adds x6,x10,x7
+ sub x21,x21,#8 // j--
+ adc x7,x11,xzr
+
+ ldr x14,[x3],#8
+ adds x12,x16,x13
+ mul x10,x8,x9 // ap[j]*bp[0]
+ adc x13,x17,xzr
+ umulh x11,x8,x9
+
+ adds x12,x12,x6
+ mul x16,x14,x15 // np[j]*m1
+ adc x13,x13,xzr
+ umulh x17,x14,x15
+ str x12,[x22],#8 // tp[j-1]
+ cbnz x21,L1st
+
+L1st_skip:
+ adds x6,x10,x7
+ sub x1,x1,x5 // rewind x1
+ adc x7,x11,xzr
+
+ adds x12,x16,x13
+ sub x3,x3,x5 // rewind x3
+ adc x13,x17,xzr
+
+ adds x12,x12,x6
+ sub x20,x5,#8 // i=num-1
+ adcs x13,x13,x7
+
+ adc x19,xzr,xzr // upmost overflow bit
+ stp x12,x13,[x22]
+
+Louter:
+ ldr x9,[x2],#8 // bp[i]
+ ldp x7,x8,[x1],#16
+ ldr x23,[sp] // tp[0]
+ add x22,sp,#8
+
+ mul x6,x7,x9 // ap[0]*bp[i]
+ sub x21,x5,#16 // j=num-2
+ umulh x7,x7,x9
+ ldp x13,x14,[x3],#16
+ mul x10,x8,x9 // ap[1]*bp[i]
+ adds x6,x6,x23
+ umulh x11,x8,x9
+ adc x7,x7,xzr
+
+ mul x15,x6,x4
+ sub x20,x20,#8 // i--
+
+ // (*) mul x12,x13,x15 // np[0]*m1
+ umulh x13,x13,x15
+ mul x16,x14,x15 // np[1]*m1
+ // (*) adds x12,x12,x6
+ subs xzr,x6,#1 // (*)
+ umulh x17,x14,x15
+ cbz x21,Linner_skip
+
+Linner:
+ ldr x8,[x1],#8
+ adc x13,x13,xzr
+ ldr x23,[x22],#8 // tp[j]
+ adds x6,x10,x7
+ sub x21,x21,#8 // j--
+ adc x7,x11,xzr
+
+ adds x12,x16,x13
+ ldr x14,[x3],#8
+ adc x13,x17,xzr
+
+ mul x10,x8,x9 // ap[j]*bp[i]
+ adds x6,x6,x23
+ umulh x11,x8,x9
+ adc x7,x7,xzr
+
+ mul x16,x14,x15 // np[j]*m1
+ adds x12,x12,x6
+ umulh x17,x14,x15
+ str x12,[x22,#-16] // tp[j-1]
+ cbnz x21,Linner
+
+Linner_skip:
+ ldr x23,[x22],#8 // tp[j]
+ adc x13,x13,xzr
+ adds x6,x10,x7
+ sub x1,x1,x5 // rewind x1
+ adc x7,x11,xzr
+
+ adds x12,x16,x13
+ sub x3,x3,x5 // rewind x3
+ adcs x13,x17,x19
+ adc x19,xzr,xzr
+
+ adds x6,x6,x23
+ adc x7,x7,xzr
+
+ adds x12,x12,x6
+ adcs x13,x13,x7
+ adc x19,x19,xzr // upmost overflow bit
+ stp x12,x13,[x22,#-16]
+
+ cbnz x20,Louter
+
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ ldr x23,[sp] // tp[0]
+ add x22,sp,#8
+ ldr x14,[x3],#8 // np[0]
+ subs x21,x5,#8 // j=num-1 and clear borrow
+ mov x1,x0
+Lsub:
+ sbcs x8,x23,x14 // tp[j]-np[j]
+ ldr x23,[x22],#8
+ sub x21,x21,#8 // j--
+ ldr x14,[x3],#8
+ str x8,[x1],#8 // rp[j]=tp[j]-np[j]
+ cbnz x21,Lsub
+
+ sbcs x8,x23,x14
+ sbcs x19,x19,xzr // did it borrow?
+ str x8,[x1],#8 // rp[num-1]
+
+ ldr x23,[sp] // tp[0]
+ add x22,sp,#8
+ ldr x8,[x0],#8 // rp[0]
+ sub x5,x5,#8 // num--
+ nop
+Lcond_copy:
+ sub x5,x5,#8 // num--
+ csel x14,x23,x8,lo // did it borrow?
+ ldr x23,[x22],#8
+ ldr x8,[x0],#8
+ str xzr,[x22,#-16] // wipe tp
+ str x14,[x0,#-16]
+ cbnz x5,Lcond_copy
+
+ csel x14,x23,x8,lo
+ str xzr,[x22,#-8] // wipe tp
+ str x14,[x0,#-8]
+
+ ldp x19,x20,[x29,#16]
+ mov sp,x29
+ ldp x21,x22,[x29,#32]
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldr x29,[sp],#64
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.def __bn_sqr8x_mont
+ .type 32
+.endef
+.align 5
+__bn_sqr8x_mont:
+ // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
+ // only from bn_mul_mont which has already signed the return address.
+ cmp x1,x2
+ b.ne __bn_mul4x_mont
+Lsqr8x_mont:
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ stp x0,x3,[sp,#96] // offload rp and np
+
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ ldp x10,x11,[x1,#8*4]
+ ldp x12,x13,[x1,#8*6]
+
+ sub x2,sp,x5,lsl#4
+ lsl x5,x5,#3
+ ldr x4,[x4] // *n0
+ mov sp,x2 // alloca
+ sub x27,x5,#8*8
+ b Lsqr8x_zero_start
+
+Lsqr8x_zero:
+ sub x27,x27,#8*8
+ stp xzr,xzr,[x2,#8*0]
+ stp xzr,xzr,[x2,#8*2]
+ stp xzr,xzr,[x2,#8*4]
+ stp xzr,xzr,[x2,#8*6]
+Lsqr8x_zero_start:
+ stp xzr,xzr,[x2,#8*8]
+ stp xzr,xzr,[x2,#8*10]
+ stp xzr,xzr,[x2,#8*12]
+ stp xzr,xzr,[x2,#8*14]
+ add x2,x2,#8*16
+ cbnz x27,Lsqr8x_zero
+
+ add x3,x1,x5
+ add x1,x1,#8*8
+ mov x19,xzr
+ mov x20,xzr
+ mov x21,xzr
+ mov x22,xzr
+ mov x23,xzr
+ mov x24,xzr
+ mov x25,xzr
+ mov x26,xzr
+ mov x2,sp
+ str x4,[x29,#112] // offload n0
+
+ // Multiply everything but a[i]*a[i]
+.align 4
+Lsqr8x_outer_loop:
+ // a[1]a[0] (i)
+ // a[2]a[0]
+ // a[3]a[0]
+ // a[4]a[0]
+ // a[5]a[0]
+ // a[6]a[0]
+ // a[7]a[0]
+ // a[2]a[1] (ii)
+ // a[3]a[1]
+ // a[4]a[1]
+ // a[5]a[1]
+ // a[6]a[1]
+ // a[7]a[1]
+ // a[3]a[2] (iii)
+ // a[4]a[2]
+ // a[5]a[2]
+ // a[6]a[2]
+ // a[7]a[2]
+ // a[4]a[3] (iv)
+ // a[5]a[3]
+ // a[6]a[3]
+ // a[7]a[3]
+ // a[5]a[4] (v)
+ // a[6]a[4]
+ // a[7]a[4]
+ // a[6]a[5] (vi)
+ // a[7]a[5]
+ // a[7]a[6] (vii)
+
+ mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
+ mul x15,x8,x6
+ mul x16,x9,x6
+ mul x17,x10,x6
+ adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
+ mul x14,x11,x6
+ adcs x21,x21,x15
+ mul x15,x12,x6
+ adcs x22,x22,x16
+ mul x16,x13,x6
+ adcs x23,x23,x17
+ umulh x17,x7,x6 // hi(a[1..7]*a[0])
+ adcs x24,x24,x14
+ umulh x14,x8,x6
+ adcs x25,x25,x15
+ umulh x15,x9,x6
+ adcs x26,x26,x16
+ umulh x16,x10,x6
+ stp x19,x20,[x2],#8*2 // t[0..1]
+ adc x19,xzr,xzr // t[8]
+ adds x21,x21,x17 // t[2]+lo(a[1]*a[0])
+ umulh x17,x11,x6
+ adcs x22,x22,x14
+ umulh x14,x12,x6
+ adcs x23,x23,x15
+ umulh x15,x13,x6
+ adcs x24,x24,x16
+ mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
+ adcs x25,x25,x17
+ mul x17,x9,x7
+ adcs x26,x26,x14
+ mul x14,x10,x7
+ adc x19,x19,x15
+
+ mul x15,x11,x7
+ adds x22,x22,x16
+ mul x16,x12,x7
+ adcs x23,x23,x17
+ mul x17,x13,x7
+ adcs x24,x24,x14
+ umulh x14,x8,x7 // hi(a[2..7]*a[1])
+ adcs x25,x25,x15
+ umulh x15,x9,x7
+ adcs x26,x26,x16
+ umulh x16,x10,x7
+ adcs x19,x19,x17
+ umulh x17,x11,x7
+ stp x21,x22,[x2],#8*2 // t[2..3]
+ adc x20,xzr,xzr // t[9]
+ adds x23,x23,x14
+ umulh x14,x12,x7
+ adcs x24,x24,x15
+ umulh x15,x13,x7
+ adcs x25,x25,x16
+ mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
+ adcs x26,x26,x17
+ mul x17,x10,x8
+ adcs x19,x19,x14
+ mul x14,x11,x8
+ adc x20,x20,x15
+
+ mul x15,x12,x8
+ adds x24,x24,x16
+ mul x16,x13,x8
+ adcs x25,x25,x17
+ umulh x17,x9,x8 // hi(a[3..7]*a[2])
+ adcs x26,x26,x14
+ umulh x14,x10,x8
+ adcs x19,x19,x15
+ umulh x15,x11,x8
+ adcs x20,x20,x16
+ umulh x16,x12,x8
+ stp x23,x24,[x2],#8*2 // t[4..5]
+ adc x21,xzr,xzr // t[10]
+ adds x25,x25,x17
+ umulh x17,x13,x8
+ adcs x26,x26,x14
+ mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
+ adcs x19,x19,x15
+ mul x15,x11,x9
+ adcs x20,x20,x16
+ mul x16,x12,x9
+ adc x21,x21,x17
+
+ mul x17,x13,x9
+ adds x26,x26,x14
+ umulh x14,x10,x9 // hi(a[4..7]*a[3])
+ adcs x19,x19,x15
+ umulh x15,x11,x9
+ adcs x20,x20,x16
+ umulh x16,x12,x9
+ adcs x21,x21,x17
+ umulh x17,x13,x9
+ stp x25,x26,[x2],#8*2 // t[6..7]
+ adc x22,xzr,xzr // t[11]
+ adds x19,x19,x14
+ mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
+ adcs x20,x20,x15
+ mul x15,x12,x10
+ adcs x21,x21,x16
+ mul x16,x13,x10
+ adc x22,x22,x17
+
+ umulh x17,x11,x10 // hi(a[5..7]*a[4])
+ adds x20,x20,x14
+ umulh x14,x12,x10
+ adcs x21,x21,x15
+ umulh x15,x13,x10
+ adcs x22,x22,x16
+ mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
+ adc x23,xzr,xzr // t[12]
+ adds x21,x21,x17
+ mul x17,x13,x11
+ adcs x22,x22,x14
+ umulh x14,x12,x11 // hi(a[6..7]*a[5])
+ adc x23,x23,x15
+
+ umulh x15,x13,x11
+ adds x22,x22,x16
+ mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
+ adcs x23,x23,x17
+ umulh x17,x13,x12 // hi(a[7]*a[6])
+ adc x24,xzr,xzr // t[13]
+ adds x23,x23,x14
+ sub x27,x3,x1 // done yet?
+ adc x24,x24,x15
+
+ adds x24,x24,x16
+ sub x14,x3,x5 // rewinded ap
+ adc x25,xzr,xzr // t[14]
+ add x25,x25,x17
+
+ cbz x27,Lsqr8x_outer_break
+
+ mov x4,x6
+ ldp x6,x7,[x2,#8*0]
+ ldp x8,x9,[x2,#8*2]
+ ldp x10,x11,[x2,#8*4]
+ ldp x12,x13,[x2,#8*6]
+ adds x19,x19,x6
+ adcs x20,x20,x7
+ ldp x6,x7,[x1,#8*0]
+ adcs x21,x21,x8
+ adcs x22,x22,x9
+ ldp x8,x9,[x1,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x1,#8*4]
+ adcs x25,x25,x12
+ mov x0,x1
+ adcs x26,xzr,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ //adc x28,xzr,xzr // moved below
+ mov x27,#-8*8
+
+ // a[8]a[0]
+ // a[9]a[0]
+ // a[a]a[0]
+ // a[b]a[0]
+ // a[c]a[0]
+ // a[d]a[0]
+ // a[e]a[0]
+ // a[f]a[0]
+ // a[8]a[1]
+ // a[f]a[1]........................
+ // a[8]a[2]
+ // a[f]a[2]........................
+ // a[8]a[3]
+ // a[f]a[3]........................
+ // a[8]a[4]
+ // a[f]a[4]........................
+ // a[8]a[5]
+ // a[f]a[5]........................
+ // a[8]a[6]
+ // a[f]a[6]........................
+ // a[8]a[7]
+ // a[f]a[7]........................
+Lsqr8x_mul:
+ mul x14,x6,x4
+ adc x28,xzr,xzr // carry bit, modulo-scheduled
+ mul x15,x7,x4
+ add x27,x27,#8
+ mul x16,x8,x4
+ mul x17,x9,x4
+ adds x19,x19,x14
+ mul x14,x10,x4
+ adcs x20,x20,x15
+ mul x15,x11,x4
+ adcs x21,x21,x16
+ mul x16,x12,x4
+ adcs x22,x22,x17
+ mul x17,x13,x4
+ adcs x23,x23,x14
+ umulh x14,x6,x4
+ adcs x24,x24,x15
+ umulh x15,x7,x4
+ adcs x25,x25,x16
+ umulh x16,x8,x4
+ adcs x26,x26,x17
+ umulh x17,x9,x4
+ adc x28,x28,xzr
+ str x19,[x2],#8
+ adds x19,x20,x14
+ umulh x14,x10,x4
+ adcs x20,x21,x15
+ umulh x15,x11,x4
+ adcs x21,x22,x16
+ umulh x16,x12,x4
+ adcs x22,x23,x17
+ umulh x17,x13,x4
+ ldr x4,[x0,x27]
+ adcs x23,x24,x14
+ adcs x24,x25,x15
+ adcs x25,x26,x16
+ adcs x26,x28,x17
+ //adc x28,xzr,xzr // moved above
+ cbnz x27,Lsqr8x_mul
+ // note that carry flag is guaranteed
+ // to be zero at this point
+ cmp x1,x3 // done yet?
+ b.eq Lsqr8x_break
+
+ ldp x6,x7,[x2,#8*0]
+ ldp x8,x9,[x2,#8*2]
+ ldp x10,x11,[x2,#8*4]
+ ldp x12,x13,[x2,#8*6]
+ adds x19,x19,x6
+ ldr x4,[x0,#-8*8]
+ adcs x20,x20,x7
+ ldp x6,x7,[x1,#8*0]
+ adcs x21,x21,x8
+ adcs x22,x22,x9
+ ldp x8,x9,[x1,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x1,#8*4]
+ adcs x25,x25,x12
+ mov x27,#-8*8
+ adcs x26,x26,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ //adc x28,xzr,xzr // moved above
+ b Lsqr8x_mul
+
+.align 4
+Lsqr8x_break:
+ ldp x6,x7,[x0,#8*0]
+ add x1,x0,#8*8
+ ldp x8,x9,[x0,#8*2]
+ sub x14,x3,x1 // is it last iteration?
+ ldp x10,x11,[x0,#8*4]
+ sub x15,x2,x14
+ ldp x12,x13,[x0,#8*6]
+ cbz x14,Lsqr8x_outer_loop
+
+ stp x19,x20,[x2,#8*0]
+ ldp x19,x20,[x15,#8*0]
+ stp x21,x22,[x2,#8*2]
+ ldp x21,x22,[x15,#8*2]
+ stp x23,x24,[x2,#8*4]
+ ldp x23,x24,[x15,#8*4]
+ stp x25,x26,[x2,#8*6]
+ mov x2,x15
+ ldp x25,x26,[x15,#8*6]
+ b Lsqr8x_outer_loop
+
+.align 4
+Lsqr8x_outer_break:
+ // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
+ ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
+ ldp x15,x16,[sp,#8*1]
+ ldp x11,x13,[x14,#8*2]
+ add x1,x14,#8*4
+ ldp x17,x14,[sp,#8*3]
+
+ stp x19,x20,[x2,#8*0]
+ mul x19,x7,x7
+ stp x21,x22,[x2,#8*2]
+ umulh x7,x7,x7
+ stp x23,x24,[x2,#8*4]
+ mul x8,x9,x9
+ stp x25,x26,[x2,#8*6]
+ mov x2,sp
+ umulh x9,x9,x9
+ adds x20,x7,x15,lsl#1
+ extr x15,x16,x15,#63
+ sub x27,x5,#8*4
+
+Lsqr4x_shift_n_add:
+ adcs x21,x8,x15
+ extr x16,x17,x16,#63
+ sub x27,x27,#8*4
+ adcs x22,x9,x16
+ ldp x15,x16,[x2,#8*5]
+ mul x10,x11,x11
+ ldp x7,x9,[x1],#8*2
+ umulh x11,x11,x11
+ mul x12,x13,x13
+ umulh x13,x13,x13
+ extr x17,x14,x17,#63
+ stp x19,x20,[x2,#8*0]
+ adcs x23,x10,x17
+ extr x14,x15,x14,#63
+ stp x21,x22,[x2,#8*2]
+ adcs x24,x11,x14
+ ldp x17,x14,[x2,#8*7]
+ extr x15,x16,x15,#63
+ adcs x25,x12,x15
+ extr x16,x17,x16,#63
+ adcs x26,x13,x16
+ ldp x15,x16,[x2,#8*9]
+ mul x6,x7,x7
+ ldp x11,x13,[x1],#8*2
+ umulh x7,x7,x7
+ mul x8,x9,x9
+ umulh x9,x9,x9
+ stp x23,x24,[x2,#8*4]
+ extr x17,x14,x17,#63
+ stp x25,x26,[x2,#8*6]
+ add x2,x2,#8*8
+ adcs x19,x6,x17
+ extr x14,x15,x14,#63
+ adcs x20,x7,x14
+ ldp x17,x14,[x2,#8*3]
+ extr x15,x16,x15,#63
+ cbnz x27,Lsqr4x_shift_n_add
+ ldp x1,x4,[x29,#104] // pull np and n0
+
+ adcs x21,x8,x15
+ extr x16,x17,x16,#63
+ adcs x22,x9,x16
+ ldp x15,x16,[x2,#8*5]
+ mul x10,x11,x11
+ umulh x11,x11,x11
+ stp x19,x20,[x2,#8*0]
+ mul x12,x13,x13
+ umulh x13,x13,x13
+ stp x21,x22,[x2,#8*2]
+ extr x17,x14,x17,#63
+ adcs x23,x10,x17
+ extr x14,x15,x14,#63
+ ldp x19,x20,[sp,#8*0]
+ adcs x24,x11,x14
+ extr x15,x16,x15,#63
+ ldp x6,x7,[x1,#8*0]
+ adcs x25,x12,x15
+ extr x16,xzr,x16,#63
+ ldp x8,x9,[x1,#8*2]
+ adc x26,x13,x16
+ ldp x10,x11,[x1,#8*4]
+
+ // Reduce by 512 bits per iteration
+ mul x28,x4,x19 // t[0]*n0
+ ldp x12,x13,[x1,#8*6]
+ add x3,x1,x5
+ ldp x21,x22,[sp,#8*2]
+ stp x23,x24,[x2,#8*4]
+ ldp x23,x24,[sp,#8*4]
+ stp x25,x26,[x2,#8*6]
+ ldp x25,x26,[sp,#8*6]
+ add x1,x1,#8*8
+ mov x30,xzr // initial top-most carry
+ mov x2,sp
+ mov x27,#8
+
+Lsqr8x_reduction:
+ // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
+ mul x15,x7,x28
+ sub x27,x27,#1
+ mul x16,x8,x28
+ str x28,[x2],#8 // put aside t[0]*n0 for tail processing
+ mul x17,x9,x28
+ // (*) adds xzr,x19,x14
+ subs xzr,x19,#1 // (*)
+ mul x14,x10,x28
+ adcs x19,x20,x15
+ mul x15,x11,x28
+ adcs x20,x21,x16
+ mul x16,x12,x28
+ adcs x21,x22,x17
+ mul x17,x13,x28
+ adcs x22,x23,x14
+ umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
+ adcs x23,x24,x15
+ umulh x15,x7,x28
+ adcs x24,x25,x16
+ umulh x16,x8,x28
+ adcs x25,x26,x17
+ umulh x17,x9,x28
+ adc x26,xzr,xzr
+ adds x19,x19,x14
+ umulh x14,x10,x28
+ adcs x20,x20,x15
+ umulh x15,x11,x28
+ adcs x21,x21,x16
+ umulh x16,x12,x28
+ adcs x22,x22,x17
+ umulh x17,x13,x28
+ mul x28,x4,x19 // next t[0]*n0
+ adcs x23,x23,x14
+ adcs x24,x24,x15
+ adcs x25,x25,x16
+ adc x26,x26,x17
+ cbnz x27,Lsqr8x_reduction
+
+ ldp x14,x15,[x2,#8*0]
+ ldp x16,x17,[x2,#8*2]
+ mov x0,x2
+ sub x27,x3,x1 // done yet?
+ adds x19,x19,x14
+ adcs x20,x20,x15
+ ldp x14,x15,[x2,#8*4]
+ adcs x21,x21,x16
+ adcs x22,x22,x17
+ ldp x16,x17,[x2,#8*6]
+ adcs x23,x23,x14
+ adcs x24,x24,x15
+ adcs x25,x25,x16
+ adcs x26,x26,x17
+ //adc x28,xzr,xzr // moved below
+ cbz x27,Lsqr8x8_post_condition
+
+ ldr x4,[x2,#-8*8]
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ ldp x10,x11,[x1,#8*4]
+ mov x27,#-8*8
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+
+Lsqr8x_tail:
+ mul x14,x6,x4
+ adc x28,xzr,xzr // carry bit, modulo-scheduled
+ mul x15,x7,x4
+ add x27,x27,#8
+ mul x16,x8,x4
+ mul x17,x9,x4
+ adds x19,x19,x14
+ mul x14,x10,x4
+ adcs x20,x20,x15
+ mul x15,x11,x4
+ adcs x21,x21,x16
+ mul x16,x12,x4
+ adcs x22,x22,x17
+ mul x17,x13,x4
+ adcs x23,x23,x14
+ umulh x14,x6,x4
+ adcs x24,x24,x15
+ umulh x15,x7,x4
+ adcs x25,x25,x16
+ umulh x16,x8,x4
+ adcs x26,x26,x17
+ umulh x17,x9,x4
+ adc x28,x28,xzr
+ str x19,[x2],#8
+ adds x19,x20,x14
+ umulh x14,x10,x4
+ adcs x20,x21,x15
+ umulh x15,x11,x4
+ adcs x21,x22,x16
+ umulh x16,x12,x4
+ adcs x22,x23,x17
+ umulh x17,x13,x4
+ ldr x4,[x0,x27]
+ adcs x23,x24,x14
+ adcs x24,x25,x15
+ adcs x25,x26,x16
+ adcs x26,x28,x17
+ //adc x28,xzr,xzr // moved above
+ cbnz x27,Lsqr8x_tail
+ // note that carry flag is guaranteed
+ // to be zero at this point
+ ldp x6,x7,[x2,#8*0]
+ sub x27,x3,x1 // done yet?
+ sub x16,x3,x5 // rewinded np
+ ldp x8,x9,[x2,#8*2]
+ ldp x10,x11,[x2,#8*4]
+ ldp x12,x13,[x2,#8*6]
+ cbz x27,Lsqr8x_tail_break
+
+ ldr x4,[x0,#-8*8]
+ adds x19,x19,x6
+ adcs x20,x20,x7
+ ldp x6,x7,[x1,#8*0]
+ adcs x21,x21,x8
+ adcs x22,x22,x9
+ ldp x8,x9,[x1,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x1,#8*4]
+ adcs x25,x25,x12
+ mov x27,#-8*8
+ adcs x26,x26,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ //adc x28,xzr,xzr // moved above
+ b Lsqr8x_tail
+
+.align 4
+Lsqr8x_tail_break:
+ ldr x4,[x29,#112] // pull n0
+ add x27,x2,#8*8 // end of current t[num] window
+
+ subs xzr,x30,#1 // "move" top-most carry to carry bit
+ adcs x14,x19,x6
+ adcs x15,x20,x7
+ ldp x19,x20,[x0,#8*0]
+ adcs x21,x21,x8
+ ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
+ adcs x22,x22,x9
+ ldp x8,x9,[x16,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x16,#8*4]
+ adcs x25,x25,x12
+ adcs x26,x26,x13
+ ldp x12,x13,[x16,#8*6]
+ add x1,x16,#8*8
+ adc x30,xzr,xzr // top-most carry
+ mul x28,x4,x19
+ stp x14,x15,[x2,#8*0]
+ stp x21,x22,[x2,#8*2]
+ ldp x21,x22,[x0,#8*2]
+ stp x23,x24,[x2,#8*4]
+ ldp x23,x24,[x0,#8*4]
+ cmp x27,x29 // did we hit the bottom?
+ stp x25,x26,[x2,#8*6]
+ mov x2,x0 // slide the window
+ ldp x25,x26,[x0,#8*6]
+ mov x27,#8
+ b.ne Lsqr8x_reduction
+
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ ldr x0,[x29,#96] // pull rp
+ add x2,x2,#8*8
+ subs x14,x19,x6
+ sbcs x15,x20,x7
+ sub x27,x5,#8*8
+ mov x3,x0 // x0 copy
+
+Lsqr8x_sub:
+ sbcs x16,x21,x8
+ ldp x6,x7,[x1,#8*0]
+ sbcs x17,x22,x9
+ stp x14,x15,[x0,#8*0]
+ sbcs x14,x23,x10
+ ldp x8,x9,[x1,#8*2]
+ sbcs x15,x24,x11
+ stp x16,x17,[x0,#8*2]
+ sbcs x16,x25,x12
+ ldp x10,x11,[x1,#8*4]
+ sbcs x17,x26,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ ldp x19,x20,[x2,#8*0]
+ sub x27,x27,#8*8
+ ldp x21,x22,[x2,#8*2]
+ ldp x23,x24,[x2,#8*4]
+ ldp x25,x26,[x2,#8*6]
+ add x2,x2,#8*8
+ stp x14,x15,[x0,#8*4]
+ sbcs x14,x19,x6
+ stp x16,x17,[x0,#8*6]
+ add x0,x0,#8*8
+ sbcs x15,x20,x7
+ cbnz x27,Lsqr8x_sub
+
+ sbcs x16,x21,x8
+ mov x2,sp
+ add x1,sp,x5
+ ldp x6,x7,[x3,#8*0]
+ sbcs x17,x22,x9
+ stp x14,x15,[x0,#8*0]
+ sbcs x14,x23,x10
+ ldp x8,x9,[x3,#8*2]
+ sbcs x15,x24,x11
+ stp x16,x17,[x0,#8*2]
+ sbcs x16,x25,x12
+ ldp x19,x20,[x1,#8*0]
+ sbcs x17,x26,x13
+ ldp x21,x22,[x1,#8*2]
+ sbcs xzr,x30,xzr // did it borrow?
+ ldr x30,[x29,#8] // pull return address
+ stp x14,x15,[x0,#8*4]
+ stp x16,x17,[x0,#8*6]
+
+ sub x27,x5,#8*4
+Lsqr4x_cond_copy:
+ sub x27,x27,#8*4
+ csel x14,x19,x6,lo
+ stp xzr,xzr,[x2,#8*0]
+ csel x15,x20,x7,lo
+ ldp x6,x7,[x3,#8*4]
+ ldp x19,x20,[x1,#8*4]
+ csel x16,x21,x8,lo
+ stp xzr,xzr,[x2,#8*2]
+ add x2,x2,#8*4
+ csel x17,x22,x9,lo
+ ldp x8,x9,[x3,#8*6]
+ ldp x21,x22,[x1,#8*6]
+ add x1,x1,#8*4
+ stp x14,x15,[x3,#8*0]
+ stp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ stp xzr,xzr,[x1,#8*0]
+ stp xzr,xzr,[x1,#8*2]
+ cbnz x27,Lsqr4x_cond_copy
+
+ csel x14,x19,x6,lo
+ stp xzr,xzr,[x2,#8*0]
+ csel x15,x20,x7,lo
+ stp xzr,xzr,[x2,#8*2]
+ csel x16,x21,x8,lo
+ csel x17,x22,x9,lo
+ stp x14,x15,[x3,#8*0]
+ stp x16,x17,[x3,#8*2]
+
+ b Lsqr8x_done
+
+.align 4
+Lsqr8x8_post_condition:
+ adc x28,xzr,xzr
+ ldr x30,[x29,#8] // pull return address
+ // x19-7,x28 hold result, x6-7 hold modulus
+ subs x6,x19,x6
+ ldr x1,[x29,#96] // pull rp
+ sbcs x7,x20,x7
+ stp xzr,xzr,[sp,#8*0]
+ sbcs x8,x21,x8
+ stp xzr,xzr,[sp,#8*2]
+ sbcs x9,x22,x9
+ stp xzr,xzr,[sp,#8*4]
+ sbcs x10,x23,x10
+ stp xzr,xzr,[sp,#8*6]
+ sbcs x11,x24,x11
+ stp xzr,xzr,[sp,#8*8]
+ sbcs x12,x25,x12
+ stp xzr,xzr,[sp,#8*10]
+ sbcs x13,x26,x13
+ stp xzr,xzr,[sp,#8*12]
+ sbcs x28,x28,xzr // did it borrow?
+ stp xzr,xzr,[sp,#8*14]
+
+ // x6-7 hold result-modulus
+ csel x6,x19,x6,lo
+ csel x7,x20,x7,lo
+ csel x8,x21,x8,lo
+ csel x9,x22,x9,lo
+ stp x6,x7,[x1,#8*0]
+ csel x10,x23,x10,lo
+ csel x11,x24,x11,lo
+ stp x8,x9,[x1,#8*2]
+ csel x12,x25,x12,lo
+ csel x13,x26,x13,lo
+ stp x10,x11,[x1,#8*4]
+ stp x12,x13,[x1,#8*6]
+
+Lsqr8x_done:
+ ldp x19,x20,[x29,#16]
+ mov sp,x29
+ ldp x21,x22,[x29,#32]
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldr x29,[sp],#128
+ // x30 is popped earlier
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.def __bn_mul4x_mont
+ .type 32
+.endef
+.align 5
+__bn_mul4x_mont:
+	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
+	// only from bn_mul_mont or __bn_sqr8x_mont which have already signed the
+	// return address.
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+ sub x26,sp,x5,lsl#3
+ lsl x5,x5,#3
+ ldr x4,[x4] // *n0
+ sub sp,x26,#8*4 // alloca
+
+ add x10,x2,x5
+ add x27,x1,x5
+ stp x0,x10,[x29,#96] // offload rp and &b[num]
+
+ ldr x24,[x2,#8*0] // b[0]
+ ldp x6,x7,[x1,#8*0] // a[0..3]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ mov x19,xzr
+ mov x20,xzr
+ mov x21,xzr
+ mov x22,xzr
+ ldp x14,x15,[x3,#8*0] // n[0..3]
+ ldp x16,x17,[x3,#8*2]
+ adds x3,x3,#8*4 // clear carry bit
+ mov x0,xzr
+ mov x28,#0
+ mov x26,sp
+
+Loop_mul4x_1st_reduction:
+ mul x10,x6,x24 // lo(a[0..3]*b[0])
+ adc x0,x0,xzr // modulo-scheduled
+ mul x11,x7,x24
+ add x28,x28,#8
+ mul x12,x8,x24
+ and x28,x28,#31
+ mul x13,x9,x24
+ adds x19,x19,x10
+ umulh x10,x6,x24 // hi(a[0..3]*b[0])
+ adcs x20,x20,x11
+ mul x25,x19,x4 // t[0]*n0
+ adcs x21,x21,x12
+ umulh x11,x7,x24
+ adcs x22,x22,x13
+ umulh x12,x8,x24
+ adc x23,xzr,xzr
+ umulh x13,x9,x24
+ ldr x24,[x2,x28] // next b[i] (or b[0])
+ adds x20,x20,x10
+ // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
+ str x25,[x26],#8 // put aside t[0]*n0 for tail processing
+ adcs x21,x21,x11
+ mul x11,x15,x25
+ adcs x22,x22,x12
+ mul x12,x16,x25
+ adc x23,x23,x13 // can't overflow
+ mul x13,x17,x25
+ // (*) adds xzr,x19,x10
+ subs xzr,x19,#1 // (*)
+ umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
+ adcs x19,x20,x11
+ umulh x11,x15,x25
+ adcs x20,x21,x12
+ umulh x12,x16,x25
+ adcs x21,x22,x13
+ umulh x13,x17,x25
+ adcs x22,x23,x0
+ adc x0,xzr,xzr
+ adds x19,x19,x10
+ sub x10,x27,x1
+ adcs x20,x20,x11
+ adcs x21,x21,x12
+ adcs x22,x22,x13
+ //adc x0,x0,xzr
+ cbnz x28,Loop_mul4x_1st_reduction
+
+ cbz x10,Lmul4x4_post_condition
+
+ ldp x6,x7,[x1,#8*0] // a[4..7]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ ldr x25,[sp] // a[0]*n0
+ ldp x14,x15,[x3,#8*0] // n[4..7]
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+
+Loop_mul4x_1st_tail:
+ mul x10,x6,x24 // lo(a[4..7]*b[i])
+ adc x0,x0,xzr // modulo-scheduled
+ mul x11,x7,x24
+ add x28,x28,#8
+ mul x12,x8,x24
+ and x28,x28,#31
+ mul x13,x9,x24
+ adds x19,x19,x10
+ umulh x10,x6,x24 // hi(a[4..7]*b[i])
+ adcs x20,x20,x11
+ umulh x11,x7,x24
+ adcs x21,x21,x12
+ umulh x12,x8,x24
+ adcs x22,x22,x13
+ umulh x13,x9,x24
+ adc x23,xzr,xzr
+ ldr x24,[x2,x28] // next b[i] (or b[0])
+ adds x20,x20,x10
+ mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
+ adcs x21,x21,x11
+ mul x11,x15,x25
+ adcs x22,x22,x12
+ mul x12,x16,x25
+ adc x23,x23,x13 // can't overflow
+ mul x13,x17,x25
+ adds x19,x19,x10
+ umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
+ adcs x20,x20,x11
+ umulh x11,x15,x25
+ adcs x21,x21,x12
+ umulh x12,x16,x25
+ adcs x22,x22,x13
+ adcs x23,x23,x0
+ umulh x13,x17,x25
+ adc x0,xzr,xzr
+ ldr x25,[sp,x28] // next t[0]*n0
+ str x19,[x26],#8 // result!!!
+ adds x19,x20,x10
+ sub x10,x27,x1 // done yet?
+ adcs x20,x21,x11
+ adcs x21,x22,x12
+ adcs x22,x23,x13
+ //adc x0,x0,xzr
+ cbnz x28,Loop_mul4x_1st_tail
+
+ sub x11,x27,x5 // rewinded x1
+ cbz x10,Lmul4x_proceed
+
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ ldp x14,x15,[x3,#8*0]
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ b Loop_mul4x_1st_tail
+
+.align 5
+Lmul4x_proceed:
+ ldr x24,[x2,#8*4]! // *++b
+ adc x30,x0,xzr
+ ldp x6,x7,[x11,#8*0] // a[0..3]
+ sub x3,x3,x5 // rewind np
+ ldp x8,x9,[x11,#8*2]
+ add x1,x11,#8*4
+
+ stp x19,x20,[x26,#8*0] // result!!!
+ ldp x19,x20,[sp,#8*4] // t[0..3]
+ stp x21,x22,[x26,#8*2] // result!!!
+ ldp x21,x22,[sp,#8*6]
+
+ ldp x14,x15,[x3,#8*0] // n[0..3]
+ mov x26,sp
+ ldp x16,x17,[x3,#8*2]
+ adds x3,x3,#8*4 // clear carry bit
+ mov x0,xzr
+
+.align 4
+Loop_mul4x_reduction:
+ mul x10,x6,x24 // lo(a[0..3]*b[4])
+ adc x0,x0,xzr // modulo-scheduled
+ mul x11,x7,x24
+ add x28,x28,#8
+ mul x12,x8,x24
+ and x28,x28,#31
+ mul x13,x9,x24
+ adds x19,x19,x10
+ umulh x10,x6,x24 // hi(a[0..3]*b[4])
+ adcs x20,x20,x11
+ mul x25,x19,x4 // t[0]*n0
+ adcs x21,x21,x12
+ umulh x11,x7,x24
+ adcs x22,x22,x13
+ umulh x12,x8,x24
+ adc x23,xzr,xzr
+ umulh x13,x9,x24
+ ldr x24,[x2,x28] // next b[i]
+ adds x20,x20,x10
+ // (*) mul x10,x14,x25
+ str x25,[x26],#8 // put aside t[0]*n0 for tail processing
+ adcs x21,x21,x11
+	mul	x11,x15,x25	// lo(n[0..3]*t[0]*n0)
+ adcs x22,x22,x12
+ mul x12,x16,x25
+ adc x23,x23,x13 // can't overflow
+ mul x13,x17,x25
+ // (*) adds xzr,x19,x10
+ subs xzr,x19,#1 // (*)
+	umulh	x10,x14,x25	// hi(n[0..3]*t[0]*n0)
+ adcs x19,x20,x11
+ umulh x11,x15,x25
+ adcs x20,x21,x12
+ umulh x12,x16,x25
+ adcs x21,x22,x13
+ umulh x13,x17,x25
+ adcs x22,x23,x0
+ adc x0,xzr,xzr
+ adds x19,x19,x10
+ adcs x20,x20,x11
+ adcs x21,x21,x12
+ adcs x22,x22,x13
+ //adc x0,x0,xzr
+ cbnz x28,Loop_mul4x_reduction
+
+ adc x0,x0,xzr
+ ldp x10,x11,[x26,#8*4] // t[4..7]
+ ldp x12,x13,[x26,#8*6]
+ ldp x6,x7,[x1,#8*0] // a[4..7]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ adds x19,x19,x10
+ adcs x20,x20,x11
+ adcs x21,x21,x12
+ adcs x22,x22,x13
+ //adc x0,x0,xzr
+
+ ldr x25,[sp] // t[0]*n0
+ ldp x14,x15,[x3,#8*0] // n[4..7]
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+
+.align 4
+Loop_mul4x_tail:
+ mul x10,x6,x24 // lo(a[4..7]*b[4])
+ adc x0,x0,xzr // modulo-scheduled
+ mul x11,x7,x24
+ add x28,x28,#8
+ mul x12,x8,x24
+ and x28,x28,#31
+ mul x13,x9,x24
+ adds x19,x19,x10
+ umulh x10,x6,x24 // hi(a[4..7]*b[4])
+ adcs x20,x20,x11
+ umulh x11,x7,x24
+ adcs x21,x21,x12
+ umulh x12,x8,x24
+ adcs x22,x22,x13
+ umulh x13,x9,x24
+ adc x23,xzr,xzr
+ ldr x24,[x2,x28] // next b[i]
+ adds x20,x20,x10
+ mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
+ adcs x21,x21,x11
+ mul x11,x15,x25
+ adcs x22,x22,x12
+ mul x12,x16,x25
+ adc x23,x23,x13 // can't overflow
+ mul x13,x17,x25
+ adds x19,x19,x10
+ umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
+ adcs x20,x20,x11
+ umulh x11,x15,x25
+ adcs x21,x21,x12
+ umulh x12,x16,x25
+ adcs x22,x22,x13
+ umulh x13,x17,x25
+ adcs x23,x23,x0
+ ldr x25,[sp,x28] // next a[0]*n0
+ adc x0,xzr,xzr
+ str x19,[x26],#8 // result!!!
+ adds x19,x20,x10
+ sub x10,x27,x1 // done yet?
+ adcs x20,x21,x11
+ adcs x21,x22,x12
+ adcs x22,x23,x13
+ //adc x0,x0,xzr
+ cbnz x28,Loop_mul4x_tail
+
+ sub x11,x3,x5 // rewinded np?
+ adc x0,x0,xzr
+ cbz x10,Loop_mul4x_break
+
+ ldp x10,x11,[x26,#8*4]
+ ldp x12,x13,[x26,#8*6]
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ adds x19,x19,x10
+ adcs x20,x20,x11
+ adcs x21,x21,x12
+ adcs x22,x22,x13
+ //adc x0,x0,xzr
+ ldp x14,x15,[x3,#8*0]
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ b Loop_mul4x_tail
+
+.align 4
+Loop_mul4x_break:
+ ldp x12,x13,[x29,#96] // pull rp and &b[num]
+ adds x19,x19,x30
+ add x2,x2,#8*4 // bp++
+ adcs x20,x20,xzr
+ sub x1,x1,x5 // rewind ap
+ adcs x21,x21,xzr
+ stp x19,x20,[x26,#8*0] // result!!!
+ adcs x22,x22,xzr
+ ldp x19,x20,[sp,#8*4] // t[0..3]
+ adc x30,x0,xzr
+ stp x21,x22,[x26,#8*2] // result!!!
+ cmp x2,x13 // done yet?
+ ldp x21,x22,[sp,#8*6]
+ ldp x14,x15,[x11,#8*0] // n[0..3]
+ ldp x16,x17,[x11,#8*2]
+ add x3,x11,#8*4
+ b.eq Lmul4x_post
+
+ ldr x24,[x2]
+ ldp x6,x7,[x1,#8*0] // a[0..3]
+ ldp x8,x9,[x1,#8*2]
+ adds x1,x1,#8*4 // clear carry bit
+ mov x0,xzr
+ mov x26,sp
+ b Loop_mul4x_reduction
+
+.align 4
+Lmul4x_post:
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ mov x0,x12
+ mov x27,x12 // x0 copy
+ subs x10,x19,x14
+ add x26,sp,#8*8
+ sbcs x11,x20,x15
+ sub x28,x5,#8*4
+
+Lmul4x_sub:
+ sbcs x12,x21,x16
+ ldp x14,x15,[x3,#8*0]
+ sub x28,x28,#8*4
+ ldp x19,x20,[x26,#8*0]
+ sbcs x13,x22,x17
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ ldp x21,x22,[x26,#8*2]
+ add x26,x26,#8*4
+ stp x10,x11,[x0,#8*0]
+ sbcs x10,x19,x14
+ stp x12,x13,[x0,#8*2]
+ add x0,x0,#8*4
+ sbcs x11,x20,x15
+ cbnz x28,Lmul4x_sub
+
+ sbcs x12,x21,x16
+ mov x26,sp
+ add x1,sp,#8*4
+ ldp x6,x7,[x27,#8*0]
+ sbcs x13,x22,x17
+ stp x10,x11,[x0,#8*0]
+ ldp x8,x9,[x27,#8*2]
+ stp x12,x13,[x0,#8*2]
+ ldp x19,x20,[x1,#8*0]
+ ldp x21,x22,[x1,#8*2]
+ sbcs xzr,x30,xzr // did it borrow?
+ ldr x30,[x29,#8] // pull return address
+
+ sub x28,x5,#8*4
+Lmul4x_cond_copy:
+ sub x28,x28,#8*4
+ csel x10,x19,x6,lo
+ stp xzr,xzr,[x26,#8*0]
+ csel x11,x20,x7,lo
+ ldp x6,x7,[x27,#8*4]
+ ldp x19,x20,[x1,#8*4]
+ csel x12,x21,x8,lo
+ stp xzr,xzr,[x26,#8*2]
+ add x26,x26,#8*4
+ csel x13,x22,x9,lo
+ ldp x8,x9,[x27,#8*6]
+ ldp x21,x22,[x1,#8*6]
+ add x1,x1,#8*4
+ stp x10,x11,[x27,#8*0]
+ stp x12,x13,[x27,#8*2]
+ add x27,x27,#8*4
+ cbnz x28,Lmul4x_cond_copy
+
+ csel x10,x19,x6,lo
+ stp xzr,xzr,[x26,#8*0]
+ csel x11,x20,x7,lo
+ stp xzr,xzr,[x26,#8*2]
+ csel x12,x21,x8,lo
+ stp xzr,xzr,[x26,#8*3]
+ csel x13,x22,x9,lo
+ stp xzr,xzr,[x26,#8*4]
+ stp x10,x11,[x27,#8*0]
+ stp x12,x13,[x27,#8*2]
+
+ b Lmul4x_done
+
+.align 4
+Lmul4x4_post_condition:
+ adc x0,x0,xzr
+ ldr x1,[x29,#96] // pull rp
+ // x19-3,x0 hold result, x14-7 hold modulus
+ subs x6,x19,x14
+ ldr x30,[x29,#8] // pull return address
+ sbcs x7,x20,x15
+ stp xzr,xzr,[sp,#8*0]
+ sbcs x8,x21,x16
+ stp xzr,xzr,[sp,#8*2]
+ sbcs x9,x22,x17
+ stp xzr,xzr,[sp,#8*4]
+ sbcs xzr,x0,xzr // did it borrow?
+ stp xzr,xzr,[sp,#8*6]
+
+ // x6-3 hold result-modulus
+ csel x6,x19,x6,lo
+ csel x7,x20,x7,lo
+ csel x8,x21,x8,lo
+ csel x9,x22,x9,lo
+ stp x6,x7,[x1,#8*0]
+ stp x8,x9,[x1,#8*2]
+
+Lmul4x_done:
+ ldp x19,x20,[x29,#16]
+ mov sp,x29
+ ldp x21,x22,[x29,#32]
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldr x29,[sp],#128
+ // x30 is popped earlier
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 4
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/bn-586-apple.S b/gen/bcm/bn-586-apple.S
new file mode 100644
index 0000000..93513d0
--- /dev/null
+++ b/gen/bcm/bn-586-apple.S
@@ -0,0 +1,987 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl _bn_mul_add_words
+.private_extern _bn_mul_add_words
+.align 4
+_bn_mul_add_words:
+L_bn_mul_add_words_begin:
+ call L000PIC_me_up
+L000PIC_me_up:
+ popl %eax
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L000PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc L001maw_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+ movd 16(%esp),%mm0
+ pxor %mm1,%mm1
+ jmp L002maw_sse2_entry
+.align 4,0x90
+L003maw_sse2_unrolled:
+ movd (%eax),%mm3
+ paddq %mm3,%mm1
+ movd (%edx),%mm2
+ pmuludq %mm0,%mm2
+ movd 4(%edx),%mm4
+ pmuludq %mm0,%mm4
+ movd 8(%edx),%mm6
+ pmuludq %mm0,%mm6
+ movd 12(%edx),%mm7
+ pmuludq %mm0,%mm7
+ paddq %mm2,%mm1
+ movd 4(%eax),%mm3
+ paddq %mm4,%mm3
+ movd 8(%eax),%mm5
+ paddq %mm6,%mm5
+ movd 12(%eax),%mm4
+ paddq %mm4,%mm7
+ movd %mm1,(%eax)
+ movd 16(%edx),%mm2
+ pmuludq %mm0,%mm2
+ psrlq $32,%mm1
+ movd 20(%edx),%mm4
+ pmuludq %mm0,%mm4
+ paddq %mm3,%mm1
+ movd 24(%edx),%mm6
+ pmuludq %mm0,%mm6
+ movd %mm1,4(%eax)
+ psrlq $32,%mm1
+ movd 28(%edx),%mm3
+ addl $32,%edx
+ pmuludq %mm0,%mm3
+ paddq %mm5,%mm1
+ movd 16(%eax),%mm5
+ paddq %mm5,%mm2
+ movd %mm1,8(%eax)
+ psrlq $32,%mm1
+ paddq %mm7,%mm1
+ movd 20(%eax),%mm5
+ paddq %mm5,%mm4
+ movd %mm1,12(%eax)
+ psrlq $32,%mm1
+ paddq %mm2,%mm1
+ movd 24(%eax),%mm5
+ paddq %mm5,%mm6
+ movd %mm1,16(%eax)
+ psrlq $32,%mm1
+ paddq %mm4,%mm1
+ movd 28(%eax),%mm5
+ paddq %mm5,%mm3
+ movd %mm1,20(%eax)
+ psrlq $32,%mm1
+ paddq %mm6,%mm1
+ movd %mm1,24(%eax)
+ psrlq $32,%mm1
+ paddq %mm3,%mm1
+ movd %mm1,28(%eax)
+ leal 32(%eax),%eax
+ psrlq $32,%mm1
+ subl $8,%ecx
+ jz L004maw_sse2_exit
+L002maw_sse2_entry:
+ testl $4294967288,%ecx
+ jnz L003maw_sse2_unrolled
+.align 2,0x90
+L005maw_sse2_loop:
+ movd (%edx),%mm2
+ movd (%eax),%mm3
+ pmuludq %mm0,%mm2
+ leal 4(%edx),%edx
+ paddq %mm3,%mm1
+ paddq %mm2,%mm1
+ movd %mm1,(%eax)
+ subl $1,%ecx
+ psrlq $32,%mm1
+ leal 4(%eax),%eax
+ jnz L005maw_sse2_loop
+L004maw_sse2_exit:
+ movd %mm1,%eax
+ emms
+ ret
+.align 4,0x90
+L001maw_non_sse2:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ xorl %esi,%esi
+ movl 20(%esp),%edi
+ movl 28(%esp),%ecx
+ movl 24(%esp),%ebx
+ andl $4294967288,%ecx
+ movl 32(%esp),%ebp
+ pushl %ecx
+ jz L006maw_finish
+.align 4,0x90
+L007maw_loop:
+ # Round 0
+ movl (%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl (%edi),%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+ # Round 4
+ movl 4(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 4(%edi),%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+ # Round 8
+ movl 8(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 8(%edi),%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+ # Round 12
+ movl 12(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 12(%edi),%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+ # Round 16
+ movl 16(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 16(%edi),%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+ # Round 20
+ movl 20(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 20(%edi),%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+ # Round 24
+ movl 24(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 24(%edi),%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+ # Round 28
+ movl 28(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 28(%edi),%eax
+ adcl $0,%edx
+ movl %eax,28(%edi)
+ movl %edx,%esi
+
+ subl $8,%ecx
+ leal 32(%ebx),%ebx
+ leal 32(%edi),%edi
+ jnz L007maw_loop
+L006maw_finish:
+ movl 32(%esp),%ecx
+ andl $7,%ecx
+ jnz L008maw_finish2
+ jmp L009maw_end
+L008maw_finish2:
+ # Tail Round 0
+ movl (%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl (%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,(%edi)
+ movl %edx,%esi
+ jz L009maw_end
+ # Tail Round 1
+ movl 4(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 4(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+ jz L009maw_end
+ # Tail Round 2
+ movl 8(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 8(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+ jz L009maw_end
+ # Tail Round 3
+ movl 12(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 12(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+ jz L009maw_end
+ # Tail Round 4
+ movl 16(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 16(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+ jz L009maw_end
+ # Tail Round 5
+ movl 20(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 20(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+ jz L009maw_end
+ # Tail Round 6
+ movl 24(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 24(%edi),%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+L009maw_end:
+ movl %esi,%eax
+ popl %ecx
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _bn_mul_words
+.private_extern _bn_mul_words
+.align 4
+_bn_mul_words:
+L_bn_mul_words_begin:
+ call L010PIC_me_up
+L010PIC_me_up:
+ popl %eax
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L010PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc L011mw_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+ movd 16(%esp),%mm0
+ pxor %mm1,%mm1
+.align 4,0x90
+L012mw_sse2_loop:
+ movd (%edx),%mm2
+ pmuludq %mm0,%mm2
+ leal 4(%edx),%edx
+ paddq %mm2,%mm1
+ movd %mm1,(%eax)
+ subl $1,%ecx
+ psrlq $32,%mm1
+ leal 4(%eax),%eax
+ jnz L012mw_sse2_loop
+ movd %mm1,%eax
+ emms
+ ret
+.align 4,0x90
+L011mw_non_sse2:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ xorl %esi,%esi
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebx
+ movl 28(%esp),%ebp
+ movl 32(%esp),%ecx
+ andl $4294967288,%ebp
+ jz L013mw_finish
+L014mw_loop:
+ # Round 0
+ movl (%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+ # Round 4
+ movl 4(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+ # Round 8
+ movl 8(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+ # Round 12
+ movl 12(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+ # Round 16
+ movl 16(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+ # Round 20
+ movl 20(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+ # Round 24
+ movl 24(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+ # Round 28
+ movl 28(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,28(%edi)
+ movl %edx,%esi
+
+ addl $32,%ebx
+ addl $32,%edi
+ subl $8,%ebp
+ jz L013mw_finish
+ jmp L014mw_loop
+L013mw_finish:
+ movl 28(%esp),%ebp
+ andl $7,%ebp
+ jnz L015mw_finish2
+ jmp L016mw_end
+L015mw_finish2:
+ # Tail Round 0
+ movl (%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz L016mw_end
+ # Tail Round 1
+ movl 4(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz L016mw_end
+ # Tail Round 2
+ movl 8(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz L016mw_end
+ # Tail Round 3
+ movl 12(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz L016mw_end
+ # Tail Round 4
+ movl 16(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz L016mw_end
+ # Tail Round 5
+ movl 20(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz L016mw_end
+ # Tail Round 6
+ movl 24(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+L016mw_end:
+ movl %esi,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _bn_sqr_words
+.private_extern _bn_sqr_words
+.align 4
+_bn_sqr_words:
+L_bn_sqr_words_begin:
+ call L017PIC_me_up
+L017PIC_me_up:
+ popl %eax
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L017PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc L018sqr_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+.align 4,0x90
+L019sqr_sse2_loop:
+ movd (%edx),%mm0
+ pmuludq %mm0,%mm0
+ leal 4(%edx),%edx
+ movq %mm0,(%eax)
+ subl $1,%ecx
+ leal 8(%eax),%eax
+ jnz L019sqr_sse2_loop
+ emms
+ ret
+.align 4,0x90
+L018sqr_non_sse2:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%ebx
+ andl $4294967288,%ebx
+ jz L020sw_finish
+L021sw_loop:
+ # Round 0
+ movl (%edi),%eax
+ mull %eax
+ movl %eax,(%esi)
+ movl %edx,4(%esi)
+ # Round 4
+ movl 4(%edi),%eax
+ mull %eax
+ movl %eax,8(%esi)
+ movl %edx,12(%esi)
+ # Round 8
+ movl 8(%edi),%eax
+ mull %eax
+ movl %eax,16(%esi)
+ movl %edx,20(%esi)
+ # Round 12
+ movl 12(%edi),%eax
+ mull %eax
+ movl %eax,24(%esi)
+ movl %edx,28(%esi)
+ # Round 16
+ movl 16(%edi),%eax
+ mull %eax
+ movl %eax,32(%esi)
+ movl %edx,36(%esi)
+ # Round 20
+ movl 20(%edi),%eax
+ mull %eax
+ movl %eax,40(%esi)
+ movl %edx,44(%esi)
+ # Round 24
+ movl 24(%edi),%eax
+ mull %eax
+ movl %eax,48(%esi)
+ movl %edx,52(%esi)
+ # Round 28
+ movl 28(%edi),%eax
+ mull %eax
+ movl %eax,56(%esi)
+ movl %edx,60(%esi)
+
+ addl $32,%edi
+ addl $64,%esi
+ subl $8,%ebx
+ jnz L021sw_loop
+L020sw_finish:
+ movl 28(%esp),%ebx
+ andl $7,%ebx
+ jz L022sw_end
+ # Tail Round 0
+ movl (%edi),%eax
+ mull %eax
+ movl %eax,(%esi)
+ decl %ebx
+ movl %edx,4(%esi)
+ jz L022sw_end
+ # Tail Round 1
+ movl 4(%edi),%eax
+ mull %eax
+ movl %eax,8(%esi)
+ decl %ebx
+ movl %edx,12(%esi)
+ jz L022sw_end
+ # Tail Round 2
+ movl 8(%edi),%eax
+ mull %eax
+ movl %eax,16(%esi)
+ decl %ebx
+ movl %edx,20(%esi)
+ jz L022sw_end
+ # Tail Round 3
+ movl 12(%edi),%eax
+ mull %eax
+ movl %eax,24(%esi)
+ decl %ebx
+ movl %edx,28(%esi)
+ jz L022sw_end
+ # Tail Round 4
+ movl 16(%edi),%eax
+ mull %eax
+ movl %eax,32(%esi)
+ decl %ebx
+ movl %edx,36(%esi)
+ jz L022sw_end
+ # Tail Round 5
+ movl 20(%edi),%eax
+ mull %eax
+ movl %eax,40(%esi)
+ decl %ebx
+ movl %edx,44(%esi)
+ jz L022sw_end
+ # Tail Round 6
+ movl 24(%edi),%eax
+ mull %eax
+ movl %eax,48(%esi)
+ movl %edx,52(%esi)
+L022sw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _bn_div_words
+.private_extern _bn_div_words
+.align 4
+_bn_div_words:
+L_bn_div_words_begin:
+ movl 4(%esp),%edx
+ movl 8(%esp),%eax
+ movl 12(%esp),%ecx
+ divl %ecx
+ ret
+.globl _bn_add_words
+.private_extern _bn_add_words
+.align 4
+_bn_add_words:
+L_bn_add_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%edi
+ movl 32(%esp),%ebp
+ xorl %eax,%eax
+ andl $4294967288,%ebp
+ jz L023aw_finish
+L024aw_loop:
+ # Round 0
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ # Round 1
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+ # Round 2
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+ # Round 3
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+ # Round 4
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+ # Round 5
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+ # Round 6
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+ # Round 7
+ movl 28(%esi),%ecx
+ movl 28(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%esi
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz L024aw_loop
+L023aw_finish:
+ movl 32(%esp),%ebp
+ andl $7,%ebp
+ jz L025aw_end
+ # Tail Round 0
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,(%ebx)
+ jz L025aw_end
+ # Tail Round 1
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,4(%ebx)
+ jz L025aw_end
+ # Tail Round 2
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,8(%ebx)
+ jz L025aw_end
+ # Tail Round 3
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,12(%ebx)
+ jz L025aw_end
+ # Tail Round 4
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,16(%ebx)
+ jz L025aw_end
+ # Tail Round 5
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,20(%ebx)
+ jz L025aw_end
+ # Tail Round 6
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+L025aw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _bn_sub_words
+.private_extern _bn_sub_words
+.align 4
+_bn_sub_words:
+L_bn_sub_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%edi
+ movl 32(%esp),%ebp
+ xorl %eax,%eax
+ andl $4294967288,%ebp
+ jz L026aw_finish
+L027aw_loop:
+ # Round 0
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ # Round 1
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+ # Round 2
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+ # Round 3
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+ # Round 4
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+ # Round 5
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+ # Round 6
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+ # Round 7
+ movl 28(%esi),%ecx
+ movl 28(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%esi
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz L027aw_loop
+L026aw_finish:
+ movl 32(%esp),%ebp
+ andl $7,%ebp
+ jz L028aw_end
+ # Tail Round 0
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,(%ebx)
+ jz L028aw_end
+ # Tail Round 1
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,4(%ebx)
+ jz L028aw_end
+ # Tail Round 2
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,8(%ebx)
+ jz L028aw_end
+ # Tail Round 3
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,12(%ebx)
+ jz L028aw_end
+ # Tail Round 4
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,16(%ebx)
+ jz L028aw_end
+ # Tail Round 5
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,20(%ebx)
+ jz L028aw_end
+ # Tail Round 6
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+L028aw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:
+.indirect_symbol _OPENSSL_ia32cap_P
+.long 0
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/bn-586-linux.S b/gen/bcm/bn-586-linux.S
new file mode 100644
index 0000000..311f22c
--- /dev/null
+++ b/gen/bcm/bn-586-linux.S
@@ -0,0 +1,995 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl bn_mul_add_words
+.hidden bn_mul_add_words
+.type bn_mul_add_words,@function
+.align 16
+bn_mul_add_words:
+.L_bn_mul_add_words_begin:
+ call .L000PIC_me_up
+.L000PIC_me_up:
+ popl %eax
+ leal OPENSSL_ia32cap_P-.L000PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc .L001maw_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+ movd 16(%esp),%mm0
+ pxor %mm1,%mm1
+ jmp .L002maw_sse2_entry
+.align 16
+.L003maw_sse2_unrolled:
+ movd (%eax),%mm3
+ paddq %mm3,%mm1
+ movd (%edx),%mm2
+ pmuludq %mm0,%mm2
+ movd 4(%edx),%mm4
+ pmuludq %mm0,%mm4
+ movd 8(%edx),%mm6
+ pmuludq %mm0,%mm6
+ movd 12(%edx),%mm7
+ pmuludq %mm0,%mm7
+ paddq %mm2,%mm1
+ movd 4(%eax),%mm3
+ paddq %mm4,%mm3
+ movd 8(%eax),%mm5
+ paddq %mm6,%mm5
+ movd 12(%eax),%mm4
+ paddq %mm4,%mm7
+ movd %mm1,(%eax)
+ movd 16(%edx),%mm2
+ pmuludq %mm0,%mm2
+ psrlq $32,%mm1
+ movd 20(%edx),%mm4
+ pmuludq %mm0,%mm4
+ paddq %mm3,%mm1
+ movd 24(%edx),%mm6
+ pmuludq %mm0,%mm6
+ movd %mm1,4(%eax)
+ psrlq $32,%mm1
+ movd 28(%edx),%mm3
+ addl $32,%edx
+ pmuludq %mm0,%mm3
+ paddq %mm5,%mm1
+ movd 16(%eax),%mm5
+ paddq %mm5,%mm2
+ movd %mm1,8(%eax)
+ psrlq $32,%mm1
+ paddq %mm7,%mm1
+ movd 20(%eax),%mm5
+ paddq %mm5,%mm4
+ movd %mm1,12(%eax)
+ psrlq $32,%mm1
+ paddq %mm2,%mm1
+ movd 24(%eax),%mm5
+ paddq %mm5,%mm6
+ movd %mm1,16(%eax)
+ psrlq $32,%mm1
+ paddq %mm4,%mm1
+ movd 28(%eax),%mm5
+ paddq %mm5,%mm3
+ movd %mm1,20(%eax)
+ psrlq $32,%mm1
+ paddq %mm6,%mm1
+ movd %mm1,24(%eax)
+ psrlq $32,%mm1
+ paddq %mm3,%mm1
+ movd %mm1,28(%eax)
+ leal 32(%eax),%eax
+ psrlq $32,%mm1
+ subl $8,%ecx
+ jz .L004maw_sse2_exit
+.L002maw_sse2_entry:
+ testl $4294967288,%ecx
+ jnz .L003maw_sse2_unrolled
+.align 4
+.L005maw_sse2_loop:
+ movd (%edx),%mm2
+ movd (%eax),%mm3
+ pmuludq %mm0,%mm2
+ leal 4(%edx),%edx
+ paddq %mm3,%mm1
+ paddq %mm2,%mm1
+ movd %mm1,(%eax)
+ subl $1,%ecx
+ psrlq $32,%mm1
+ leal 4(%eax),%eax
+ jnz .L005maw_sse2_loop
+.L004maw_sse2_exit:
+ movd %mm1,%eax
+ emms
+ ret
+.align 16
+.L001maw_non_sse2:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ xorl %esi,%esi
+ movl 20(%esp),%edi
+ movl 28(%esp),%ecx
+ movl 24(%esp),%ebx
+ andl $4294967288,%ecx
+ movl 32(%esp),%ebp
+ pushl %ecx
+ jz .L006maw_finish
+.align 16
+.L007maw_loop:
+
+ movl (%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl (%edi),%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+
+ movl 4(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 4(%edi),%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+
+ movl 8(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 8(%edi),%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+
+ movl 12(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 12(%edi),%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+
+ movl 16(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 16(%edi),%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+
+ movl 20(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 20(%edi),%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+
+ movl 24(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 24(%edi),%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+
+ movl 28(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 28(%edi),%eax
+ adcl $0,%edx
+ movl %eax,28(%edi)
+ movl %edx,%esi
+
+ subl $8,%ecx
+ leal 32(%ebx),%ebx
+ leal 32(%edi),%edi
+ jnz .L007maw_loop
+.L006maw_finish:
+ movl 32(%esp),%ecx
+ andl $7,%ecx
+ jnz .L008maw_finish2
+ jmp .L009maw_end
+.L008maw_finish2:
+
+ movl (%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl (%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,(%edi)
+ movl %edx,%esi
+ jz .L009maw_end
+
+ movl 4(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 4(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+ jz .L009maw_end
+
+ movl 8(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 8(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+ jz .L009maw_end
+
+ movl 12(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 12(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+ jz .L009maw_end
+
+ movl 16(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 16(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+ jz .L009maw_end
+
+ movl 20(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 20(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+ jz .L009maw_end
+
+ movl 24(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 24(%edi),%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+.L009maw_end:
+ movl %esi,%eax
+ popl %ecx
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_mul_add_words,.-.L_bn_mul_add_words_begin
+.globl bn_mul_words
+.hidden bn_mul_words
+.type bn_mul_words,@function
+.align 16
+bn_mul_words:
+.L_bn_mul_words_begin:
+ call .L010PIC_me_up
+.L010PIC_me_up:
+ popl %eax
+ leal OPENSSL_ia32cap_P-.L010PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc .L011mw_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+ movd 16(%esp),%mm0
+ pxor %mm1,%mm1
+.align 16
+.L012mw_sse2_loop:
+ movd (%edx),%mm2
+ pmuludq %mm0,%mm2
+ leal 4(%edx),%edx
+ paddq %mm2,%mm1
+ movd %mm1,(%eax)
+ subl $1,%ecx
+ psrlq $32,%mm1
+ leal 4(%eax),%eax
+ jnz .L012mw_sse2_loop
+ movd %mm1,%eax
+ emms
+ ret
+.align 16
+.L011mw_non_sse2:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ xorl %esi,%esi
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebx
+ movl 28(%esp),%ebp
+ movl 32(%esp),%ecx
+ andl $4294967288,%ebp
+ jz .L013mw_finish
+.L014mw_loop:
+
+ movl (%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+
+ movl 4(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+
+ movl 8(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+
+ movl 12(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+
+ movl 16(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+
+ movl 20(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+
+ movl 24(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+
+ movl 28(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,28(%edi)
+ movl %edx,%esi
+
+ addl $32,%ebx
+ addl $32,%edi
+ subl $8,%ebp
+ jz .L013mw_finish
+ jmp .L014mw_loop
+.L013mw_finish:
+ movl 28(%esp),%ebp
+ andl $7,%ebp
+ jnz .L015mw_finish2
+ jmp .L016mw_end
+.L015mw_finish2:
+
+ movl (%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L016mw_end
+
+ movl 4(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L016mw_end
+
+ movl 8(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L016mw_end
+
+ movl 12(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L016mw_end
+
+ movl 16(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L016mw_end
+
+ movl 20(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L016mw_end
+
+ movl 24(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+.L016mw_end:
+ movl %esi,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_mul_words,.-.L_bn_mul_words_begin
+.globl bn_sqr_words
+.hidden bn_sqr_words
+.type bn_sqr_words,@function
+.align 16
+bn_sqr_words:
+.L_bn_sqr_words_begin:
+ call .L017PIC_me_up
+.L017PIC_me_up:
+ popl %eax
+ leal OPENSSL_ia32cap_P-.L017PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc .L018sqr_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+.align 16
+.L019sqr_sse2_loop:
+ movd (%edx),%mm0
+ pmuludq %mm0,%mm0
+ leal 4(%edx),%edx
+ movq %mm0,(%eax)
+ subl $1,%ecx
+ leal 8(%eax),%eax
+ jnz .L019sqr_sse2_loop
+ emms
+ ret
+.align 16
+.L018sqr_non_sse2:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%ebx
+ andl $4294967288,%ebx
+ jz .L020sw_finish
+.L021sw_loop:
+
+ movl (%edi),%eax
+ mull %eax
+ movl %eax,(%esi)
+ movl %edx,4(%esi)
+
+ movl 4(%edi),%eax
+ mull %eax
+ movl %eax,8(%esi)
+ movl %edx,12(%esi)
+
+ movl 8(%edi),%eax
+ mull %eax
+ movl %eax,16(%esi)
+ movl %edx,20(%esi)
+
+ movl 12(%edi),%eax
+ mull %eax
+ movl %eax,24(%esi)
+ movl %edx,28(%esi)
+
+ movl 16(%edi),%eax
+ mull %eax
+ movl %eax,32(%esi)
+ movl %edx,36(%esi)
+
+ movl 20(%edi),%eax
+ mull %eax
+ movl %eax,40(%esi)
+ movl %edx,44(%esi)
+
+ movl 24(%edi),%eax
+ mull %eax
+ movl %eax,48(%esi)
+ movl %edx,52(%esi)
+
+ movl 28(%edi),%eax
+ mull %eax
+ movl %eax,56(%esi)
+ movl %edx,60(%esi)
+
+ addl $32,%edi
+ addl $64,%esi
+ subl $8,%ebx
+ jnz .L021sw_loop
+.L020sw_finish:
+ movl 28(%esp),%ebx
+ andl $7,%ebx
+ jz .L022sw_end
+
+ movl (%edi),%eax
+ mull %eax
+ movl %eax,(%esi)
+ decl %ebx
+ movl %edx,4(%esi)
+ jz .L022sw_end
+
+ movl 4(%edi),%eax
+ mull %eax
+ movl %eax,8(%esi)
+ decl %ebx
+ movl %edx,12(%esi)
+ jz .L022sw_end
+
+ movl 8(%edi),%eax
+ mull %eax
+ movl %eax,16(%esi)
+ decl %ebx
+ movl %edx,20(%esi)
+ jz .L022sw_end
+
+ movl 12(%edi),%eax
+ mull %eax
+ movl %eax,24(%esi)
+ decl %ebx
+ movl %edx,28(%esi)
+ jz .L022sw_end
+
+ movl 16(%edi),%eax
+ mull %eax
+ movl %eax,32(%esi)
+ decl %ebx
+ movl %edx,36(%esi)
+ jz .L022sw_end
+
+ movl 20(%edi),%eax
+ mull %eax
+ movl %eax,40(%esi)
+ decl %ebx
+ movl %edx,44(%esi)
+ jz .L022sw_end
+
+ movl 24(%edi),%eax
+ mull %eax
+ movl %eax,48(%esi)
+ movl %edx,52(%esi)
+.L022sw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_sqr_words,.-.L_bn_sqr_words_begin
+.globl bn_div_words
+.hidden bn_div_words
+.type bn_div_words,@function
+.align 16
+bn_div_words:
+.L_bn_div_words_begin:
+ movl 4(%esp),%edx
+ movl 8(%esp),%eax
+ movl 12(%esp),%ecx
+ divl %ecx
+ ret
+.size bn_div_words,.-.L_bn_div_words_begin
+.globl bn_add_words
+.hidden bn_add_words
+.type bn_add_words,@function
+.align 16
+bn_add_words:
+.L_bn_add_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%edi
+ movl 32(%esp),%ebp
+ xorl %eax,%eax
+ andl $4294967288,%ebp
+ jz .L023aw_finish
+.L024aw_loop:
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+
+ movl 28(%esi),%ecx
+ movl 28(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%esi
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L024aw_loop
+.L023aw_finish:
+ movl 32(%esp),%ebp
+ andl $7,%ebp
+ jz .L025aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,(%ebx)
+ jz .L025aw_end
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,4(%ebx)
+ jz .L025aw_end
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,8(%ebx)
+ jz .L025aw_end
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,12(%ebx)
+ jz .L025aw_end
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,16(%ebx)
+ jz .L025aw_end
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,20(%ebx)
+ jz .L025aw_end
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+.L025aw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_add_words,.-.L_bn_add_words_begin
+.globl bn_sub_words
+.hidden bn_sub_words
+.type bn_sub_words,@function
+.align 16
+bn_sub_words:
+.L_bn_sub_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%edi
+ movl 32(%esp),%ebp
+ xorl %eax,%eax
+ andl $4294967288,%ebp
+ jz .L026aw_finish
+.L027aw_loop:
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+
+ movl 28(%esi),%ecx
+ movl 28(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%esi
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L027aw_loop
+.L026aw_finish:
+ movl 32(%esp),%ebp
+ andl $7,%ebp
+ jz .L028aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,(%ebx)
+ jz .L028aw_end
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,4(%ebx)
+ jz .L028aw_end
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,8(%ebx)
+ jz .L028aw_end
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,12(%ebx)
+ jz .L028aw_end
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,16(%ebx)
+ jz .L028aw_end
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,20(%ebx)
+ jz .L028aw_end
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+.L028aw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_sub_words,.-.L_bn_sub_words_begin
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
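
The routines above all follow the same word-at-a-time pattern: bn_mul_add_words, bn_mul_words, and bn_sqr_words first test bit 26 (the SSE2 feature bit) of OPENSSL_ia32cap_P and fall back to plain mul-based loops when it is clear, with both paths computing the same values. As a rough guide to what the unrolled loops implement, here is a minimal C sketch of bn_mul_add_words for this 32-bit target (the uint32_t word size, the argument order, and the ref_ name are assumptions for illustration only; the generated assembly is the source of truth):

    #include <stddef.h>
    #include <stdint.h>

    /* Illustrative model only: rp[i] accumulates ap[i] * w plus a running
     * carry word, and the final carry word is returned. */
    static uint32_t ref_bn_mul_add_words(uint32_t *rp, const uint32_t *ap,
                                         size_t num, uint32_t w) {
      uint32_t carry = 0;
      for (size_t i = 0; i < num; i++) {
        /* 32x32 -> 64-bit product, then fold in the old rp[i] and the carry.
         * The total cannot exceed 64 bits, so one uint64_t accumulator
         * suffices. */
        uint64_t t = (uint64_t)ap[i] * w + rp[i] + carry;
        rp[i] = (uint32_t)t;
        carry = (uint32_t)(t >> 32);
      }
      return carry;
    }

In the same spirit, bn_mul_words is this loop without the rp[i] accumulation, and bn_sqr_words stores the full 64-bit square of each input word as two output words.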
diff --git a/gen/bcm/bn-586-win.asm b/gen/bcm/bn-586-win.asm
new file mode 100644
index 0000000..f7ddfa8
--- /dev/null
+++ b/gen/bcm/bn-586-win.asm
@@ -0,0 +1,982 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section code use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section .text code align=64
+%else
+section .text code
+%endif
+;extern _OPENSSL_ia32cap_P
+global _bn_mul_add_words
+align 16
+_bn_mul_add_words:
+L$_bn_mul_add_words_begin:
+ lea eax,[_OPENSSL_ia32cap_P]
+ bt DWORD [eax],26
+ jnc NEAR L$000maw_non_sse2
+ mov eax,DWORD [4+esp]
+ mov edx,DWORD [8+esp]
+ mov ecx,DWORD [12+esp]
+ movd mm0,DWORD [16+esp]
+ pxor mm1,mm1
+ jmp NEAR L$001maw_sse2_entry
+align 16
+L$002maw_sse2_unrolled:
+ movd mm3,DWORD [eax]
+ paddq mm1,mm3
+ movd mm2,DWORD [edx]
+ pmuludq mm2,mm0
+ movd mm4,DWORD [4+edx]
+ pmuludq mm4,mm0
+ movd mm6,DWORD [8+edx]
+ pmuludq mm6,mm0
+ movd mm7,DWORD [12+edx]
+ pmuludq mm7,mm0
+ paddq mm1,mm2
+ movd mm3,DWORD [4+eax]
+ paddq mm3,mm4
+ movd mm5,DWORD [8+eax]
+ paddq mm5,mm6
+ movd mm4,DWORD [12+eax]
+ paddq mm7,mm4
+ movd DWORD [eax],mm1
+ movd mm2,DWORD [16+edx]
+ pmuludq mm2,mm0
+ psrlq mm1,32
+ movd mm4,DWORD [20+edx]
+ pmuludq mm4,mm0
+ paddq mm1,mm3
+ movd mm6,DWORD [24+edx]
+ pmuludq mm6,mm0
+ movd DWORD [4+eax],mm1
+ psrlq mm1,32
+ movd mm3,DWORD [28+edx]
+ add edx,32
+ pmuludq mm3,mm0
+ paddq mm1,mm5
+ movd mm5,DWORD [16+eax]
+ paddq mm2,mm5
+ movd DWORD [8+eax],mm1
+ psrlq mm1,32
+ paddq mm1,mm7
+ movd mm5,DWORD [20+eax]
+ paddq mm4,mm5
+ movd DWORD [12+eax],mm1
+ psrlq mm1,32
+ paddq mm1,mm2
+ movd mm5,DWORD [24+eax]
+ paddq mm6,mm5
+ movd DWORD [16+eax],mm1
+ psrlq mm1,32
+ paddq mm1,mm4
+ movd mm5,DWORD [28+eax]
+ paddq mm3,mm5
+ movd DWORD [20+eax],mm1
+ psrlq mm1,32
+ paddq mm1,mm6
+ movd DWORD [24+eax],mm1
+ psrlq mm1,32
+ paddq mm1,mm3
+ movd DWORD [28+eax],mm1
+ lea eax,[32+eax]
+ psrlq mm1,32
+ sub ecx,8
+ jz NEAR L$003maw_sse2_exit
+L$001maw_sse2_entry:
+ test ecx,4294967288
+ jnz NEAR L$002maw_sse2_unrolled
+align 4
+L$004maw_sse2_loop:
+ movd mm2,DWORD [edx]
+ movd mm3,DWORD [eax]
+ pmuludq mm2,mm0
+ lea edx,[4+edx]
+ paddq mm1,mm3
+ paddq mm1,mm2
+ movd DWORD [eax],mm1
+ sub ecx,1
+ psrlq mm1,32
+ lea eax,[4+eax]
+ jnz NEAR L$004maw_sse2_loop
+L$003maw_sse2_exit:
+ movd eax,mm1
+ emms
+ ret
+align 16
+L$000maw_non_sse2:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ ;
+ xor esi,esi
+ mov edi,DWORD [20+esp]
+ mov ecx,DWORD [28+esp]
+ mov ebx,DWORD [24+esp]
+ and ecx,4294967288
+ mov ebp,DWORD [32+esp]
+ push ecx
+ jz NEAR L$005maw_finish
+align 16
+L$006maw_loop:
+ ; Round 0
+ mov eax,DWORD [ebx]
+ mul ebp
+ add eax,esi
+ adc edx,0
+ add eax,DWORD [edi]
+ adc edx,0
+ mov DWORD [edi],eax
+ mov esi,edx
+ ; Round 4
+ mov eax,DWORD [4+ebx]
+ mul ebp
+ add eax,esi
+ adc edx,0
+ add eax,DWORD [4+edi]
+ adc edx,0
+ mov DWORD [4+edi],eax
+ mov esi,edx
+ ; Round 8
+ mov eax,DWORD [8+ebx]
+ mul ebp
+ add eax,esi
+ adc edx,0
+ add eax,DWORD [8+edi]
+ adc edx,0
+ mov DWORD [8+edi],eax
+ mov esi,edx
+ ; Round 12
+ mov eax,DWORD [12+ebx]
+ mul ebp
+ add eax,esi
+ adc edx,0
+ add eax,DWORD [12+edi]
+ adc edx,0
+ mov DWORD [12+edi],eax
+ mov esi,edx
+ ; Round 16
+ mov eax,DWORD [16+ebx]
+ mul ebp
+ add eax,esi
+ adc edx,0
+ add eax,DWORD [16+edi]
+ adc edx,0
+ mov DWORD [16+edi],eax
+ mov esi,edx
+ ; Round 20
+ mov eax,DWORD [20+ebx]
+ mul ebp
+ add eax,esi
+ adc edx,0
+ add eax,DWORD [20+edi]
+ adc edx,0
+ mov DWORD [20+edi],eax
+ mov esi,edx
+ ; Round 24
+ mov eax,DWORD [24+ebx]
+ mul ebp
+ add eax,esi
+ adc edx,0
+ add eax,DWORD [24+edi]
+ adc edx,0
+ mov DWORD [24+edi],eax
+ mov esi,edx
+ ; Round 28
+ mov eax,DWORD [28+ebx]
+ mul ebp
+ add eax,esi
+ adc edx,0
+ add eax,DWORD [28+edi]
+ adc edx,0
+ mov DWORD [28+edi],eax
+ mov esi,edx
+ ;
+ sub ecx,8
+ lea ebx,[32+ebx]
+ lea edi,[32+edi]
+ jnz NEAR L$006maw_loop
+L$005maw_finish:
+ mov ecx,DWORD [32+esp]
+ and ecx,7
+ jnz NEAR L$007maw_finish2
+ jmp NEAR L$008maw_end
+L$007maw_finish2:
+ ; Tail Round 0
+ mov eax,DWORD [ebx]
+ mul ebp
+ add eax,esi
+ adc edx,0
+ add eax,DWORD [edi]
+ adc edx,0
+ dec ecx
+ mov DWORD [edi],eax
+ mov esi,edx
+ jz NEAR L$008maw_end
+ ; Tail Round 1
+ mov eax,DWORD [4+ebx]
+ mul ebp
+ add eax,esi
+ adc edx,0
+ add eax,DWORD [4+edi]
+ adc edx,0
+ dec ecx
+ mov DWORD [4+edi],eax
+ mov esi,edx
+ jz NEAR L$008maw_end
+ ; Tail Round 2
+ mov eax,DWORD [8+ebx]
+ mul ebp
+ add eax,esi
+ adc edx,0
+ add eax,DWORD [8+edi]
+ adc edx,0
+ dec ecx
+ mov DWORD [8+edi],eax
+ mov esi,edx
+ jz NEAR L$008maw_end
+ ; Tail Round 3
+ mov eax,DWORD [12+ebx]
+ mul ebp
+ add eax,esi
+ adc edx,0
+ add eax,DWORD [12+edi]
+ adc edx,0
+ dec ecx
+ mov DWORD [12+edi],eax
+ mov esi,edx
+ jz NEAR L$008maw_end
+ ; Tail Round 4
+ mov eax,DWORD [16+ebx]
+ mul ebp
+ add eax,esi
+ adc edx,0
+ add eax,DWORD [16+edi]
+ adc edx,0
+ dec ecx
+ mov DWORD [16+edi],eax
+ mov esi,edx
+ jz NEAR L$008maw_end
+ ; Tail Round 5
+ mov eax,DWORD [20+ebx]
+ mul ebp
+ add eax,esi
+ adc edx,0
+ add eax,DWORD [20+edi]
+ adc edx,0
+ dec ecx
+ mov DWORD [20+edi],eax
+ mov esi,edx
+ jz NEAR L$008maw_end
+ ; Tail Round 6
+ mov eax,DWORD [24+ebx]
+ mul ebp
+ add eax,esi
+ adc edx,0
+ add eax,DWORD [24+edi]
+ adc edx,0
+ mov DWORD [24+edi],eax
+ mov esi,edx
+L$008maw_end:
+ mov eax,esi
+ pop ecx
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _bn_mul_words
+align 16
+_bn_mul_words:
+L$_bn_mul_words_begin:
+ lea eax,[_OPENSSL_ia32cap_P]
+ bt DWORD [eax],26
+ jnc NEAR L$009mw_non_sse2
+ mov eax,DWORD [4+esp]
+ mov edx,DWORD [8+esp]
+ mov ecx,DWORD [12+esp]
+ movd mm0,DWORD [16+esp]
+ pxor mm1,mm1
+align 16
+L$010mw_sse2_loop:
+ movd mm2,DWORD [edx]
+ pmuludq mm2,mm0
+ lea edx,[4+edx]
+ paddq mm1,mm2
+ movd DWORD [eax],mm1
+ sub ecx,1
+ psrlq mm1,32
+ lea eax,[4+eax]
+ jnz NEAR L$010mw_sse2_loop
+ movd eax,mm1
+ emms
+ ret
+align 16
+L$009mw_non_sse2:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ ;
+ xor esi,esi
+ mov edi,DWORD [20+esp]
+ mov ebx,DWORD [24+esp]
+ mov ebp,DWORD [28+esp]
+ mov ecx,DWORD [32+esp]
+ and ebp,4294967288
+ jz NEAR L$011mw_finish
+L$012mw_loop:
+ ; Round 0
+ mov eax,DWORD [ebx]
+ mul ecx
+ add eax,esi
+ adc edx,0
+ mov DWORD [edi],eax
+ mov esi,edx
+ ; Round 4
+ mov eax,DWORD [4+ebx]
+ mul ecx
+ add eax,esi
+ adc edx,0
+ mov DWORD [4+edi],eax
+ mov esi,edx
+ ; Round 8
+ mov eax,DWORD [8+ebx]
+ mul ecx
+ add eax,esi
+ adc edx,0
+ mov DWORD [8+edi],eax
+ mov esi,edx
+ ; Round 12
+ mov eax,DWORD [12+ebx]
+ mul ecx
+ add eax,esi
+ adc edx,0
+ mov DWORD [12+edi],eax
+ mov esi,edx
+ ; Round 16
+ mov eax,DWORD [16+ebx]
+ mul ecx
+ add eax,esi
+ adc edx,0
+ mov DWORD [16+edi],eax
+ mov esi,edx
+ ; Round 20
+ mov eax,DWORD [20+ebx]
+ mul ecx
+ add eax,esi
+ adc edx,0
+ mov DWORD [20+edi],eax
+ mov esi,edx
+ ; Round 24
+ mov eax,DWORD [24+ebx]
+ mul ecx
+ add eax,esi
+ adc edx,0
+ mov DWORD [24+edi],eax
+ mov esi,edx
+ ; Round 28
+ mov eax,DWORD [28+ebx]
+ mul ecx
+ add eax,esi
+ adc edx,0
+ mov DWORD [28+edi],eax
+ mov esi,edx
+ ;
+ add ebx,32
+ add edi,32
+ sub ebp,8
+ jz NEAR L$011mw_finish
+ jmp NEAR L$012mw_loop
+L$011mw_finish:
+ mov ebp,DWORD [28+esp]
+ and ebp,7
+ jnz NEAR L$013mw_finish2
+ jmp NEAR L$014mw_end
+L$013mw_finish2:
+ ; Tail Round 0
+ mov eax,DWORD [ebx]
+ mul ecx
+ add eax,esi
+ adc edx,0
+ mov DWORD [edi],eax
+ mov esi,edx
+ dec ebp
+ jz NEAR L$014mw_end
+ ; Tail Round 1
+ mov eax,DWORD [4+ebx]
+ mul ecx
+ add eax,esi
+ adc edx,0
+ mov DWORD [4+edi],eax
+ mov esi,edx
+ dec ebp
+ jz NEAR L$014mw_end
+ ; Tail Round 2
+ mov eax,DWORD [8+ebx]
+ mul ecx
+ add eax,esi
+ adc edx,0
+ mov DWORD [8+edi],eax
+ mov esi,edx
+ dec ebp
+ jz NEAR L$014mw_end
+ ; Tail Round 3
+ mov eax,DWORD [12+ebx]
+ mul ecx
+ add eax,esi
+ adc edx,0
+ mov DWORD [12+edi],eax
+ mov esi,edx
+ dec ebp
+ jz NEAR L$014mw_end
+ ; Tail Round 4
+ mov eax,DWORD [16+ebx]
+ mul ecx
+ add eax,esi
+ adc edx,0
+ mov DWORD [16+edi],eax
+ mov esi,edx
+ dec ebp
+ jz NEAR L$014mw_end
+ ; Tail Round 5
+ mov eax,DWORD [20+ebx]
+ mul ecx
+ add eax,esi
+ adc edx,0
+ mov DWORD [20+edi],eax
+ mov esi,edx
+ dec ebp
+ jz NEAR L$014mw_end
+ ; Tail Round 6
+ mov eax,DWORD [24+ebx]
+ mul ecx
+ add eax,esi
+ adc edx,0
+ mov DWORD [24+edi],eax
+ mov esi,edx
+L$014mw_end:
+ mov eax,esi
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _bn_sqr_words
+align 16
+_bn_sqr_words:
+L$_bn_sqr_words_begin:
+ lea eax,[_OPENSSL_ia32cap_P]
+ bt DWORD [eax],26
+ jnc NEAR L$015sqr_non_sse2
+ mov eax,DWORD [4+esp]
+ mov edx,DWORD [8+esp]
+ mov ecx,DWORD [12+esp]
+align 16
+L$016sqr_sse2_loop:
+ movd mm0,DWORD [edx]
+ pmuludq mm0,mm0
+ lea edx,[4+edx]
+ movq [eax],mm0
+ sub ecx,1
+ lea eax,[8+eax]
+ jnz NEAR L$016sqr_sse2_loop
+ emms
+ ret
+align 16
+L$015sqr_non_sse2:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ ;
+ mov esi,DWORD [20+esp]
+ mov edi,DWORD [24+esp]
+ mov ebx,DWORD [28+esp]
+ and ebx,4294967288
+ jz NEAR L$017sw_finish
+L$018sw_loop:
+ ; Round 0
+ mov eax,DWORD [edi]
+ mul eax
+ mov DWORD [esi],eax
+ mov DWORD [4+esi],edx
+ ; Round 4
+ mov eax,DWORD [4+edi]
+ mul eax
+ mov DWORD [8+esi],eax
+ mov DWORD [12+esi],edx
+ ; Round 8
+ mov eax,DWORD [8+edi]
+ mul eax
+ mov DWORD [16+esi],eax
+ mov DWORD [20+esi],edx
+ ; Round 12
+ mov eax,DWORD [12+edi]
+ mul eax
+ mov DWORD [24+esi],eax
+ mov DWORD [28+esi],edx
+ ; Round 16
+ mov eax,DWORD [16+edi]
+ mul eax
+ mov DWORD [32+esi],eax
+ mov DWORD [36+esi],edx
+ ; Round 20
+ mov eax,DWORD [20+edi]
+ mul eax
+ mov DWORD [40+esi],eax
+ mov DWORD [44+esi],edx
+ ; Round 24
+ mov eax,DWORD [24+edi]
+ mul eax
+ mov DWORD [48+esi],eax
+ mov DWORD [52+esi],edx
+ ; Round 28
+ mov eax,DWORD [28+edi]
+ mul eax
+ mov DWORD [56+esi],eax
+ mov DWORD [60+esi],edx
+ ;
+ add edi,32
+ add esi,64
+ sub ebx,8
+ jnz NEAR L$018sw_loop
+L$017sw_finish:
+ mov ebx,DWORD [28+esp]
+ and ebx,7
+ jz NEAR L$019sw_end
+ ; Tail Round 0
+ mov eax,DWORD [edi]
+ mul eax
+ mov DWORD [esi],eax
+ dec ebx
+ mov DWORD [4+esi],edx
+ jz NEAR L$019sw_end
+ ; Tail Round 1
+ mov eax,DWORD [4+edi]
+ mul eax
+ mov DWORD [8+esi],eax
+ dec ebx
+ mov DWORD [12+esi],edx
+ jz NEAR L$019sw_end
+ ; Tail Round 2
+ mov eax,DWORD [8+edi]
+ mul eax
+ mov DWORD [16+esi],eax
+ dec ebx
+ mov DWORD [20+esi],edx
+ jz NEAR L$019sw_end
+ ; Tail Round 3
+ mov eax,DWORD [12+edi]
+ mul eax
+ mov DWORD [24+esi],eax
+ dec ebx
+ mov DWORD [28+esi],edx
+ jz NEAR L$019sw_end
+ ; Tail Round 4
+ mov eax,DWORD [16+edi]
+ mul eax
+ mov DWORD [32+esi],eax
+ dec ebx
+ mov DWORD [36+esi],edx
+ jz NEAR L$019sw_end
+ ; Tail Round 5
+ mov eax,DWORD [20+edi]
+ mul eax
+ mov DWORD [40+esi],eax
+ dec ebx
+ mov DWORD [44+esi],edx
+ jz NEAR L$019sw_end
+ ; Tail Round 6
+ mov eax,DWORD [24+edi]
+ mul eax
+ mov DWORD [48+esi],eax
+ mov DWORD [52+esi],edx
+L$019sw_end:
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _bn_div_words
+align 16
+_bn_div_words:
+L$_bn_div_words_begin:
+ mov edx,DWORD [4+esp]
+ mov eax,DWORD [8+esp]
+ mov ecx,DWORD [12+esp]
+ div ecx
+ ret
+global _bn_add_words
+align 16
+_bn_add_words:
+L$_bn_add_words_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ ;
+ mov ebx,DWORD [20+esp]
+ mov esi,DWORD [24+esp]
+ mov edi,DWORD [28+esp]
+ mov ebp,DWORD [32+esp]
+ xor eax,eax
+ and ebp,4294967288
+ jz NEAR L$020aw_finish
+L$021aw_loop:
+ ; Round 0
+ mov ecx,DWORD [esi]
+ mov edx,DWORD [edi]
+ add ecx,eax
+ mov eax,0
+ adc eax,eax
+ add ecx,edx
+ adc eax,0
+ mov DWORD [ebx],ecx
+ ; Round 1
+ mov ecx,DWORD [4+esi]
+ mov edx,DWORD [4+edi]
+ add ecx,eax
+ mov eax,0
+ adc eax,eax
+ add ecx,edx
+ adc eax,0
+ mov DWORD [4+ebx],ecx
+ ; Round 2
+ mov ecx,DWORD [8+esi]
+ mov edx,DWORD [8+edi]
+ add ecx,eax
+ mov eax,0
+ adc eax,eax
+ add ecx,edx
+ adc eax,0
+ mov DWORD [8+ebx],ecx
+ ; Round 3
+ mov ecx,DWORD [12+esi]
+ mov edx,DWORD [12+edi]
+ add ecx,eax
+ mov eax,0
+ adc eax,eax
+ add ecx,edx
+ adc eax,0
+ mov DWORD [12+ebx],ecx
+ ; Round 4
+ mov ecx,DWORD [16+esi]
+ mov edx,DWORD [16+edi]
+ add ecx,eax
+ mov eax,0
+ adc eax,eax
+ add ecx,edx
+ adc eax,0
+ mov DWORD [16+ebx],ecx
+ ; Round 5
+ mov ecx,DWORD [20+esi]
+ mov edx,DWORD [20+edi]
+ add ecx,eax
+ mov eax,0
+ adc eax,eax
+ add ecx,edx
+ adc eax,0
+ mov DWORD [20+ebx],ecx
+ ; Round 6
+ mov ecx,DWORD [24+esi]
+ mov edx,DWORD [24+edi]
+ add ecx,eax
+ mov eax,0
+ adc eax,eax
+ add ecx,edx
+ adc eax,0
+ mov DWORD [24+ebx],ecx
+ ; Round 7
+ mov ecx,DWORD [28+esi]
+ mov edx,DWORD [28+edi]
+ add ecx,eax
+ mov eax,0
+ adc eax,eax
+ add ecx,edx
+ adc eax,0
+ mov DWORD [28+ebx],ecx
+ ;
+ add esi,32
+ add edi,32
+ add ebx,32
+ sub ebp,8
+ jnz NEAR L$021aw_loop
+L$020aw_finish:
+ mov ebp,DWORD [32+esp]
+ and ebp,7
+ jz NEAR L$022aw_end
+ ; Tail Round 0
+ mov ecx,DWORD [esi]
+ mov edx,DWORD [edi]
+ add ecx,eax
+ mov eax,0
+ adc eax,eax
+ add ecx,edx
+ adc eax,0
+ dec ebp
+ mov DWORD [ebx],ecx
+ jz NEAR L$022aw_end
+ ; Tail Round 1
+ mov ecx,DWORD [4+esi]
+ mov edx,DWORD [4+edi]
+ add ecx,eax
+ mov eax,0
+ adc eax,eax
+ add ecx,edx
+ adc eax,0
+ dec ebp
+ mov DWORD [4+ebx],ecx
+ jz NEAR L$022aw_end
+ ; Tail Round 2
+ mov ecx,DWORD [8+esi]
+ mov edx,DWORD [8+edi]
+ add ecx,eax
+ mov eax,0
+ adc eax,eax
+ add ecx,edx
+ adc eax,0
+ dec ebp
+ mov DWORD [8+ebx],ecx
+ jz NEAR L$022aw_end
+ ; Tail Round 3
+ mov ecx,DWORD [12+esi]
+ mov edx,DWORD [12+edi]
+ add ecx,eax
+ mov eax,0
+ adc eax,eax
+ add ecx,edx
+ adc eax,0
+ dec ebp
+ mov DWORD [12+ebx],ecx
+ jz NEAR L$022aw_end
+ ; Tail Round 4
+ mov ecx,DWORD [16+esi]
+ mov edx,DWORD [16+edi]
+ add ecx,eax
+ mov eax,0
+ adc eax,eax
+ add ecx,edx
+ adc eax,0
+ dec ebp
+ mov DWORD [16+ebx],ecx
+ jz NEAR L$022aw_end
+ ; Tail Round 5
+ mov ecx,DWORD [20+esi]
+ mov edx,DWORD [20+edi]
+ add ecx,eax
+ mov eax,0
+ adc eax,eax
+ add ecx,edx
+ adc eax,0
+ dec ebp
+ mov DWORD [20+ebx],ecx
+ jz NEAR L$022aw_end
+ ; Tail Round 6
+ mov ecx,DWORD [24+esi]
+ mov edx,DWORD [24+edi]
+ add ecx,eax
+ mov eax,0
+ adc eax,eax
+ add ecx,edx
+ adc eax,0
+ mov DWORD [24+ebx],ecx
+L$022aw_end:
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _bn_sub_words
+align 16
+_bn_sub_words:
+L$_bn_sub_words_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ ;
+ mov ebx,DWORD [20+esp]
+ mov esi,DWORD [24+esp]
+ mov edi,DWORD [28+esp]
+ mov ebp,DWORD [32+esp]
+ xor eax,eax
+ and ebp,4294967288
+ jz NEAR L$023aw_finish
+L$024aw_loop:
+ ; Round 0
+ mov ecx,DWORD [esi]
+ mov edx,DWORD [edi]
+ sub ecx,eax
+ mov eax,0
+ adc eax,eax
+ sub ecx,edx
+ adc eax,0
+ mov DWORD [ebx],ecx
+ ; Round 1
+ mov ecx,DWORD [4+esi]
+ mov edx,DWORD [4+edi]
+ sub ecx,eax
+ mov eax,0
+ adc eax,eax
+ sub ecx,edx
+ adc eax,0
+ mov DWORD [4+ebx],ecx
+ ; Round 2
+ mov ecx,DWORD [8+esi]
+ mov edx,DWORD [8+edi]
+ sub ecx,eax
+ mov eax,0
+ adc eax,eax
+ sub ecx,edx
+ adc eax,0
+ mov DWORD [8+ebx],ecx
+ ; Round 3
+ mov ecx,DWORD [12+esi]
+ mov edx,DWORD [12+edi]
+ sub ecx,eax
+ mov eax,0
+ adc eax,eax
+ sub ecx,edx
+ adc eax,0
+ mov DWORD [12+ebx],ecx
+ ; Round 4
+ mov ecx,DWORD [16+esi]
+ mov edx,DWORD [16+edi]
+ sub ecx,eax
+ mov eax,0
+ adc eax,eax
+ sub ecx,edx
+ adc eax,0
+ mov DWORD [16+ebx],ecx
+ ; Round 5
+ mov ecx,DWORD [20+esi]
+ mov edx,DWORD [20+edi]
+ sub ecx,eax
+ mov eax,0
+ adc eax,eax
+ sub ecx,edx
+ adc eax,0
+ mov DWORD [20+ebx],ecx
+ ; Round 6
+ mov ecx,DWORD [24+esi]
+ mov edx,DWORD [24+edi]
+ sub ecx,eax
+ mov eax,0
+ adc eax,eax
+ sub ecx,edx
+ adc eax,0
+ mov DWORD [24+ebx],ecx
+ ; Round 7
+ mov ecx,DWORD [28+esi]
+ mov edx,DWORD [28+edi]
+ sub ecx,eax
+ mov eax,0
+ adc eax,eax
+ sub ecx,edx
+ adc eax,0
+ mov DWORD [28+ebx],ecx
+ ;
+ add esi,32
+ add edi,32
+ add ebx,32
+ sub ebp,8
+ jnz NEAR L$024aw_loop
+L$023aw_finish:
+ mov ebp,DWORD [32+esp]
+ and ebp,7
+ jz NEAR L$025aw_end
+ ; Tail Round 0
+ mov ecx,DWORD [esi]
+ mov edx,DWORD [edi]
+ sub ecx,eax
+ mov eax,0
+ adc eax,eax
+ sub ecx,edx
+ adc eax,0
+ dec ebp
+ mov DWORD [ebx],ecx
+ jz NEAR L$025aw_end
+ ; Tail Round 1
+ mov ecx,DWORD [4+esi]
+ mov edx,DWORD [4+edi]
+ sub ecx,eax
+ mov eax,0
+ adc eax,eax
+ sub ecx,edx
+ adc eax,0
+ dec ebp
+ mov DWORD [4+ebx],ecx
+ jz NEAR L$025aw_end
+ ; Tail Round 2
+ mov ecx,DWORD [8+esi]
+ mov edx,DWORD [8+edi]
+ sub ecx,eax
+ mov eax,0
+ adc eax,eax
+ sub ecx,edx
+ adc eax,0
+ dec ebp
+ mov DWORD [8+ebx],ecx
+ jz NEAR L$025aw_end
+ ; Tail Round 3
+ mov ecx,DWORD [12+esi]
+ mov edx,DWORD [12+edi]
+ sub ecx,eax
+ mov eax,0
+ adc eax,eax
+ sub ecx,edx
+ adc eax,0
+ dec ebp
+ mov DWORD [12+ebx],ecx
+ jz NEAR L$025aw_end
+ ; Tail Round 4
+ mov ecx,DWORD [16+esi]
+ mov edx,DWORD [16+edi]
+ sub ecx,eax
+ mov eax,0
+ adc eax,eax
+ sub ecx,edx
+ adc eax,0
+ dec ebp
+ mov DWORD [16+ebx],ecx
+ jz NEAR L$025aw_end
+ ; Tail Round 5
+ mov ecx,DWORD [20+esi]
+ mov edx,DWORD [20+edi]
+ sub ecx,eax
+ mov eax,0
+ adc eax,eax
+ sub ecx,edx
+ adc eax,0
+ dec ebp
+ mov DWORD [20+ebx],ecx
+ jz NEAR L$025aw_end
+ ; Tail Round 6
+ mov ecx,DWORD [24+esi]
+ mov edx,DWORD [24+edi]
+ sub ecx,eax
+ mov eax,0
+ adc eax,eax
+ sub ecx,edx
+ adc eax,0
+ mov DWORD [24+ebx],ecx
+L$025aw_end:
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+segment .bss
+common _OPENSSL_ia32cap_P 16
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/bn-armv8-apple.S b/gen/bcm/bn-armv8-apple.S
new file mode 100644
index 0000000..5e3471a
--- /dev/null
+++ b/gen/bcm/bn-armv8-apple.S
@@ -0,0 +1,89 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+
+.text
+
+// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+// size_t num);
+
+.globl _bn_add_words
+.private_extern _bn_add_words
+.align 4
+_bn_add_words:
+ AARCH64_VALID_CALL_TARGET
+ # Clear the carry flag.
+ cmn xzr, xzr
+
+	# aarch64 can load two registers at a time, so we do two loop iterations
+ # at a time. Split x3 = 2 * x8 + x3. This allows loop
+ # operations to use CBNZ without clobbering the carry flag.
+ lsr x8, x3, #1
+ and x3, x3, #1
+
+ cbz x8, Ladd_tail
+Ladd_loop:
+ ldp x4, x5, [x1], #16
+ ldp x6, x7, [x2], #16
+ sub x8, x8, #1
+ adcs x4, x4, x6
+ adcs x5, x5, x7
+ stp x4, x5, [x0], #16
+ cbnz x8, Ladd_loop
+
+Ladd_tail:
+ cbz x3, Ladd_exit
+ ldr x4, [x1], #8
+ ldr x6, [x2], #8
+ adcs x4, x4, x6
+ str x4, [x0], #8
+
+Ladd_exit:
+ cset x0, cs
+ ret
+
+
+// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+// size_t num);
+
+.globl _bn_sub_words
+.private_extern _bn_sub_words
+.align 4
+_bn_sub_words:
+ AARCH64_VALID_CALL_TARGET
+ # Set the carry flag. Arm's borrow bit is flipped from the carry flag,
+ # so we want C = 1 here.
+ cmp xzr, xzr
+
+	# aarch64 can load two registers at a time, so we do two loop iterations
+ # at a time. Split x3 = 2 * x8 + x3. This allows loop
+ # operations to use CBNZ without clobbering the carry flag.
+ lsr x8, x3, #1
+ and x3, x3, #1
+
+ cbz x8, Lsub_tail
+Lsub_loop:
+ ldp x4, x5, [x1], #16
+ ldp x6, x7, [x2], #16
+ sub x8, x8, #1
+ sbcs x4, x4, x6
+ sbcs x5, x5, x7
+ stp x4, x5, [x0], #16
+ cbnz x8, Lsub_loop
+
+Lsub_tail:
+ cbz x3, Lsub_exit
+ ldr x4, [x1], #8
+ ldr x6, [x2], #8
+ sbcs x4, x4, x6
+ str x4, [x0], #8
+
+Lsub_exit:
+ cset x0, cc
+ ret
+
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
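
The adcs/sbcs loops above simply chain the carry (or inverted borrow) flag across 64-bit words, handling two words per iteration plus an optional tail word. A rough C sketch of the semantics, with BN_ULONG modeled as uint64_t and illustrative ref_ names (the assembly remains the authoritative implementation), might read:

    #include <stddef.h>
    #include <stdint.h>

    /* Illustrative reference for the aarch64 bn_add_words above. */
    static uint64_t ref_bn_add_words(uint64_t *rp, const uint64_t *ap,
                                     const uint64_t *bp, size_t num) {
      uint64_t carry = 0;
      for (size_t i = 0; i < num; i++) {
        uint64_t t = ap[i] + bp[i];
        uint64_t c1 = t < ap[i];   /* carry out of ap[i] + bp[i] */
        uint64_t s = t + carry;
        uint64_t c2 = s < t;       /* carry out of adding the previous carry */
        rp[i] = s;
        carry = c1 | c2;           /* at most one of c1, c2 can be set */
      }
      return carry;                /* matches the final cset x0, cs */
    }

    /* Illustrative reference for the aarch64 bn_sub_words above. */
    static uint64_t ref_bn_sub_words(uint64_t *rp, const uint64_t *ap,
                                     const uint64_t *bp, size_t num) {
      uint64_t borrow = 0;
      for (size_t i = 0; i < num; i++) {
        uint64_t t = ap[i] - bp[i];
        uint64_t b1 = ap[i] < bp[i];  /* borrow out of ap[i] - bp[i] */
        uint64_t s = t - borrow;
        uint64_t b2 = t < borrow;     /* borrow from the previous borrow */
        rp[i] = s;
        borrow = b1 | b2;
      }
      return borrow;                  /* matches the final cset x0, cc */
    }

The Linux and Windows variants that follow are the same translation, differing only in symbol decoration and the platform #if guard.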
diff --git a/gen/bcm/bn-armv8-linux.S b/gen/bcm/bn-armv8-linux.S
new file mode 100644
index 0000000..2b8823a
--- /dev/null
+++ b/gen/bcm/bn-armv8-linux.S
@@ -0,0 +1,89 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+.text
+
+// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+// size_t num);
+.type bn_add_words, %function
+.globl bn_add_words
+.hidden bn_add_words
+.align 4
+bn_add_words:
+ AARCH64_VALID_CALL_TARGET
+ # Clear the carry flag.
+ cmn xzr, xzr
+
+	# aarch64 can load two registers at a time, so we do two loop iterations
+ # at a time. Split x3 = 2 * x8 + x3. This allows loop
+ # operations to use CBNZ without clobbering the carry flag.
+ lsr x8, x3, #1
+ and x3, x3, #1
+
+ cbz x8, .Ladd_tail
+.Ladd_loop:
+ ldp x4, x5, [x1], #16
+ ldp x6, x7, [x2], #16
+ sub x8, x8, #1
+ adcs x4, x4, x6
+ adcs x5, x5, x7
+ stp x4, x5, [x0], #16
+ cbnz x8, .Ladd_loop
+
+.Ladd_tail:
+ cbz x3, .Ladd_exit
+ ldr x4, [x1], #8
+ ldr x6, [x2], #8
+ adcs x4, x4, x6
+ str x4, [x0], #8
+
+.Ladd_exit:
+ cset x0, cs
+ ret
+.size bn_add_words,.-bn_add_words
+
+// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+// size_t num);
+.type bn_sub_words, %function
+.globl bn_sub_words
+.hidden bn_sub_words
+.align 4
+bn_sub_words:
+ AARCH64_VALID_CALL_TARGET
+ # Set the carry flag. Arm's borrow bit is flipped from the carry flag,
+ # so we want C = 1 here.
+ cmp xzr, xzr
+
+	# aarch64 can load two registers at a time, so we do two loop iterations
+ # at a time. Split x3 = 2 * x8 + x3. This allows loop
+ # operations to use CBNZ without clobbering the carry flag.
+ lsr x8, x3, #1
+ and x3, x3, #1
+
+ cbz x8, .Lsub_tail
+.Lsub_loop:
+ ldp x4, x5, [x1], #16
+ ldp x6, x7, [x2], #16
+ sub x8, x8, #1
+ sbcs x4, x4, x6
+ sbcs x5, x5, x7
+ stp x4, x5, [x0], #16
+ cbnz x8, .Lsub_loop
+
+.Lsub_tail:
+ cbz x3, .Lsub_exit
+ ldr x4, [x1], #8
+ ldr x6, [x2], #8
+ sbcs x4, x4, x6
+ str x4, [x0], #8
+
+.Lsub_exit:
+ cset x0, cc
+ ret
+.size bn_sub_words,.-bn_sub_words
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/bn-armv8-win.S b/gen/bcm/bn-armv8-win.S
new file mode 100644
index 0000000..af97080
--- /dev/null
+++ b/gen/bcm/bn-armv8-win.S
@@ -0,0 +1,89 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+
+.text
+
+// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+// size_t num);
+
+.globl bn_add_words
+
+.align 4
+bn_add_words:
+ AARCH64_VALID_CALL_TARGET
+ # Clear the carry flag.
+ cmn xzr, xzr
+
+	# aarch64 can load two registers at a time, so we do two loop iterations
+ # at a time. Split x3 = 2 * x8 + x3. This allows loop
+ # operations to use CBNZ without clobbering the carry flag.
+ lsr x8, x3, #1
+ and x3, x3, #1
+
+ cbz x8, Ladd_tail
+Ladd_loop:
+ ldp x4, x5, [x1], #16
+ ldp x6, x7, [x2], #16
+ sub x8, x8, #1
+ adcs x4, x4, x6
+ adcs x5, x5, x7
+ stp x4, x5, [x0], #16
+ cbnz x8, Ladd_loop
+
+Ladd_tail:
+ cbz x3, Ladd_exit
+ ldr x4, [x1], #8
+ ldr x6, [x2], #8
+ adcs x4, x4, x6
+ str x4, [x0], #8
+
+Ladd_exit:
+ cset x0, cs
+ ret
+
+
+// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+// size_t num);
+
+.globl bn_sub_words
+
+.align 4
+bn_sub_words:
+ AARCH64_VALID_CALL_TARGET
+ # Set the carry flag. Arm's borrow bit is flipped from the carry flag,
+ # so we want C = 1 here.
+ cmp xzr, xzr
+
+	# aarch64 can load two registers at a time, so we do two loop iterations
+ # at a time. Split x3 = 2 * x8 + x3. This allows loop
+ # operations to use CBNZ without clobbering the carry flag.
+ lsr x8, x3, #1
+ and x3, x3, #1
+
+ cbz x8, Lsub_tail
+Lsub_loop:
+ ldp x4, x5, [x1], #16
+ ldp x6, x7, [x2], #16
+ sub x8, x8, #1
+ sbcs x4, x4, x6
+ sbcs x5, x5, x7
+ stp x4, x5, [x0], #16
+ cbnz x8, Lsub_loop
+
+Lsub_tail:
+ cbz x3, Lsub_exit
+ ldr x4, [x1], #8
+ ldr x6, [x2], #8
+ sbcs x4, x4, x6
+ str x4, [x0], #8
+
+Lsub_exit:
+ cset x0, cc
+ ret
+
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/bsaes-armv7-linux.S b/gen/bcm/bsaes-armv7-linux.S
new file mode 100644
index 0000000..01a9ead
--- /dev/null
+++ b/gen/bcm/bsaes-armv7-linux.S
@@ -0,0 +1,1517 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+@ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+@
+@ Licensed under the OpenSSL license (the "License"). You may not use
+@ this file except in compliance with the License. You can obtain a copy
+@ in the file LICENSE in the source distribution or at
+@ https://www.openssl.org/source/license.html
+
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+@ of Linaro. Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ Bit-sliced AES for ARM NEON
+@
+@ February 2012.
+@
+@ This implementation is direct adaptation of bsaes-x86_64 module for
+@ ARM NEON. Except that this module is endian-neutral [in sense that
+@ it can be compiled for either endianness] by courtesy of vld1.8's
+@ neutrality. Initial version doesn't implement interface to OpenSSL,
+@ only low-level primitives and unsupported entry points, just enough
+@ to collect performance results, which for Cortex-A8 core are:
+@
+@ encrypt 19.5 cycles per byte processed with 128-bit key
+@ decrypt 22.1 cycles per byte processed with 128-bit key
+@ key conv. 440 cycles per 128-bit key/0.18 of 8x block
+@
+@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
+@ which is [much] worse than anticipated (for further details see
+@ http://www.openssl.org/~appro/Snapdragon-S4.html).
+@
+@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+@ manages in 20.0 cycles].
+@
+@ When comparing to x86_64 results keep in mind that NEON unit is
+@ [mostly] single-issue and thus can't [fully] benefit from
+@ instruction-level parallelism. And when comparing to aes-armv4
+@ results keep in mind key schedule conversion overhead (see
+@ bsaes-x86_64.pl for further details)...
+@
+@ <appro@openssl.org>
+
+@ April-August 2013
+@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+# define VFP_ABI_FRAME 0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME 0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.text
+.syntax unified @ ARMv7-capable assembler is expected to handle this
+#if defined(__thumb2__) && !defined(__APPLE__)
+.thumb
+#else
+.code 32
+# undef __thumb2__
+#endif
+
+.type _bsaes_decrypt8,%function
+.align 4
+_bsaes_decrypt8:
+ adr r6,.
+ vldmia r4!, {q9} @ round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+ adr r6,.LM0ISR
+#else
+ add r6,r6,#.LM0ISR-_bsaes_decrypt8
+#endif
+
+ vldmia r6!, {q8} @ .LM0ISR
+ veor q10, q0, q9 @ xor with round0 key
+ veor q11, q1, q9
+ vtbl.8 d0, {q10}, d16
+ vtbl.8 d1, {q10}, d17
+ veor q12, q2, q9
+ vtbl.8 d2, {q11}, d16
+ vtbl.8 d3, {q11}, d17
+ veor q13, q3, q9
+ vtbl.8 d4, {q12}, d16
+ vtbl.8 d5, {q12}, d17
+ veor q14, q4, q9
+ vtbl.8 d6, {q13}, d16
+ vtbl.8 d7, {q13}, d17
+ veor q15, q5, q9
+ vtbl.8 d8, {q14}, d16
+ vtbl.8 d9, {q14}, d17
+ veor q10, q6, q9
+ vtbl.8 d10, {q15}, d16
+ vtbl.8 d11, {q15}, d17
+ veor q11, q7, q9
+ vtbl.8 d12, {q10}, d16
+ vtbl.8 d13, {q10}, d17
+ vtbl.8 d14, {q11}, d16
+ vtbl.8 d15, {q11}, d17
+ vmov.i8 q8,#0x55 @ compose .LBS0
+ vmov.i8 q9,#0x33 @ compose .LBS1
+ vshr.u64 q10, q6, #1
+ vshr.u64 q11, q4, #1
+ veor q10, q10, q7
+ veor q11, q11, q5
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #1
+ veor q5, q5, q11
+ vshl.u64 q11, q11, #1
+ veor q6, q6, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q2, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q3
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q2, q2, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose .LBS2
+ vshr.u64 q10, q5, #2
+ vshr.u64 q11, q4, #2
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #2
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #2
+ veor q5, q5, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q3
+ veor q11, q11, q2
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #2
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q3, #4
+ vshr.u64 q11, q2, #4
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #4
+ veor q3, q3, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q5
+ veor q11, q11, q4
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q4, q4, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ sub r5,r5,#1
+ b .Ldec_sbox
+.align 4
+.Ldec_loop:
+ vldmia r4!, {q8,q9,q10,q11}
+ veor q8, q8, q0
+ veor q9, q9, q1
+ vtbl.8 d0, {q8}, d24
+ vtbl.8 d1, {q8}, d25
+ vldmia r4!, {q8}
+ veor q10, q10, q2
+ vtbl.8 d2, {q9}, d24
+ vtbl.8 d3, {q9}, d25
+ vldmia r4!, {q9}
+ veor q11, q11, q3
+ vtbl.8 d4, {q10}, d24
+ vtbl.8 d5, {q10}, d25
+ vldmia r4!, {q10}
+ vtbl.8 d6, {q11}, d24
+ vtbl.8 d7, {q11}, d25
+ vldmia r4!, {q11}
+ veor q8, q8, q4
+ veor q9, q9, q5
+ vtbl.8 d8, {q8}, d24
+ vtbl.8 d9, {q8}, d25
+ veor q10, q10, q6
+ vtbl.8 d10, {q9}, d24
+ vtbl.8 d11, {q9}, d25
+ veor q11, q11, q7
+ vtbl.8 d12, {q10}, d24
+ vtbl.8 d13, {q10}, d25
+ vtbl.8 d14, {q11}, d24
+ vtbl.8 d15, {q11}, d25
+.Ldec_sbox:
+ veor q1, q1, q4
+ veor q3, q3, q4
+
+ veor q4, q4, q7
+ veor q1, q1, q6
+ veor q2, q2, q7
+ veor q6, q6, q4
+
+ veor q0, q0, q1
+ veor q2, q2, q5
+ veor q7, q7, q6
+ veor q3, q3, q0
+ veor q5, q5, q0
+ veor q1, q1, q3
+ veor q11, q3, q0
+ veor q10, q7, q4
+ veor q9, q1, q6
+ veor q13, q4, q0
+ vmov q8, q10
+ veor q12, q5, q2
+
+ vorr q10, q10, q9
+ veor q15, q11, q8
+ vand q14, q11, q12
+ vorr q11, q11, q12
+ veor q12, q12, q9
+ vand q8, q8, q9
+ veor q9, q6, q2
+ vand q15, q15, q12
+ vand q13, q13, q9
+ veor q9, q3, q7
+ veor q12, q1, q5
+ veor q11, q11, q13
+ veor q10, q10, q13
+ vand q13, q9, q12
+ vorr q9, q9, q12
+ veor q11, q11, q15
+ veor q8, q8, q13
+ veor q10, q10, q14
+ veor q9, q9, q15
+ veor q8, q8, q14
+ vand q12, q4, q6
+ veor q9, q9, q14
+ vand q13, q0, q2
+ vand q14, q7, q1
+ vorr q15, q3, q5
+ veor q11, q11, q12
+ veor q9, q9, q14
+ veor q8, q8, q15
+ veor q10, q10, q13
+
+ @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
+
+ @ new smaller inversion
+
+ vand q14, q11, q9
+ vmov q12, q8
+
+ veor q13, q10, q14
+ veor q15, q8, q14
+ veor q14, q8, q14 @ q14=q15
+
+ vbsl q13, q9, q8
+ vbsl q15, q11, q10
+ veor q11, q11, q10
+
+ vbsl q12, q13, q14
+ vbsl q8, q14, q13
+
+ vand q14, q12, q15
+ veor q9, q9, q8
+
+ veor q14, q14, q11
+ veor q12, q5, q2
+ veor q8, q1, q6
+ veor q10, q15, q14
+ vand q10, q10, q5
+ veor q5, q5, q1
+ vand q11, q1, q15
+ vand q5, q5, q14
+ veor q1, q11, q10
+ veor q5, q5, q11
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q2
+ veor q12, q12, q8
+ veor q2, q2, q6
+ vand q8, q8, q15
+ vand q6, q6, q13
+ vand q12, q12, q14
+ vand q2, q2, q9
+ veor q8, q8, q12
+ veor q2, q2, q6
+ veor q12, q12, q11
+ veor q6, q6, q10
+ veor q5, q5, q12
+ veor q2, q2, q12
+ veor q1, q1, q8
+ veor q6, q6, q8
+
+ veor q12, q3, q0
+ veor q8, q7, q4
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q0
+ veor q12, q12, q8
+ veor q0, q0, q4
+ vand q8, q8, q15
+ vand q4, q4, q13
+ vand q12, q12, q14
+ vand q0, q0, q9
+ veor q8, q8, q12
+ veor q0, q0, q4
+ veor q12, q12, q11
+ veor q4, q4, q10
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q10, q15, q14
+ vand q10, q10, q3
+ veor q3, q3, q7
+ vand q11, q7, q15
+ vand q3, q3, q14
+ veor q7, q11, q10
+ veor q3, q3, q11
+ veor q3, q3, q12
+ veor q0, q0, q12
+ veor q7, q7, q8
+ veor q4, q4, q8
+ veor q1, q1, q7
+ veor q6, q6, q5
+
+ veor q4, q4, q1
+ veor q2, q2, q7
+ veor q5, q5, q7
+ veor q4, q4, q2
+ veor q7, q7, q0
+ veor q4, q4, q5
+ veor q3, q3, q6
+ veor q6, q6, q1
+ veor q3, q3, q4
+
+ veor q4, q4, q0
+ veor q7, q7, q3
+ subs r5,r5,#1
+ bcc .Ldec_done
+ @ multiplication by 0x05-0x00-0x04-0x00
+ vext.8 q8, q0, q0, #8
+ vext.8 q14, q3, q3, #8
+ vext.8 q15, q5, q5, #8
+ veor q8, q8, q0
+ vext.8 q9, q1, q1, #8
+ veor q14, q14, q3
+ vext.8 q10, q6, q6, #8
+ veor q15, q15, q5
+ vext.8 q11, q4, q4, #8
+ veor q9, q9, q1
+ vext.8 q12, q2, q2, #8
+ veor q10, q10, q6
+ vext.8 q13, q7, q7, #8
+ veor q11, q11, q4
+ veor q12, q12, q2
+ veor q13, q13, q7
+
+ veor q0, q0, q14
+ veor q1, q1, q14
+ veor q6, q6, q8
+ veor q2, q2, q10
+ veor q4, q4, q9
+ veor q1, q1, q15
+ veor q6, q6, q15
+ veor q2, q2, q14
+ veor q7, q7, q11
+ veor q4, q4, q14
+ veor q3, q3, q12
+ veor q2, q2, q15
+ veor q7, q7, q15
+ veor q5, q5, q13
+ vext.8 q8, q0, q0, #12 @ x0 <<< 32
+ vext.8 q9, q1, q1, #12
+ veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
+ vext.8 q10, q6, q6, #12
+ veor q1, q1, q9
+ vext.8 q11, q4, q4, #12
+ veor q6, q6, q10
+ vext.8 q12, q2, q2, #12
+ veor q4, q4, q11
+ vext.8 q13, q7, q7, #12
+ veor q2, q2, q12
+ vext.8 q14, q3, q3, #12
+ veor q7, q7, q13
+ vext.8 q15, q5, q5, #12
+ veor q3, q3, q14
+
+ veor q9, q9, q0
+ veor q5, q5, q15
+ vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
+ veor q10, q10, q1
+ veor q8, q8, q5
+ veor q9, q9, q5
+ vext.8 q1, q1, q1, #8
+ veor q13, q13, q2
+ veor q0, q0, q8
+ veor q14, q14, q7
+ veor q1, q1, q9
+ vext.8 q8, q2, q2, #8
+ veor q12, q12, q4
+ vext.8 q9, q7, q7, #8
+ veor q15, q15, q3
+ vext.8 q2, q4, q4, #8
+ veor q11, q11, q6
+ vext.8 q7, q5, q5, #8
+ veor q12, q12, q5
+ vext.8 q4, q3, q3, #8
+ veor q11, q11, q5
+ vext.8 q3, q6, q6, #8
+ veor q5, q9, q13
+ veor q11, q11, q2
+ veor q7, q7, q15
+ veor q6, q4, q14
+ veor q4, q8, q12
+ veor q2, q3, q10
+ vmov q3, q11
+ @ vmov q5, q9
+ vldmia r6, {q12} @ .LISR
+ ite eq @ Thumb2 thing, sanity check in ARM
+ addeq r6,r6,#0x10
+ bne .Ldec_loop
+ vldmia r6, {q12} @ .LISRM0
+ b .Ldec_loop
+.align 4
+.Ldec_done:
+ vmov.i8 q8,#0x55 @ compose .LBS0
+ vmov.i8 q9,#0x33 @ compose .LBS1
+ vshr.u64 q10, q3, #1
+ vshr.u64 q11, q2, #1
+ veor q10, q10, q5
+ veor q11, q11, q7
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #1
+ veor q7, q7, q11
+ vshl.u64 q11, q11, #1
+ veor q3, q3, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q6, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q4
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q4, q4, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q6, q6, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose .LBS2
+ vshr.u64 q10, q7, #2
+ vshr.u64 q11, q2, #2
+ veor q10, q10, q5
+ veor q11, q11, q3
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #2
+ veor q3, q3, q11
+ vshl.u64 q11, q11, #2
+ veor q7, q7, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q4
+ veor q11, q11, q6
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q4, q4, q10
+ vshl.u64 q10, q10, #2
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q4, #4
+ vshr.u64 q11, q6, #4
+ veor q10, q10, q5
+ veor q11, q11, q3
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q3, q3, q11
+ vshl.u64 q11, q11, #4
+ veor q4, q4, q10
+ veor q6, q6, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q7
+ veor q11, q11, q2
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vldmia r4, {q8} @ last round key
+ veor q6, q6, q8
+ veor q4, q4, q8
+ veor q2, q2, q8
+ veor q7, q7, q8
+ veor q3, q3, q8
+ veor q5, q5, q8
+ veor q0, q0, q8
+ veor q1, q1, q8
+ bx lr
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type _bsaes_const,%object
+.align 6
+_bsaes_const:
+.LM0ISR:@ InvShiftRows constants
+.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+.quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR:@ ShiftRows constants
+.quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+.quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+.quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+.quad 0x090d01050c000408, 0x03070b0f060a0e02
+.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 6
+.size _bsaes_const,.-_bsaes_const
+
+.type _bsaes_encrypt8,%function
+.align 4
+_bsaes_encrypt8:
+ adr r6,.
+ vldmia r4!, {q9} @ round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+ adr r6,.LM0SR
+#else
+ sub r6,r6,#_bsaes_encrypt8-.LM0SR
+#endif
+
+ vldmia r6!, {q8} @ .LM0SR
+_bsaes_encrypt8_alt:
+ veor q10, q0, q9 @ xor with round0 key
+ veor q11, q1, q9
+ vtbl.8 d0, {q10}, d16
+ vtbl.8 d1, {q10}, d17
+ veor q12, q2, q9
+ vtbl.8 d2, {q11}, d16
+ vtbl.8 d3, {q11}, d17
+ veor q13, q3, q9
+ vtbl.8 d4, {q12}, d16
+ vtbl.8 d5, {q12}, d17
+ veor q14, q4, q9
+ vtbl.8 d6, {q13}, d16
+ vtbl.8 d7, {q13}, d17
+ veor q15, q5, q9
+ vtbl.8 d8, {q14}, d16
+ vtbl.8 d9, {q14}, d17
+ veor q10, q6, q9
+ vtbl.8 d10, {q15}, d16
+ vtbl.8 d11, {q15}, d17
+ veor q11, q7, q9
+ vtbl.8 d12, {q10}, d16
+ vtbl.8 d13, {q10}, d17
+ vtbl.8 d14, {q11}, d16
+ vtbl.8 d15, {q11}, d17
+_bsaes_encrypt8_bitslice:
+ vmov.i8 q8,#0x55 @ compose .LBS0
+ vmov.i8 q9,#0x33 @ compose .LBS1
+ vshr.u64 q10, q6, #1
+ vshr.u64 q11, q4, #1
+ veor q10, q10, q7
+ veor q11, q11, q5
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #1
+ veor q5, q5, q11
+ vshl.u64 q11, q11, #1
+ veor q6, q6, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q2, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q3
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q2, q2, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose .LBS2
+ vshr.u64 q10, q5, #2
+ vshr.u64 q11, q4, #2
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #2
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #2
+ veor q5, q5, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q3
+ veor q11, q11, q2
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #2
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q3, #4
+ vshr.u64 q11, q2, #4
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #4
+ veor q3, q3, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q5
+ veor q11, q11, q4
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q4, q4, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ sub r5,r5,#1
+ b .Lenc_sbox
+.align 4
+.Lenc_loop:
+ vldmia r4!, {q8,q9,q10,q11}
+ veor q8, q8, q0
+ veor q9, q9, q1
+ vtbl.8 d0, {q8}, d24
+ vtbl.8 d1, {q8}, d25
+ vldmia r4!, {q8}
+ veor q10, q10, q2
+ vtbl.8 d2, {q9}, d24
+ vtbl.8 d3, {q9}, d25
+ vldmia r4!, {q9}
+ veor q11, q11, q3
+ vtbl.8 d4, {q10}, d24
+ vtbl.8 d5, {q10}, d25
+ vldmia r4!, {q10}
+ vtbl.8 d6, {q11}, d24
+ vtbl.8 d7, {q11}, d25
+ vldmia r4!, {q11}
+ veor q8, q8, q4
+ veor q9, q9, q5
+ vtbl.8 d8, {q8}, d24
+ vtbl.8 d9, {q8}, d25
+ veor q10, q10, q6
+ vtbl.8 d10, {q9}, d24
+ vtbl.8 d11, {q9}, d25
+ veor q11, q11, q7
+ vtbl.8 d12, {q10}, d24
+ vtbl.8 d13, {q10}, d25
+ vtbl.8 d14, {q11}, d24
+ vtbl.8 d15, {q11}, d25
+.Lenc_sbox:
+ veor q2, q2, q1
+ veor q5, q5, q6
+ veor q3, q3, q0
+ veor q6, q6, q2
+ veor q5, q5, q0
+
+ veor q6, q6, q3
+ veor q3, q3, q7
+ veor q7, q7, q5
+ veor q3, q3, q4
+ veor q4, q4, q5
+
+ veor q2, q2, q7
+ veor q3, q3, q1
+ veor q1, q1, q5
+ veor q11, q7, q4
+ veor q10, q1, q2
+ veor q9, q5, q3
+ veor q13, q2, q4
+ vmov q8, q10
+ veor q12, q6, q0
+
+ vorr q10, q10, q9
+ veor q15, q11, q8
+ vand q14, q11, q12
+ vorr q11, q11, q12
+ veor q12, q12, q9
+ vand q8, q8, q9
+ veor q9, q3, q0
+ vand q15, q15, q12
+ vand q13, q13, q9
+ veor q9, q7, q1
+ veor q12, q5, q6
+ veor q11, q11, q13
+ veor q10, q10, q13
+ vand q13, q9, q12
+ vorr q9, q9, q12
+ veor q11, q11, q15
+ veor q8, q8, q13
+ veor q10, q10, q14
+ veor q9, q9, q15
+ veor q8, q8, q14
+ vand q12, q2, q3
+ veor q9, q9, q14
+ vand q13, q4, q0
+ vand q14, q1, q5
+ vorr q15, q7, q6
+ veor q11, q11, q12
+ veor q9, q9, q14
+ veor q8, q8, q15
+ veor q10, q10, q13
+
+ @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
+
+ @ new smaller inversion
+
+ vand q14, q11, q9
+ vmov q12, q8
+
+ veor q13, q10, q14
+ veor q15, q8, q14
+ veor q14, q8, q14 @ q14=q15
+
+ vbsl q13, q9, q8
+ vbsl q15, q11, q10
+ veor q11, q11, q10
+
+ vbsl q12, q13, q14
+ vbsl q8, q14, q13
+
+ vand q14, q12, q15
+ veor q9, q9, q8
+
+ veor q14, q14, q11
+ veor q12, q6, q0
+ veor q8, q5, q3
+ veor q10, q15, q14
+ vand q10, q10, q6
+ veor q6, q6, q5
+ vand q11, q5, q15
+ vand q6, q6, q14
+ veor q5, q11, q10
+ veor q6, q6, q11
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q0
+ veor q12, q12, q8
+ veor q0, q0, q3
+ vand q8, q8, q15
+ vand q3, q3, q13
+ vand q12, q12, q14
+ vand q0, q0, q9
+ veor q8, q8, q12
+ veor q0, q0, q3
+ veor q12, q12, q11
+ veor q3, q3, q10
+ veor q6, q6, q12
+ veor q0, q0, q12
+ veor q5, q5, q8
+ veor q3, q3, q8
+
+ veor q12, q7, q4
+ veor q8, q1, q2
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q4
+ veor q12, q12, q8
+ veor q4, q4, q2
+ vand q8, q8, q15
+ vand q2, q2, q13
+ vand q12, q12, q14
+ vand q4, q4, q9
+ veor q8, q8, q12
+ veor q4, q4, q2
+ veor q12, q12, q11
+ veor q2, q2, q10
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q10, q15, q14
+ vand q10, q10, q7
+ veor q7, q7, q1
+ vand q11, q1, q15
+ vand q7, q7, q14
+ veor q1, q11, q10
+ veor q7, q7, q11
+ veor q7, q7, q12
+ veor q4, q4, q12
+ veor q1, q1, q8
+ veor q2, q2, q8
+ veor q7, q7, q0
+ veor q1, q1, q6
+ veor q6, q6, q0
+ veor q4, q4, q7
+ veor q0, q0, q1
+
+ veor q1, q1, q5
+ veor q5, q5, q2
+ veor q2, q2, q3
+ veor q3, q3, q5
+ veor q4, q4, q5
+
+ veor q6, q6, q3
+ subs r5,r5,#1
+ bcc .Lenc_done
+ vext.8 q8, q0, q0, #12 @ x0 <<< 32
+ vext.8 q9, q1, q1, #12
+ veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
+ vext.8 q10, q4, q4, #12
+ veor q1, q1, q9
+ vext.8 q11, q6, q6, #12
+ veor q4, q4, q10
+ vext.8 q12, q3, q3, #12
+ veor q6, q6, q11
+ vext.8 q13, q7, q7, #12
+ veor q3, q3, q12
+ vext.8 q14, q2, q2, #12
+ veor q7, q7, q13
+ vext.8 q15, q5, q5, #12
+ veor q2, q2, q14
+
+ veor q9, q9, q0
+ veor q5, q5, q15
+ vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
+ veor q10, q10, q1
+ veor q8, q8, q5
+ veor q9, q9, q5
+ vext.8 q1, q1, q1, #8
+ veor q13, q13, q3
+ veor q0, q0, q8
+ veor q14, q14, q7
+ veor q1, q1, q9
+ vext.8 q8, q3, q3, #8
+ veor q12, q12, q6
+ vext.8 q9, q7, q7, #8
+ veor q15, q15, q2
+ vext.8 q3, q6, q6, #8
+ veor q11, q11, q4
+ vext.8 q7, q5, q5, #8
+ veor q12, q12, q5
+ vext.8 q6, q2, q2, #8
+ veor q11, q11, q5
+ vext.8 q2, q4, q4, #8
+ veor q5, q9, q13
+ veor q4, q8, q12
+ veor q3, q3, q11
+ veor q7, q7, q15
+ veor q6, q6, q14
+ @ vmov q4, q8
+ veor q2, q2, q10
+ @ vmov q5, q9
+ vldmia r6, {q12} @ .LSR
+	ite	eq				@ Thumb2 thing, sanity check in ARM
+ addeq r6,r6,#0x10
+ bne .Lenc_loop
+ vldmia r6, {q12} @ .LSRM0
+ b .Lenc_loop
+.align 4
+.Lenc_done:
+ vmov.i8 q8,#0x55 @ compose .LBS0
+ vmov.i8 q9,#0x33 @ compose .LBS1
+ vshr.u64 q10, q2, #1
+ vshr.u64 q11, q3, #1
+ veor q10, q10, q5
+ veor q11, q11, q7
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #1
+ veor q7, q7, q11
+ vshl.u64 q11, q11, #1
+ veor q2, q2, q10
+ veor q3, q3, q11
+ vshr.u64 q10, q4, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q6
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q6, q6, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q4, q4, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose .LBS2
+ vshr.u64 q10, q7, #2
+ vshr.u64 q11, q3, #2
+ veor q10, q10, q5
+ veor q11, q11, q2
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #2
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #2
+ veor q7, q7, q10
+ veor q3, q3, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q6
+ veor q11, q11, q4
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q6, q6, q10
+ vshl.u64 q10, q10, #2
+ veor q4, q4, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q6, #4
+ vshr.u64 q11, q4, #4
+ veor q10, q10, q5
+ veor q11, q11, q2
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #4
+ veor q6, q6, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q7
+ veor q11, q11, q3
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q3, q3, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vldmia r4, {q8} @ last round key
+ veor q4, q4, q8
+ veor q6, q6, q8
+ veor q3, q3, q8
+ veor q7, q7, q8
+ veor q2, q2, q8
+ veor q5, q5, q8
+ veor q0, q0, q8
+ veor q1, q1, q8
+ bx lr
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+.type _bsaes_key_convert,%function
+.align 4
+_bsaes_key_convert:
+ adr r6,.
+ vld1.8 {q7}, [r4]! @ load round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+ adr r6,.LM0
+#else
+ sub r6,r6,#_bsaes_key_convert-.LM0
+#endif
+ vld1.8 {q15}, [r4]! @ load round 1 key
+
+ vmov.i8 q8, #0x01 @ bit masks
+ vmov.i8 q9, #0x02
+ vmov.i8 q10, #0x04
+ vmov.i8 q11, #0x08
+ vmov.i8 q12, #0x10
+ vmov.i8 q13, #0x20
+ vldmia r6, {q14} @ .LM0
+
+#ifdef __ARMEL__
+ vrev32.8 q7, q7
+ vrev32.8 q15, q15
+#endif
+ sub r5,r5,#1
+ vstmia r12!, {q7} @ save round 0 key
+ b .Lkey_loop
+
+.align 4
+.Lkey_loop:
+ vtbl.8 d14,{q15},d28
+ vtbl.8 d15,{q15},d29
+ vmov.i8 q6, #0x40
+ vmov.i8 q15, #0x80
+
+ vtst.8 q0, q7, q8
+ vtst.8 q1, q7, q9
+ vtst.8 q2, q7, q10
+ vtst.8 q3, q7, q11
+ vtst.8 q4, q7, q12
+ vtst.8 q5, q7, q13
+ vtst.8 q6, q7, q6
+ vtst.8 q7, q7, q15
+ vld1.8 {q15}, [r4]! @ load next round key
+ vmvn q0, q0 @ "pnot"
+ vmvn q1, q1
+ vmvn q5, q5
+ vmvn q6, q6
+#ifdef __ARMEL__
+ vrev32.8 q15, q15
+#endif
+ subs r5,r5,#1
+ vstmia r12!,{q0,q1,q2,q3,q4,q5,q6,q7} @ write bit-sliced round key
+ bne .Lkey_loop
+
+ vmov.i8 q7,#0x63 @ compose .L63
+ @ don't save last round key
+ bx lr
+.size _bsaes_key_convert,.-_bsaes_key_convert
+.globl bsaes_cbc_encrypt
+.hidden bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,%function
+.align 5
+bsaes_cbc_encrypt:
+ @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for
+ @ short inputs. We patch this out, using bsaes for all input sizes.
+
+ @ it is up to the caller to make sure we are called with enc == 0
+
+ mov ip, sp
+ stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
+ VFP_ABI_PUSH
+ ldr r8, [ip] @ IV is 1st arg on the stack
+ mov r2, r2, lsr#4 @ len in 16 byte blocks
+ sub sp, #0x10 @ scratch space to carry over the IV
+ mov r9, sp @ save sp
+
+ ldr r10, [r3, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
+	add	r12, #96			@ size of bit-sliced key schedule
+
+ @ populate the key schedule
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ mov sp, r12 @ sp is sp
+ bl _bsaes_key_convert
+ vldmia sp, {q6}
+ vstmia r12, {q15} @ save last round key
+ veor q7, q7, q6 @ fix up round 0 key
+ vstmia sp, {q7}
+#else
+ ldr r12, [r3, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [r3, #244]
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ add r12, r3, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, r3, #248
+ vldmia r4, {q6}
+ vstmia r12, {q15} @ save last round key
+ veor q7, q7, q6 @ fix up round 0 key
+ vstmia r4, {q7}
+
+.align 2
+
+#endif
+
+ vld1.8 {q15}, [r8] @ load IV
+ b .Lcbc_dec_loop
+
+.align 4
+.Lcbc_dec_loop:
+ subs r2, r2, #0x8
+ bmi .Lcbc_dec_loop_finish
+
+ vld1.8 {q0,q1}, [r0]! @ load input
+ vld1.8 {q2,q3}, [r0]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, sp @ pass the key
+#else
+ add r4, r3, #248
+#endif
+ vld1.8 {q4,q5}, [r0]!
+ mov r5, r10
+ vld1.8 {q6,q7}, [r0]
+ sub r0, r0, #0x60
+ vstmia r9, {q15} @ put aside IV
+
+ bl _bsaes_decrypt8
+
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8,q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10,q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q12,q13}, [r0]!
+ veor q4, q4, q10
+ veor q2, q2, q11
+ vld1.8 {q14,q15}, [r0]!
+ veor q7, q7, q12
+ vst1.8 {q0,q1}, [r1]! @ write output
+ veor q3, q3, q13
+ vst1.8 {q6}, [r1]!
+ veor q5, q5, q14
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ vst1.8 {q7}, [r1]!
+ vst1.8 {q3}, [r1]!
+ vst1.8 {q5}, [r1]!
+
+ b .Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+ adds r2, r2, #8
+ beq .Lcbc_dec_done
+
+ @ Set up most parameters for the _bsaes_decrypt8 call.
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, sp @ pass the key
+#else
+ add r4, r3, #248
+#endif
+ mov r5, r10
+ vstmia r9, {q15} @ put aside IV
+
+ vld1.8 {q0}, [r0]! @ load input
+ cmp r2, #2
+ blo .Lcbc_dec_one
+ vld1.8 {q1}, [r0]!
+ beq .Lcbc_dec_two
+ vld1.8 {q2}, [r0]!
+ cmp r2, #4
+ blo .Lcbc_dec_three
+ vld1.8 {q3}, [r0]!
+ beq .Lcbc_dec_four
+ vld1.8 {q4}, [r0]!
+ cmp r2, #6
+ blo .Lcbc_dec_five
+ vld1.8 {q5}, [r0]!
+ beq .Lcbc_dec_six
+ vld1.8 {q6}, [r0]!
+ sub r0, r0, #0x70
+
+ bl _bsaes_decrypt8
+
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8,q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10,q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q12,q13}, [r0]!
+ veor q4, q4, q10
+ veor q2, q2, q11
+ vld1.8 {q15}, [r0]!
+ veor q7, q7, q12
+ vst1.8 {q0,q1}, [r1]! @ write output
+ veor q3, q3, q13
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ vst1.8 {q7}, [r1]!
+ vst1.8 {q3}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_six:
+ sub r0, r0, #0x60
+ bl _bsaes_decrypt8
+ vldmia r9,{q14} @ reload IV
+ vld1.8 {q8,q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10,q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q12}, [r0]!
+ veor q4, q4, q10
+ veor q2, q2, q11
+ vld1.8 {q15}, [r0]!
+ veor q7, q7, q12
+ vst1.8 {q0,q1}, [r1]! @ write output
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ vst1.8 {q7}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_five:
+ sub r0, r0, #0x50
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8,q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10,q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q15}, [r0]!
+ veor q4, q4, q10
+ vst1.8 {q0,q1}, [r1]! @ write output
+ veor q2, q2, q11
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_four:
+ sub r0, r0, #0x40
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8,q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q15}, [r0]!
+ veor q4, q4, q10
+ vst1.8 {q0,q1}, [r1]! @ write output
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_three:
+ sub r0, r0, #0x30
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8,q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q15}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vst1.8 {q0,q1}, [r1]! @ write output
+ vst1.8 {q6}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_two:
+ sub r0, r0, #0x20
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q15}, [r0]! @ reload input
+ veor q1, q1, q8
+ vst1.8 {q0,q1}, [r1]! @ write output
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_one:
+ sub r0, r0, #0x10
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q15}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vst1.8 {q0}, [r1]! @ write output
+
+.Lcbc_dec_done:
+#ifndef BSAES_ASM_EXTENDED_KEY
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+.Lcbc_dec_bzero:@ wipe key schedule [if any]
+ vstmia sp!, {q0,q1}
+ cmp sp, r9
+ bne .Lcbc_dec_bzero
+#endif
+
+ mov sp, r9
+ add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
+ vst1.8 {q15}, [r8] @ return IV
+ VFP_ABI_POP
+ ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc}
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+.globl bsaes_ctr32_encrypt_blocks
+.hidden bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,%function
+.align 5
+bsaes_ctr32_encrypt_blocks:
+ @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this
+ @ out to retain a constant-time implementation.
+ mov ip, sp
+ stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
+ VFP_ABI_PUSH
+ ldr r8, [ip] @ ctr is 1st arg on the stack
+ sub sp, sp, #0x10 @ scratch space to carry over the ctr
+ mov r9, sp @ save sp
+
+ ldr r10, [r3, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
+ add r12, #96 @ size of bit-sliced key schedule
+
+ @ populate the key schedule
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ mov sp, r12 @ sp is sp
+ bl _bsaes_key_convert
+ veor q7,q7,q15 @ fix up last round key
+ vstmia r12, {q7} @ save last round key
+
+ vld1.8 {q0}, [r8] @ load counter
+#ifdef __APPLE__
+ mov r8, #:lower16:(.LREVM0SR-.LM0)
+ add r8, r6, r8
+#else
+ add r8, r6, #.LREVM0SR-.LM0 @ borrow r8
+#endif
+ vldmia sp, {q4} @ load round0 key
+#else
+ ldr r12, [r3, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [r3, #244]
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ add r12, r3, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor q7,q7,q15 @ fix up last round key
+ vstmia r12, {q7} @ save last round key
+
+.align 2
+ add r12, r3, #248
+ vld1.8 {q0}, [r8] @ load counter
+ adrl r8, .LREVM0SR @ borrow r8
+ vldmia r12, {q4} @ load round0 key
+ sub sp, #0x10 @ place for adjusted round0 key
+#endif
+
+ vmov.i32 q8,#1 @ compose 1<<96
+ veor q9,q9,q9
+ vrev32.8 q0,q0
+ vext.8 q8,q9,q8,#4
+ vrev32.8 q4,q4
+ vadd.u32 q9,q8,q8 @ compose 2<<96
+ vstmia sp, {q4} @ save adjusted round0 key
+ b .Lctr_enc_loop
+
+.align 4
+.Lctr_enc_loop:
+ vadd.u32 q10, q8, q9 @ compose 3<<96
+ vadd.u32 q1, q0, q8 @ +1
+ vadd.u32 q2, q0, q9 @ +2
+ vadd.u32 q3, q0, q10 @ +3
+ vadd.u32 q4, q1, q10
+ vadd.u32 q5, q2, q10
+ vadd.u32 q6, q3, q10
+ vadd.u32 q7, q4, q10
+ vadd.u32 q10, q5, q10 @ next counter
+
+ @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+ @ to flip byte order in 32-bit counter
+
+ vldmia sp, {q9} @ load round0 key
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x10 @ pass next round key
+#else
+ add r4, r3, #264
+#endif
+ vldmia r8, {q8} @ .LREVM0SR
+ mov r5, r10 @ pass rounds
+ vstmia r9, {q10} @ save next counter
+#ifdef __APPLE__
+ mov r6, #:lower16:(.LREVM0SR-.LSR)
+ sub r6, r8, r6
+#else
+ sub r6, r8, #.LREVM0SR-.LSR @ pass constants
+#endif
+
+ bl _bsaes_encrypt8_alt
+
+ subs r2, r2, #8
+ blo .Lctr_enc_loop_done
+
+ vld1.8 {q8,q9}, [r0]! @ load input
+ vld1.8 {q10,q11}, [r0]!
+ veor q0, q8
+ veor q1, q9
+ vld1.8 {q12,q13}, [r0]!
+ veor q4, q10
+ veor q6, q11
+ vld1.8 {q14,q15}, [r0]!
+ veor q3, q12
+ vst1.8 {q0,q1}, [r1]! @ write output
+ veor q7, q13
+ veor q2, q14
+ vst1.8 {q4}, [r1]!
+ veor q5, q15
+ vst1.8 {q6}, [r1]!
+ vmov.i32 q8, #1 @ compose 1<<96
+ vst1.8 {q3}, [r1]!
+ veor q9, q9, q9
+ vst1.8 {q7}, [r1]!
+ vext.8 q8, q9, q8, #4
+ vst1.8 {q2}, [r1]!
+ vadd.u32 q9,q8,q8 @ compose 2<<96
+ vst1.8 {q5}, [r1]!
+ vldmia r9, {q0} @ load counter
+
+ bne .Lctr_enc_loop
+ b .Lctr_enc_done
+
+.align 4
+.Lctr_enc_loop_done:
+ add r2, r2, #8
+ vld1.8 {q8}, [r0]! @ load input
+ veor q0, q8
+ vst1.8 {q0}, [r1]! @ write output
+ cmp r2, #2
+ blo .Lctr_enc_done
+ vld1.8 {q9}, [r0]!
+ veor q1, q9
+ vst1.8 {q1}, [r1]!
+ beq .Lctr_enc_done
+ vld1.8 {q10}, [r0]!
+ veor q4, q10
+ vst1.8 {q4}, [r1]!
+ cmp r2, #4
+ blo .Lctr_enc_done
+ vld1.8 {q11}, [r0]!
+ veor q6, q11
+ vst1.8 {q6}, [r1]!
+ beq .Lctr_enc_done
+ vld1.8 {q12}, [r0]!
+ veor q3, q12
+ vst1.8 {q3}, [r1]!
+ cmp r2, #6
+ blo .Lctr_enc_done
+ vld1.8 {q13}, [r0]!
+ veor q7, q13
+ vst1.8 {q7}, [r1]!
+ beq .Lctr_enc_done
+ vld1.8 {q14}, [r0]
+ veor q2, q14
+ vst1.8 {q2}, [r1]!
+
+.Lctr_enc_done:
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifndef BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero:@ wipe key schedule [if any]
+ vstmia sp!, {q0,q1}
+ cmp sp, r9
+ bne .Lctr_enc_bzero
+#else
+ vstmia sp, {q0,q1}
+#endif
+
+ mov sp, r9
+ add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
+ VFP_ABI_POP
+ ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return
+
+ @ OpenSSL contains aes_nohw_* fallback code here. We patch this
+ @ out to retain a constant-time implementation.
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+#endif
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/bcm/co-586-apple.S b/gen/bcm/co-586-apple.S
new file mode 100644
index 0000000..ab985ee
--- /dev/null
+++ b/gen/bcm/co-586-apple.S
@@ -0,0 +1,1256 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl _bn_mul_comba8
+.private_extern _bn_mul_comba8
+.align 4
+_bn_mul_comba8:
+L_bn_mul_comba8_begin:
+ pushl %esi
+ movl 12(%esp),%esi
+ pushl %edi
+ movl 20(%esp),%edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx,%ebx
+ movl (%esi),%eax
+ xorl %ecx,%ecx
+ movl (%edi),%edx
+ # ################## Calculate word 0
+ xorl %ebp,%ebp
+ # mul a[0]*b[0]
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%eax)
+ movl 4(%esi),%eax
+ # saved r[0]
+ # ################## Calculate word 1
+ xorl %ebx,%ebx
+ # mul a[1]*b[0]
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+ # mul a[0]*b[1]
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,4(%eax)
+ movl 8(%esi),%eax
+ # saved r[1]
+ # ################## Calculate word 2
+ xorl %ecx,%ecx
+ # mul a[2]*b[0]
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+ # mul a[1]*b[1]
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+ # mul a[0]*b[2]
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%eax)
+ movl 12(%esi),%eax
+ # saved r[2]
+ # ################## Calculate word 3
+ xorl %ebp,%ebp
+ # mul a[3]*b[0]
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+ # mul a[2]*b[1]
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+ # mul a[1]*b[2]
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+ # mul a[0]*b[3]
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,12(%eax)
+ movl 16(%esi),%eax
+ # saved r[3]
+ # ################## Calculate word 4
+ xorl %ebx,%ebx
+ # mul a[4]*b[0]
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+ # mul a[3]*b[1]
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+ # mul a[2]*b[2]
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+ # mul a[1]*b[3]
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+ # mul a[0]*b[4]
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%eax)
+ movl 20(%esi),%eax
+ # saved r[4]
+ # ################## Calculate word 5
+ xorl %ecx,%ecx
+ # mul a[5]*b[0]
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+ # mul a[4]*b[1]
+ mull %edx
+ addl %eax,%ebp
+ movl 12(%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+ # mul a[3]*b[2]
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+ # mul a[2]*b[3]
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 16(%edi),%edx
+ adcl $0,%ecx
+ # mul a[1]*b[4]
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+ # mul a[0]*b[5]
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,20(%eax)
+ movl 24(%esi),%eax
+ # saved r[5]
+ # ################## Calculate word 6
+ xorl %ebp,%ebp
+ # mul a[6]*b[0]
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+ # mul a[5]*b[1]
+ mull %edx
+ addl %eax,%ebx
+ movl 16(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+ # mul a[4]*b[2]
+ mull %edx
+ addl %eax,%ebx
+ movl 12(%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+ # mul a[3]*b[3]
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 16(%edi),%edx
+ adcl $0,%ebp
+ # mul a[2]*b[4]
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 20(%edi),%edx
+ adcl $0,%ebp
+ # mul a[1]*b[5]
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+ # mul a[0]*b[6]
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,24(%eax)
+ movl 28(%esi),%eax
+ # saved r[6]
+ # ################## Calculate word 7
+ xorl %ebx,%ebx
+ # mul a[7]*b[0]
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+ # mul a[6]*b[1]
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+ # mul a[5]*b[2]
+ mull %edx
+ addl %eax,%ecx
+ movl 16(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+ # mul a[4]*b[3]
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+ # mul a[3]*b[4]
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 20(%edi),%edx
+ adcl $0,%ebx
+ # mul a[2]*b[5]
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 24(%edi),%edx
+ adcl $0,%ebx
+ # mul a[1]*b[6]
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+ # mul a[0]*b[7]
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,28(%eax)
+ movl 28(%esi),%eax
+ # saved r[7]
+ # ################## Calculate word 8
+ xorl %ecx,%ecx
+ # mul a[7]*b[1]
+ mull %edx
+ addl %eax,%ebp
+ movl 24(%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+ # mul a[6]*b[2]
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+ # mul a[5]*b[3]
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 16(%edi),%edx
+ adcl $0,%ecx
+ # mul a[4]*b[4]
+ mull %edx
+ addl %eax,%ebp
+ movl 12(%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+ # mul a[3]*b[5]
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 24(%edi),%edx
+ adcl $0,%ecx
+ # mul a[2]*b[6]
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 28(%edi),%edx
+ adcl $0,%ecx
+ # mul a[1]*b[7]
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,32(%eax)
+ movl 28(%esi),%eax
+ # saved r[8]
+ # ################## Calculate word 9
+ xorl %ebp,%ebp
+ # mul a[7]*b[2]
+ mull %edx
+ addl %eax,%ebx
+ movl 24(%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+ # mul a[6]*b[3]
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 16(%edi),%edx
+ adcl $0,%ebp
+ # mul a[5]*b[4]
+ mull %edx
+ addl %eax,%ebx
+ movl 16(%esi),%eax
+ adcl %edx,%ecx
+ movl 20(%edi),%edx
+ adcl $0,%ebp
+ # mul a[4]*b[5]
+ mull %edx
+ addl %eax,%ebx
+ movl 12(%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+ # mul a[3]*b[6]
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 28(%edi),%edx
+ adcl $0,%ebp
+ # mul a[2]*b[7]
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,36(%eax)
+ movl 28(%esi),%eax
+ # saved r[9]
+ # ################## Calculate word 10
+ xorl %ebx,%ebx
+ # mul a[7]*b[3]
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+ # mul a[6]*b[4]
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esi),%eax
+ adcl %edx,%ebp
+ movl 20(%edi),%edx
+ adcl $0,%ebx
+ # mul a[5]*b[5]
+ mull %edx
+ addl %eax,%ecx
+ movl 16(%esi),%eax
+ adcl %edx,%ebp
+ movl 24(%edi),%edx
+ adcl $0,%ebx
+ # mul a[4]*b[6]
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+ # mul a[3]*b[7]
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,40(%eax)
+ movl 28(%esi),%eax
+ # saved r[10]
+ # ################## Calculate word 11
+ xorl %ecx,%ecx
+ # mul a[7]*b[4]
+ mull %edx
+ addl %eax,%ebp
+ movl 24(%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+ # mul a[6]*b[5]
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esi),%eax
+ adcl %edx,%ebx
+ movl 24(%edi),%edx
+ adcl $0,%ecx
+ # mul a[5]*b[6]
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 28(%edi),%edx
+ adcl $0,%ecx
+ # mul a[4]*b[7]
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,44(%eax)
+ movl 28(%esi),%eax
+ # saved r[11]
+ # ################## Calculate word 12
+ xorl %ebp,%ebp
+ # mul a[7]*b[5]
+ mull %edx
+ addl %eax,%ebx
+ movl 24(%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+ # mul a[6]*b[6]
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 28(%edi),%edx
+ adcl $0,%ebp
+ # mul a[5]*b[7]
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,48(%eax)
+ movl 28(%esi),%eax
+ # saved r[12]
+ # ################## Calculate word 13
+ xorl %ebx,%ebx
+ # mul a[7]*b[6]
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+ # mul a[6]*b[7]
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,52(%eax)
+ movl 28(%esi),%eax
+ # saved r[13]
+ # ################## Calculate word 14
+ xorl %ecx,%ecx
+ # mul a[7]*b[7]
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ adcl $0,%ecx
+ movl %ebp,56(%eax)
+ # saved r[14]
+ # save r[15]
+ movl %ebx,60(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.globl _bn_mul_comba4
+.private_extern _bn_mul_comba4
+.align 4
+_bn_mul_comba4:
+L_bn_mul_comba4_begin:
+ pushl %esi
+ movl 12(%esp),%esi
+ pushl %edi
+ movl 20(%esp),%edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx,%ebx
+ movl (%esi),%eax
+ xorl %ecx,%ecx
+ movl (%edi),%edx
+ # ################## Calculate word 0
+ xorl %ebp,%ebp
+ # mul a[0]*b[0]
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%eax)
+ movl 4(%esi),%eax
+ # saved r[0]
+ # ################## Calculate word 1
+ xorl %ebx,%ebx
+ # mul a[1]*b[0]
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+ # mul a[0]*b[1]
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,4(%eax)
+ movl 8(%esi),%eax
+ # saved r[1]
+ # ################## Calculate word 2
+ xorl %ecx,%ecx
+ # mul a[2]*b[0]
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+ # mul a[1]*b[1]
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+ # mul a[0]*b[2]
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%eax)
+ movl 12(%esi),%eax
+ # saved r[2]
+ # ################## Calculate word 3
+ xorl %ebp,%ebp
+ # mul a[3]*b[0]
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+ # mul a[2]*b[1]
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+ # mul a[1]*b[2]
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+ # mul a[0]*b[3]
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,12(%eax)
+ movl 12(%esi),%eax
+ # saved r[3]
+ # ################## Calculate word 4
+ xorl %ebx,%ebx
+ # mul a[3]*b[1]
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+ # mul a[2]*b[2]
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+ # mul a[1]*b[3]
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%eax)
+ movl 12(%esi),%eax
+ # saved r[4]
+ # ################## Calculate word 5
+ xorl %ecx,%ecx
+ # mul a[3]*b[2]
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+ # mul a[2]*b[3]
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,20(%eax)
+ movl 12(%esi),%eax
+ # saved r[5]
+ # ################## Calculate word 6
+ xorl %ebp,%ebp
+ # mul a[3]*b[3]
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ adcl $0,%ebp
+ movl %ebx,24(%eax)
+ # saved r[6]
+ # save r[7]
+ movl %ecx,28(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.globl _bn_sqr_comba8
+.private_extern _bn_sqr_comba8
+.align 4
+_bn_sqr_comba8:
+L_bn_sqr_comba8_begin:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ xorl %ebx,%ebx
+ xorl %ecx,%ecx
+ movl (%esi),%eax
+ # ############### Calculate word 0
+ xorl %ebp,%ebp
+ # sqr a[0]*a[0]
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%edi)
+ movl 4(%esi),%eax
+ # saved r[0]
+ # ############### Calculate word 1
+ xorl %ebx,%ebx
+ # sqr a[1]*a[0]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,4(%edi)
+ movl (%esi),%edx
+ # saved r[1]
+ # ############### Calculate word 2
+ xorl %ecx,%ecx
+ # sqr a[2]*a[0]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 4(%esi),%eax
+ adcl $0,%ecx
+ # sqr a[1]*a[1]
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl (%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%edi)
+ movl 12(%esi),%eax
+ # saved r[2]
+ # ############### Calculate word 3
+ xorl %ebp,%ebp
+ # sqr a[3]*a[0]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 8(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+ # sqr a[2]*a[1]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 16(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,12(%edi)
+ movl (%esi),%edx
+ # saved r[3]
+ # ############### Calculate word 4
+ xorl %ebx,%ebx
+ # sqr a[4]*a[0]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 12(%esi),%eax
+ adcl $0,%ebx
+ movl 4(%esi),%edx
+ # sqr a[3]*a[1]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+ # sqr a[2]*a[2]
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl (%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%edi)
+ movl 20(%esi),%eax
+ # saved r[4]
+ # ############### Calculate word 5
+ xorl %ecx,%ecx
+ # sqr a[5]*a[0]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 16(%esi),%eax
+ adcl $0,%ecx
+ movl 4(%esi),%edx
+ # sqr a[4]*a[1]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 12(%esi),%eax
+ adcl $0,%ecx
+ movl 8(%esi),%edx
+ # sqr a[3]*a[2]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,20(%edi)
+ movl (%esi),%edx
+ # saved r[5]
+ # ############### Calculate word 6
+ xorl %ebp,%ebp
+ # sqr a[6]*a[0]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 20(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+ # sqr a[5]*a[1]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 16(%esi),%eax
+ adcl $0,%ebp
+ movl 8(%esi),%edx
+ # sqr a[4]*a[2]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 12(%esi),%eax
+ adcl $0,%ebp
+ # sqr a[3]*a[3]
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,24(%edi)
+ movl 28(%esi),%eax
+ # saved r[6]
+ # ############### Calculate word 7
+ xorl %ebx,%ebx
+ # sqr a[7]*a[0]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 24(%esi),%eax
+ adcl $0,%ebx
+ movl 4(%esi),%edx
+ # sqr a[6]*a[1]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 20(%esi),%eax
+ adcl $0,%ebx
+ movl 8(%esi),%edx
+ # sqr a[5]*a[2]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 16(%esi),%eax
+ adcl $0,%ebx
+ movl 12(%esi),%edx
+ # sqr a[4]*a[3]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 28(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,28(%edi)
+ movl 4(%esi),%edx
+ # saved r[7]
+ # ############### Calculate word 8
+ xorl %ecx,%ecx
+ # sqr a[7]*a[1]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl 8(%esi),%edx
+ # sqr a[6]*a[2]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 20(%esi),%eax
+ adcl $0,%ecx
+ movl 12(%esi),%edx
+ # sqr a[5]*a[3]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 16(%esi),%eax
+ adcl $0,%ecx
+ # sqr a[4]*a[4]
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 8(%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,32(%edi)
+ movl 28(%esi),%eax
+ # saved r[8]
+ # ############### Calculate word 9
+ xorl %ebp,%ebp
+ # sqr a[7]*a[2]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%eax
+ adcl $0,%ebp
+ movl 12(%esi),%edx
+ # sqr a[6]*a[3]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 20(%esi),%eax
+ adcl $0,%ebp
+ movl 16(%esi),%edx
+ # sqr a[5]*a[4]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 28(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,36(%edi)
+ movl 12(%esi),%edx
+ # saved r[9]
+ # ############### Calculate word 10
+ xorl %ebx,%ebx
+ # sqr a[7]*a[3]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 24(%esi),%eax
+ adcl $0,%ebx
+ movl 16(%esi),%edx
+ # sqr a[6]*a[4]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 20(%esi),%eax
+ adcl $0,%ebx
+ # sqr a[5]*a[5]
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 16(%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,40(%edi)
+ movl 28(%esi),%eax
+ # saved r[10]
+ # ############### Calculate word 11
+ xorl %ecx,%ecx
+ # sqr a[7]*a[4]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl 20(%esi),%edx
+ # sqr a[6]*a[5]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 28(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,44(%edi)
+ movl 20(%esi),%edx
+ # saved r[11]
+ # ############### Calculate word 12
+ xorl %ebp,%ebp
+ # sqr a[7]*a[5]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%eax
+ adcl $0,%ebp
+ # sqr a[6]*a[6]
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,48(%edi)
+ movl 28(%esi),%eax
+ # saved r[12]
+ # ############### Calculate word 13
+ xorl %ebx,%ebx
+ # sqr a[7]*a[6]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 28(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,52(%edi)
+ # saved r[13]
+ # ############### Calculate word 14
+ xorl %ecx,%ecx
+ # sqr a[7]*a[7]
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ adcl $0,%ecx
+ movl %ebp,56(%edi)
+ # saved r[14]
+ movl %ebx,60(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.globl _bn_sqr_comba4
+.private_extern _bn_sqr_comba4
+.align 4
+_bn_sqr_comba4:
+L_bn_sqr_comba4_begin:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ xorl %ebx,%ebx
+ xorl %ecx,%ecx
+ movl (%esi),%eax
+ # ############### Calculate word 0
+ xorl %ebp,%ebp
+ # sqr a[0]*a[0]
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%edi)
+ movl 4(%esi),%eax
+ # saved r[0]
+ # ############### Calculate word 1
+ xorl %ebx,%ebx
+ # sqr a[1]*a[0]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,4(%edi)
+ movl (%esi),%edx
+ # saved r[1]
+ # ############### Calculate word 2
+ xorl %ecx,%ecx
+ # sqr a[2]*a[0]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 4(%esi),%eax
+ adcl $0,%ecx
+ # sqr a[1]*a[1]
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl (%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%edi)
+ movl 12(%esi),%eax
+ # saved r[2]
+ # ############### Calculate word 3
+ xorl %ebp,%ebp
+ # sqr a[3]*a[0]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 8(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+ # sqr a[2]*a[1]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 12(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,12(%edi)
+ movl 4(%esi),%edx
+ # saved r[3]
+ # ############### Calculate word 4
+ xorl %ebx,%ebx
+ # sqr a[3]*a[1]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+ # sqr a[2]*a[2]
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%edi)
+ movl 12(%esi),%eax
+ # saved r[4]
+ # ############### Calculate word 5
+ xorl %ecx,%ecx
+ # sqr a[3]*a[2]
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 12(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,20(%edi)
+ # saved r[5]
+ # ############### Calculate word 6
+ xorl %ebp,%ebp
+ # sqr a[3]*a[3]
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ adcl $0,%ebp
+ movl %ebx,24(%edi)
+ # saved r[6]
+ movl %ecx,28(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/co-586-linux.S b/gen/bcm/co-586-linux.S
new file mode 100644
index 0000000..b4812e3
--- /dev/null
+++ b/gen/bcm/co-586-linux.S
@@ -0,0 +1,1264 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl bn_mul_comba8
+.hidden bn_mul_comba8
+.type bn_mul_comba8,@function
+.align 16
+bn_mul_comba8:
+.L_bn_mul_comba8_begin:
+ pushl %esi
+ movl 12(%esp),%esi
+ pushl %edi
+ movl 20(%esp),%edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx,%ebx
+ movl (%esi),%eax
+ xorl %ecx,%ecx
+ movl (%edi),%edx
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%eax)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,4(%eax)
+ movl 8(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,12(%eax)
+ movl 16(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%eax)
+ movl 20(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 12(%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 16(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,20(%eax)
+ movl 24(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 16(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 12(%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 16(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 20(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,24(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 16(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 20(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 24(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,28(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 24(%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 16(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 12(%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 24(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 28(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,32(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 24(%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 16(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 16(%esi),%eax
+ adcl %edx,%ecx
+ movl 20(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 12(%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 28(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,36(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esi),%eax
+ adcl %edx,%ebp
+ movl 20(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 16(%esi),%eax
+ adcl %edx,%ebp
+ movl 24(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,40(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 24(%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esi),%eax
+ adcl %edx,%ebx
+ movl 24(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 28(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,44(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 24(%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 28(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,48(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,52(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ adcl $0,%ecx
+ movl %ebp,56(%eax)
+
+
+ movl %ebx,60(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_mul_comba8,.-.L_bn_mul_comba8_begin
+.globl bn_mul_comba4
+.hidden bn_mul_comba4
+.type bn_mul_comba4,@function
+.align 16
+bn_mul_comba4:
+.L_bn_mul_comba4_begin:
+ pushl %esi
+ movl 12(%esp),%esi
+ pushl %edi
+ movl 20(%esp),%edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx,%ebx
+ movl (%esi),%eax
+ xorl %ecx,%ecx
+ movl (%edi),%edx
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%eax)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,4(%eax)
+ movl 8(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,12(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,20(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ adcl $0,%ebp
+ movl %ebx,24(%eax)
+
+
+ movl %ecx,28(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_mul_comba4,.-.L_bn_mul_comba4_begin
+.globl bn_sqr_comba8
+.hidden bn_sqr_comba8
+.type bn_sqr_comba8,@function
+.align 16
+bn_sqr_comba8:
+.L_bn_sqr_comba8_begin:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ xorl %ebx,%ebx
+ xorl %ecx,%ecx
+ movl (%esi),%eax
+
+ xorl %ebp,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%edi)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,4(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 4(%esi),%eax
+ adcl $0,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl (%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%edi)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 8(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 16(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,12(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 12(%esi),%eax
+ adcl $0,%ebx
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl (%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%edi)
+ movl 20(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 16(%esi),%eax
+ adcl $0,%ecx
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 12(%esi),%eax
+ adcl $0,%ecx
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,20(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 20(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 16(%esi),%eax
+ adcl $0,%ebp
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 12(%esi),%eax
+ adcl $0,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,24(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 24(%esi),%eax
+ adcl $0,%ebx
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 20(%esi),%eax
+ adcl $0,%ebx
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 16(%esi),%eax
+ adcl $0,%ebx
+ movl 12(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 28(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,28(%edi)
+ movl 4(%esi),%edx
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 20(%esi),%eax
+ adcl $0,%ecx
+ movl 12(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 16(%esi),%eax
+ adcl $0,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 8(%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,32(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%eax
+ adcl $0,%ebp
+ movl 12(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 20(%esi),%eax
+ adcl $0,%ebp
+ movl 16(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 28(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,36(%edi)
+ movl 12(%esi),%edx
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 24(%esi),%eax
+ adcl $0,%ebx
+ movl 16(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 20(%esi),%eax
+ adcl $0,%ebx
+
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 16(%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,40(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl 20(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 28(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,44(%edi)
+ movl 20(%esi),%edx
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%eax
+ adcl $0,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,48(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 28(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,52(%edi)
+
+
+ xorl %ecx,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ adcl $0,%ecx
+ movl %ebp,56(%edi)
+
+ movl %ebx,60(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_sqr_comba8,.-.L_bn_sqr_comba8_begin
+.globl bn_sqr_comba4
+.hidden bn_sqr_comba4
+.type bn_sqr_comba4,@function
+.align 16
+bn_sqr_comba4:
+.L_bn_sqr_comba4_begin:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ xorl %ebx,%ebx
+ xorl %ecx,%ecx
+ movl (%esi),%eax
+
+ xorl %ebp,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%edi)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,4(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 4(%esi),%eax
+ adcl $0,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl (%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%edi)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 8(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 12(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,12(%edi)
+ movl 4(%esi),%edx
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%edi)
+ movl 12(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 12(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,20(%edi)
+
+
+ xorl %ebp,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ adcl $0,%ebp
+ movl %ebx,24(%edi)
+
+ movl %ecx,28(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_sqr_comba4,.-.L_bn_sqr_comba4_begin
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/co-586-win.asm b/gen/bcm/co-586-win.asm
new file mode 100644
index 0000000..6ad4696
--- /dev/null
+++ b/gen/bcm/co-586-win.asm
@@ -0,0 +1,1263 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section code use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section .text code align=64
+%else
+section .text code
+%endif
+global _bn_mul_comba8
+align 16
+_bn_mul_comba8:
+L$_bn_mul_comba8_begin:
+ push esi
+ mov esi,DWORD [12+esp]
+ push edi
+ mov edi,DWORD [20+esp]
+ push ebp
+ push ebx
+ xor ebx,ebx
+ mov eax,DWORD [esi]
+ xor ecx,ecx
+ mov edx,DWORD [edi]
+ ; ################## Calculate word 0
+ xor ebp,ebp
+ ; mul a[0]*b[0]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [20+esp]
+ adc ecx,edx
+ mov edx,DWORD [edi]
+ adc ebp,0
+ mov DWORD [eax],ebx
+ mov eax,DWORD [4+esi]
+ ; saved r[0]
+ ; ################## Calculate word 1
+ xor ebx,ebx
+ ; mul a[1]*b[0]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [esi]
+ adc ebp,edx
+ mov edx,DWORD [4+edi]
+ adc ebx,0
+ ; mul a[0]*b[1]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [20+esp]
+ adc ebp,edx
+ mov edx,DWORD [edi]
+ adc ebx,0
+ mov DWORD [4+eax],ecx
+ mov eax,DWORD [8+esi]
+ ; saved r[1]
+ ; ################## Calculate word 2
+ xor ecx,ecx
+ ; mul a[2]*b[0]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [4+esi]
+ adc ebx,edx
+ mov edx,DWORD [4+edi]
+ adc ecx,0
+ ; mul a[1]*b[1]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [esi]
+ adc ebx,edx
+ mov edx,DWORD [8+edi]
+ adc ecx,0
+ ; mul a[0]*b[2]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [20+esp]
+ adc ebx,edx
+ mov edx,DWORD [edi]
+ adc ecx,0
+ mov DWORD [8+eax],ebp
+ mov eax,DWORD [12+esi]
+ ; saved r[2]
+ ; ################## Calculate word 3
+ xor ebp,ebp
+ ; mul a[3]*b[0]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [8+esi]
+ adc ecx,edx
+ mov edx,DWORD [4+edi]
+ adc ebp,0
+ ; mul a[2]*b[1]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [4+esi]
+ adc ecx,edx
+ mov edx,DWORD [8+edi]
+ adc ebp,0
+ ; mul a[1]*b[2]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [esi]
+ adc ecx,edx
+ mov edx,DWORD [12+edi]
+ adc ebp,0
+ ; mul a[0]*b[3]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [20+esp]
+ adc ecx,edx
+ mov edx,DWORD [edi]
+ adc ebp,0
+ mov DWORD [12+eax],ebx
+ mov eax,DWORD [16+esi]
+ ; saved r[3]
+ ; ################## Calculate word 4
+ xor ebx,ebx
+ ; mul a[4]*b[0]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [12+esi]
+ adc ebp,edx
+ mov edx,DWORD [4+edi]
+ adc ebx,0
+ ; mul a[3]*b[1]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [8+esi]
+ adc ebp,edx
+ mov edx,DWORD [8+edi]
+ adc ebx,0
+ ; mul a[2]*b[2]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [4+esi]
+ adc ebp,edx
+ mov edx,DWORD [12+edi]
+ adc ebx,0
+ ; mul a[1]*b[3]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [esi]
+ adc ebp,edx
+ mov edx,DWORD [16+edi]
+ adc ebx,0
+ ; mul a[0]*b[4]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [20+esp]
+ adc ebp,edx
+ mov edx,DWORD [edi]
+ adc ebx,0
+ mov DWORD [16+eax],ecx
+ mov eax,DWORD [20+esi]
+ ; saved r[4]
+ ; ################## Calculate word 5
+ xor ecx,ecx
+ ; mul a[5]*b[0]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [16+esi]
+ adc ebx,edx
+ mov edx,DWORD [4+edi]
+ adc ecx,0
+ ; mul a[4]*b[1]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [12+esi]
+ adc ebx,edx
+ mov edx,DWORD [8+edi]
+ adc ecx,0
+ ; mul a[3]*b[2]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [8+esi]
+ adc ebx,edx
+ mov edx,DWORD [12+edi]
+ adc ecx,0
+ ; mul a[2]*b[3]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [4+esi]
+ adc ebx,edx
+ mov edx,DWORD [16+edi]
+ adc ecx,0
+ ; mul a[1]*b[4]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [esi]
+ adc ebx,edx
+ mov edx,DWORD [20+edi]
+ adc ecx,0
+ ; mul a[0]*b[5]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [20+esp]
+ adc ebx,edx
+ mov edx,DWORD [edi]
+ adc ecx,0
+ mov DWORD [20+eax],ebp
+ mov eax,DWORD [24+esi]
+ ; saved r[5]
+ ; ################## Calculate word 6
+ xor ebp,ebp
+ ; mul a[6]*b[0]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [20+esi]
+ adc ecx,edx
+ mov edx,DWORD [4+edi]
+ adc ebp,0
+ ; mul a[5]*b[1]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [16+esi]
+ adc ecx,edx
+ mov edx,DWORD [8+edi]
+ adc ebp,0
+ ; mul a[4]*b[2]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [12+esi]
+ adc ecx,edx
+ mov edx,DWORD [12+edi]
+ adc ebp,0
+ ; mul a[3]*b[3]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [8+esi]
+ adc ecx,edx
+ mov edx,DWORD [16+edi]
+ adc ebp,0
+ ; mul a[2]*b[4]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [4+esi]
+ adc ecx,edx
+ mov edx,DWORD [20+edi]
+ adc ebp,0
+ ; mul a[1]*b[5]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [esi]
+ adc ecx,edx
+ mov edx,DWORD [24+edi]
+ adc ebp,0
+ ; mul a[0]*b[6]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [20+esp]
+ adc ecx,edx
+ mov edx,DWORD [edi]
+ adc ebp,0
+ mov DWORD [24+eax],ebx
+ mov eax,DWORD [28+esi]
+ ; saved r[6]
+ ; ################## Calculate word 7
+ xor ebx,ebx
+ ; mul a[7]*b[0]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [24+esi]
+ adc ebp,edx
+ mov edx,DWORD [4+edi]
+ adc ebx,0
+ ; mul a[6]*b[1]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [20+esi]
+ adc ebp,edx
+ mov edx,DWORD [8+edi]
+ adc ebx,0
+ ; mul a[5]*b[2]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [16+esi]
+ adc ebp,edx
+ mov edx,DWORD [12+edi]
+ adc ebx,0
+ ; mul a[4]*b[3]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [12+esi]
+ adc ebp,edx
+ mov edx,DWORD [16+edi]
+ adc ebx,0
+ ; mul a[3]*b[4]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [8+esi]
+ adc ebp,edx
+ mov edx,DWORD [20+edi]
+ adc ebx,0
+ ; mul a[2]*b[5]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [4+esi]
+ adc ebp,edx
+ mov edx,DWORD [24+edi]
+ adc ebx,0
+ ; mul a[1]*b[6]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [esi]
+ adc ebp,edx
+ mov edx,DWORD [28+edi]
+ adc ebx,0
+ ; mul a[0]*b[7]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [20+esp]
+ adc ebp,edx
+ mov edx,DWORD [4+edi]
+ adc ebx,0
+ mov DWORD [28+eax],ecx
+ mov eax,DWORD [28+esi]
+ ; saved r[7]
+ ; ################## Calculate word 8
+ xor ecx,ecx
+ ; mul a[7]*b[1]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [24+esi]
+ adc ebx,edx
+ mov edx,DWORD [8+edi]
+ adc ecx,0
+ ; mul a[6]*b[2]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [20+esi]
+ adc ebx,edx
+ mov edx,DWORD [12+edi]
+ adc ecx,0
+ ; mul a[5]*b[3]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [16+esi]
+ adc ebx,edx
+ mov edx,DWORD [16+edi]
+ adc ecx,0
+ ; mul a[4]*b[4]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [12+esi]
+ adc ebx,edx
+ mov edx,DWORD [20+edi]
+ adc ecx,0
+ ; mul a[3]*b[5]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [8+esi]
+ adc ebx,edx
+ mov edx,DWORD [24+edi]
+ adc ecx,0
+ ; mul a[2]*b[6]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [4+esi]
+ adc ebx,edx
+ mov edx,DWORD [28+edi]
+ adc ecx,0
+ ; mul a[1]*b[7]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [20+esp]
+ adc ebx,edx
+ mov edx,DWORD [8+edi]
+ adc ecx,0
+ mov DWORD [32+eax],ebp
+ mov eax,DWORD [28+esi]
+ ; saved r[8]
+ ; ################## Calculate word 9
+ xor ebp,ebp
+ ; mul a[7]*b[2]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [24+esi]
+ adc ecx,edx
+ mov edx,DWORD [12+edi]
+ adc ebp,0
+ ; mul a[6]*b[3]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [20+esi]
+ adc ecx,edx
+ mov edx,DWORD [16+edi]
+ adc ebp,0
+ ; mul a[5]*b[4]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [16+esi]
+ adc ecx,edx
+ mov edx,DWORD [20+edi]
+ adc ebp,0
+ ; mul a[4]*b[5]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [12+esi]
+ adc ecx,edx
+ mov edx,DWORD [24+edi]
+ adc ebp,0
+ ; mul a[3]*b[6]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [8+esi]
+ adc ecx,edx
+ mov edx,DWORD [28+edi]
+ adc ebp,0
+ ; mul a[2]*b[7]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [20+esp]
+ adc ecx,edx
+ mov edx,DWORD [12+edi]
+ adc ebp,0
+ mov DWORD [36+eax],ebx
+ mov eax,DWORD [28+esi]
+ ; saved r[9]
+ ; ################## Calculate word 10
+ xor ebx,ebx
+ ; mul a[7]*b[3]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [24+esi]
+ adc ebp,edx
+ mov edx,DWORD [16+edi]
+ adc ebx,0
+ ; mul a[6]*b[4]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [20+esi]
+ adc ebp,edx
+ mov edx,DWORD [20+edi]
+ adc ebx,0
+ ; mul a[5]*b[5]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [16+esi]
+ adc ebp,edx
+ mov edx,DWORD [24+edi]
+ adc ebx,0
+ ; mul a[4]*b[6]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [12+esi]
+ adc ebp,edx
+ mov edx,DWORD [28+edi]
+ adc ebx,0
+ ; mul a[3]*b[7]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [20+esp]
+ adc ebp,edx
+ mov edx,DWORD [16+edi]
+ adc ebx,0
+ mov DWORD [40+eax],ecx
+ mov eax,DWORD [28+esi]
+ ; saved r[10]
+ ; ################## Calculate word 11
+ xor ecx,ecx
+ ; mul a[7]*b[4]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [24+esi]
+ adc ebx,edx
+ mov edx,DWORD [20+edi]
+ adc ecx,0
+ ; mul a[6]*b[5]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [20+esi]
+ adc ebx,edx
+ mov edx,DWORD [24+edi]
+ adc ecx,0
+ ; mul a[5]*b[6]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [16+esi]
+ adc ebx,edx
+ mov edx,DWORD [28+edi]
+ adc ecx,0
+ ; mul a[4]*b[7]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [20+esp]
+ adc ebx,edx
+ mov edx,DWORD [20+edi]
+ adc ecx,0
+ mov DWORD [44+eax],ebp
+ mov eax,DWORD [28+esi]
+ ; saved r[11]
+ ; ################## Calculate word 12
+ xor ebp,ebp
+ ; mul a[7]*b[5]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [24+esi]
+ adc ecx,edx
+ mov edx,DWORD [24+edi]
+ adc ebp,0
+ ; mul a[6]*b[6]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [20+esi]
+ adc ecx,edx
+ mov edx,DWORD [28+edi]
+ adc ebp,0
+ ; mul a[5]*b[7]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [20+esp]
+ adc ecx,edx
+ mov edx,DWORD [24+edi]
+ adc ebp,0
+ mov DWORD [48+eax],ebx
+ mov eax,DWORD [28+esi]
+ ; saved r[12]
+ ; ################## Calculate word 13
+ xor ebx,ebx
+ ; mul a[7]*b[6]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [24+esi]
+ adc ebp,edx
+ mov edx,DWORD [28+edi]
+ adc ebx,0
+ ; mul a[6]*b[7]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [20+esp]
+ adc ebp,edx
+ mov edx,DWORD [28+edi]
+ adc ebx,0
+ mov DWORD [52+eax],ecx
+ mov eax,DWORD [28+esi]
+ ; saved r[13]
+ ; ################## Calculate word 14
+ xor ecx,ecx
+ ; mul a[7]*b[7]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [20+esp]
+ adc ebx,edx
+ adc ecx,0
+ mov DWORD [56+eax],ebp
+ ; saved r[14]
+ ; save r[15]
+ mov DWORD [60+eax],ebx
+ pop ebx
+ pop ebp
+ pop edi
+ pop esi
+ ret
+global _bn_mul_comba4
+align 16
+_bn_mul_comba4:
+L$_bn_mul_comba4_begin:
+ push esi
+ mov esi,DWORD [12+esp]
+ push edi
+ mov edi,DWORD [20+esp]
+ push ebp
+ push ebx
+ xor ebx,ebx
+ mov eax,DWORD [esi]
+ xor ecx,ecx
+ mov edx,DWORD [edi]
+ ; ################## Calculate word 0
+ xor ebp,ebp
+ ; mul a[0]*b[0]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [20+esp]
+ adc ecx,edx
+ mov edx,DWORD [edi]
+ adc ebp,0
+ mov DWORD [eax],ebx
+ mov eax,DWORD [4+esi]
+ ; saved r[0]
+ ; ################## Calculate word 1
+ xor ebx,ebx
+ ; mul a[1]*b[0]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [esi]
+ adc ebp,edx
+ mov edx,DWORD [4+edi]
+ adc ebx,0
+ ; mul a[0]*b[1]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [20+esp]
+ adc ebp,edx
+ mov edx,DWORD [edi]
+ adc ebx,0
+ mov DWORD [4+eax],ecx
+ mov eax,DWORD [8+esi]
+ ; saved r[1]
+ ; ################## Calculate word 2
+ xor ecx,ecx
+ ; mul a[2]*b[0]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [4+esi]
+ adc ebx,edx
+ mov edx,DWORD [4+edi]
+ adc ecx,0
+ ; mul a[1]*b[1]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [esi]
+ adc ebx,edx
+ mov edx,DWORD [8+edi]
+ adc ecx,0
+ ; mul a[0]*b[2]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [20+esp]
+ adc ebx,edx
+ mov edx,DWORD [edi]
+ adc ecx,0
+ mov DWORD [8+eax],ebp
+ mov eax,DWORD [12+esi]
+ ; saved r[2]
+ ; ################## Calculate word 3
+ xor ebp,ebp
+ ; mul a[3]*b[0]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [8+esi]
+ adc ecx,edx
+ mov edx,DWORD [4+edi]
+ adc ebp,0
+ ; mul a[2]*b[1]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [4+esi]
+ adc ecx,edx
+ mov edx,DWORD [8+edi]
+ adc ebp,0
+ ; mul a[1]*b[2]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [esi]
+ adc ecx,edx
+ mov edx,DWORD [12+edi]
+ adc ebp,0
+ ; mul a[0]*b[3]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [20+esp]
+ adc ecx,edx
+ mov edx,DWORD [4+edi]
+ adc ebp,0
+ mov DWORD [12+eax],ebx
+ mov eax,DWORD [12+esi]
+ ; saved r[3]
+ ; ################## Calculate word 4
+ xor ebx,ebx
+ ; mul a[3]*b[1]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [8+esi]
+ adc ebp,edx
+ mov edx,DWORD [8+edi]
+ adc ebx,0
+ ; mul a[2]*b[2]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [4+esi]
+ adc ebp,edx
+ mov edx,DWORD [12+edi]
+ adc ebx,0
+ ; mul a[1]*b[3]
+ mul edx
+ add ecx,eax
+ mov eax,DWORD [20+esp]
+ adc ebp,edx
+ mov edx,DWORD [8+edi]
+ adc ebx,0
+ mov DWORD [16+eax],ecx
+ mov eax,DWORD [12+esi]
+ ; saved r[4]
+ ; ################## Calculate word 5
+ xor ecx,ecx
+ ; mul a[3]*b[2]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [8+esi]
+ adc ebx,edx
+ mov edx,DWORD [12+edi]
+ adc ecx,0
+ ; mul a[2]*b[3]
+ mul edx
+ add ebp,eax
+ mov eax,DWORD [20+esp]
+ adc ebx,edx
+ mov edx,DWORD [12+edi]
+ adc ecx,0
+ mov DWORD [20+eax],ebp
+ mov eax,DWORD [12+esi]
+ ; saved r[5]
+ ; ################## Calculate word 6
+ xor ebp,ebp
+ ; mul a[3]*b[3]
+ mul edx
+ add ebx,eax
+ mov eax,DWORD [20+esp]
+ adc ecx,edx
+ adc ebp,0
+ mov DWORD [24+eax],ebx
+ ; saved r[6]
+ ; save r[7]
+ mov DWORD [28+eax],ecx
+ pop ebx
+ pop ebp
+ pop edi
+ pop esi
+ ret
+global _bn_sqr_comba8
+align 16
+_bn_sqr_comba8:
+L$_bn_sqr_comba8_begin:
+ push esi
+ push edi
+ push ebp
+ push ebx
+ mov edi,DWORD [20+esp]
+ mov esi,DWORD [24+esp]
+ xor ebx,ebx
+ xor ecx,ecx
+ mov eax,DWORD [esi]
+ ; ############### Calculate word 0
+ xor ebp,ebp
+ ; sqr a[0]*a[0]
+ mul eax
+ add ebx,eax
+ adc ecx,edx
+ mov edx,DWORD [esi]
+ adc ebp,0
+ mov DWORD [edi],ebx
+ mov eax,DWORD [4+esi]
+ ; saved r[0]
+ ; ############### Calculate word 1
+ xor ebx,ebx
+ ; sqr a[1]*a[0]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebx,0
+ add ecx,eax
+ adc ebp,edx
+ mov eax,DWORD [8+esi]
+ adc ebx,0
+ mov DWORD [4+edi],ecx
+ mov edx,DWORD [esi]
+ ; saved r[1]
+ ; ############### Calculate word 2
+ xor ecx,ecx
+ ; sqr a[2]*a[0]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ecx,0
+ add ebp,eax
+ adc ebx,edx
+ mov eax,DWORD [4+esi]
+ adc ecx,0
+ ; sqr a[1]*a[1]
+ mul eax
+ add ebp,eax
+ adc ebx,edx
+ mov edx,DWORD [esi]
+ adc ecx,0
+ mov DWORD [8+edi],ebp
+ mov eax,DWORD [12+esi]
+ ; saved r[2]
+ ; ############### Calculate word 3
+ xor ebp,ebp
+ ; sqr a[3]*a[0]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebp,0
+ add ebx,eax
+ adc ecx,edx
+ mov eax,DWORD [8+esi]
+ adc ebp,0
+ mov edx,DWORD [4+esi]
+ ; sqr a[2]*a[1]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebp,0
+ add ebx,eax
+ adc ecx,edx
+ mov eax,DWORD [16+esi]
+ adc ebp,0
+ mov DWORD [12+edi],ebx
+ mov edx,DWORD [esi]
+ ; saved r[3]
+ ; ############### Calculate word 4
+ xor ebx,ebx
+ ; sqr a[4]*a[0]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebx,0
+ add ecx,eax
+ adc ebp,edx
+ mov eax,DWORD [12+esi]
+ adc ebx,0
+ mov edx,DWORD [4+esi]
+ ; sqr a[3]*a[1]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebx,0
+ add ecx,eax
+ adc ebp,edx
+ mov eax,DWORD [8+esi]
+ adc ebx,0
+ ; sqr a[2]*a[2]
+ mul eax
+ add ecx,eax
+ adc ebp,edx
+ mov edx,DWORD [esi]
+ adc ebx,0
+ mov DWORD [16+edi],ecx
+ mov eax,DWORD [20+esi]
+ ; saved r[4]
+ ; ############### Calculate word 5
+ xor ecx,ecx
+ ; sqr a[5]*a[0]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ecx,0
+ add ebp,eax
+ adc ebx,edx
+ mov eax,DWORD [16+esi]
+ adc ecx,0
+ mov edx,DWORD [4+esi]
+ ; sqr a[4]*a[1]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ecx,0
+ add ebp,eax
+ adc ebx,edx
+ mov eax,DWORD [12+esi]
+ adc ecx,0
+ mov edx,DWORD [8+esi]
+ ; sqr a[3]*a[2]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ecx,0
+ add ebp,eax
+ adc ebx,edx
+ mov eax,DWORD [24+esi]
+ adc ecx,0
+ mov DWORD [20+edi],ebp
+ mov edx,DWORD [esi]
+ ; saved r[5]
+ ; ############### Calculate word 6
+ xor ebp,ebp
+ ; sqr a[6]*a[0]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebp,0
+ add ebx,eax
+ adc ecx,edx
+ mov eax,DWORD [20+esi]
+ adc ebp,0
+ mov edx,DWORD [4+esi]
+ ; sqr a[5]*a[1]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebp,0
+ add ebx,eax
+ adc ecx,edx
+ mov eax,DWORD [16+esi]
+ adc ebp,0
+ mov edx,DWORD [8+esi]
+ ; sqr a[4]*a[2]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebp,0
+ add ebx,eax
+ adc ecx,edx
+ mov eax,DWORD [12+esi]
+ adc ebp,0
+ ; sqr a[3]*a[3]
+ mul eax
+ add ebx,eax
+ adc ecx,edx
+ mov edx,DWORD [esi]
+ adc ebp,0
+ mov DWORD [24+edi],ebx
+ mov eax,DWORD [28+esi]
+ ; saved r[6]
+ ; ############### Calculate word 7
+ xor ebx,ebx
+ ; sqr a[7]*a[0]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebx,0
+ add ecx,eax
+ adc ebp,edx
+ mov eax,DWORD [24+esi]
+ adc ebx,0
+ mov edx,DWORD [4+esi]
+ ; sqr a[6]*a[1]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebx,0
+ add ecx,eax
+ adc ebp,edx
+ mov eax,DWORD [20+esi]
+ adc ebx,0
+ mov edx,DWORD [8+esi]
+ ; sqr a[5]*a[2]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebx,0
+ add ecx,eax
+ adc ebp,edx
+ mov eax,DWORD [16+esi]
+ adc ebx,0
+ mov edx,DWORD [12+esi]
+ ; sqr a[4]*a[3]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebx,0
+ add ecx,eax
+ adc ebp,edx
+ mov eax,DWORD [28+esi]
+ adc ebx,0
+ mov DWORD [28+edi],ecx
+ mov edx,DWORD [4+esi]
+ ; saved r[7]
+ ; ############### Calculate word 8
+ xor ecx,ecx
+ ; sqr a[7]*a[1]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ecx,0
+ add ebp,eax
+ adc ebx,edx
+ mov eax,DWORD [24+esi]
+ adc ecx,0
+ mov edx,DWORD [8+esi]
+ ; sqr a[6]*a[2]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ecx,0
+ add ebp,eax
+ adc ebx,edx
+ mov eax,DWORD [20+esi]
+ adc ecx,0
+ mov edx,DWORD [12+esi]
+ ; sqr a[5]*a[3]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ecx,0
+ add ebp,eax
+ adc ebx,edx
+ mov eax,DWORD [16+esi]
+ adc ecx,0
+ ; sqr a[4]*a[4]
+ mul eax
+ add ebp,eax
+ adc ebx,edx
+ mov edx,DWORD [8+esi]
+ adc ecx,0
+ mov DWORD [32+edi],ebp
+ mov eax,DWORD [28+esi]
+ ; saved r[8]
+ ; ############### Calculate word 9
+ xor ebp,ebp
+ ; sqr a[7]*a[2]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebp,0
+ add ebx,eax
+ adc ecx,edx
+ mov eax,DWORD [24+esi]
+ adc ebp,0
+ mov edx,DWORD [12+esi]
+ ; sqr a[6]*a[3]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebp,0
+ add ebx,eax
+ adc ecx,edx
+ mov eax,DWORD [20+esi]
+ adc ebp,0
+ mov edx,DWORD [16+esi]
+ ; sqr a[5]*a[4]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebp,0
+ add ebx,eax
+ adc ecx,edx
+ mov eax,DWORD [28+esi]
+ adc ebp,0
+ mov DWORD [36+edi],ebx
+ mov edx,DWORD [12+esi]
+ ; saved r[9]
+ ; ############### Calculate word 10
+ xor ebx,ebx
+ ; sqr a[7]*a[3]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebx,0
+ add ecx,eax
+ adc ebp,edx
+ mov eax,DWORD [24+esi]
+ adc ebx,0
+ mov edx,DWORD [16+esi]
+ ; sqr a[6]*a[4]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebx,0
+ add ecx,eax
+ adc ebp,edx
+ mov eax,DWORD [20+esi]
+ adc ebx,0
+ ; sqr a[5]*a[5]
+ mul eax
+ add ecx,eax
+ adc ebp,edx
+ mov edx,DWORD [16+esi]
+ adc ebx,0
+ mov DWORD [40+edi],ecx
+ mov eax,DWORD [28+esi]
+ ; saved r[10]
+ ; ############### Calculate word 11
+ xor ecx,ecx
+ ; sqr a[7]*a[4]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ecx,0
+ add ebp,eax
+ adc ebx,edx
+ mov eax,DWORD [24+esi]
+ adc ecx,0
+ mov edx,DWORD [20+esi]
+ ; sqr a[6]*a[5]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ecx,0
+ add ebp,eax
+ adc ebx,edx
+ mov eax,DWORD [28+esi]
+ adc ecx,0
+ mov DWORD [44+edi],ebp
+ mov edx,DWORD [20+esi]
+ ; saved r[11]
+ ; ############### Calculate word 12
+ xor ebp,ebp
+ ; sqr a[7]*a[5]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebp,0
+ add ebx,eax
+ adc ecx,edx
+ mov eax,DWORD [24+esi]
+ adc ebp,0
+ ; sqr a[6]*a[6]
+ mul eax
+ add ebx,eax
+ adc ecx,edx
+ mov edx,DWORD [24+esi]
+ adc ebp,0
+ mov DWORD [48+edi],ebx
+ mov eax,DWORD [28+esi]
+ ; saved r[12]
+ ; ############### Calculate word 13
+ xor ebx,ebx
+ ; sqr a[7]*a[6]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebx,0
+ add ecx,eax
+ adc ebp,edx
+ mov eax,DWORD [28+esi]
+ adc ebx,0
+ mov DWORD [52+edi],ecx
+ ; saved r[13]
+ ; ############### Calculate word 14
+ xor ecx,ecx
+ ; sqr a[7]*a[7]
+ mul eax
+ add ebp,eax
+ adc ebx,edx
+ adc ecx,0
+ mov DWORD [56+edi],ebp
+ ; saved r[14]
+ mov DWORD [60+edi],ebx
+ pop ebx
+ pop ebp
+ pop edi
+ pop esi
+ ret
+global _bn_sqr_comba4
+align 16
+_bn_sqr_comba4:
+L$_bn_sqr_comba4_begin:
+ push esi
+ push edi
+ push ebp
+ push ebx
+ mov edi,DWORD [20+esp]
+ mov esi,DWORD [24+esp]
+ xor ebx,ebx
+ xor ecx,ecx
+ mov eax,DWORD [esi]
+ ; ############### Calculate word 0
+ xor ebp,ebp
+ ; sqr a[0]*a[0]
+ mul eax
+ add ebx,eax
+ adc ecx,edx
+ mov edx,DWORD [esi]
+ adc ebp,0
+ mov DWORD [edi],ebx
+ mov eax,DWORD [4+esi]
+ ; saved r[0]
+ ; ############### Calculate word 1
+ xor ebx,ebx
+ ; sqr a[1]*a[0]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebx,0
+ add ecx,eax
+ adc ebp,edx
+ mov eax,DWORD [8+esi]
+ adc ebx,0
+ mov DWORD [4+edi],ecx
+ mov edx,DWORD [esi]
+ ; saved r[1]
+ ; ############### Calculate word 2
+ xor ecx,ecx
+ ; sqr a[2]*a[0]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ecx,0
+ add ebp,eax
+ adc ebx,edx
+ mov eax,DWORD [4+esi]
+ adc ecx,0
+ ; sqr a[1]*a[1]
+ mul eax
+ add ebp,eax
+ adc ebx,edx
+ mov edx,DWORD [esi]
+ adc ecx,0
+ mov DWORD [8+edi],ebp
+ mov eax,DWORD [12+esi]
+ ; saved r[2]
+ ; ############### Calculate word 3
+ xor ebp,ebp
+ ; sqr a[3]*a[0]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebp,0
+ add ebx,eax
+ adc ecx,edx
+ mov eax,DWORD [8+esi]
+ adc ebp,0
+ mov edx,DWORD [4+esi]
+ ; sqr a[2]*a[1]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebp,0
+ add ebx,eax
+ adc ecx,edx
+ mov eax,DWORD [12+esi]
+ adc ebp,0
+ mov DWORD [12+edi],ebx
+ mov edx,DWORD [4+esi]
+ ; saved r[3]
+ ; ############### Calculate word 4
+ xor ebx,ebx
+ ; sqr a[3]*a[1]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ebx,0
+ add ecx,eax
+ adc ebp,edx
+ mov eax,DWORD [8+esi]
+ adc ebx,0
+ ; sqr a[2]*a[2]
+ mul eax
+ add ecx,eax
+ adc ebp,edx
+ mov edx,DWORD [8+esi]
+ adc ebx,0
+ mov DWORD [16+edi],ecx
+ mov eax,DWORD [12+esi]
+ ; saved r[4]
+ ; ############### Calculate word 5
+ xor ecx,ecx
+ ; sqr a[3]*a[2]
+ mul edx
+ add eax,eax
+ adc edx,edx
+ adc ecx,0
+ add ebp,eax
+ adc ebx,edx
+ mov eax,DWORD [12+esi]
+ adc ecx,0
+ mov DWORD [20+edi],ebp
+ ; saved r[5]
+ ; ############### Calculate word 6
+ xor ebp,ebp
+ ; sqr a[3]*a[3]
+ mul eax
+ add ebx,eax
+ adc ecx,edx
+ adc ebp,0
+ mov DWORD [24+edi],ebx
+ ; saved r[6]
+ mov DWORD [28+edi],ecx
+ pop ebx
+ pop ebp
+ pop edi
+ pop esi
+ ret
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
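
(Editorial note on the co-586 translations above: bn_mul_comba4/8 and bn_sqr_comba4/8 are column-wise "comba" multiplication and squaring routines for 4- and 8-word big numbers. Each output word is the sum of every partial product a[i]*b[j] whose indices add up to that column, carried across the three rotating registers ebx/ecx/ebp. As a rough reference for what that register rotation computes, here is a minimal C sketch of the 4x4 case. The function name, the 32-bit word size, and the 64-bit-accumulator-plus-overflow-counter representation are illustrative assumptions for this note, not BoringSSL's portable implementation.)

#include <stdint.h>

// Sketch of a 4x4-word comba multiplication: r[0..7] = a[0..3] * b[0..3].
// Each result word is one column of partial products; the running column
// total is held in a 64-bit accumulator plus an overflow counter, mirroring
// the three-register carry chain ("adc reg,0") in the assembly above.
static void mul_comba4_sketch(uint32_t r[8], const uint32_t a[4],
                              const uint32_t b[4]) {
  uint64_t acc = 0;    // low 64 bits of the running column sum
  uint32_t carry = 0;  // overflow beyond 64 bits
  for (int k = 0; k < 7; k++) {
    for (int i = 0; i < 4; i++) {
      int j = k - i;
      if (j < 0 || j > 3) continue;           // only products with i + j == k
      uint64_t p = (uint64_t)a[i] * b[j];
      uint64_t prev = acc;
      acc += p;
      if (acc < prev) carry++;                // count 64-bit overflow
    }
    r[k] = (uint32_t)acc;                     // emit the finished column
    acc = (acc >> 32) | ((uint64_t)carry << 32);  // carry into the next column
    carry = 0;
  }
  r[7] = (uint32_t)acc;                       // final carry is the top word
}

(bn_mul_comba8 follows the same pattern over eight words, and the sqr variants fold the symmetric products a[i]*a[j], i != j, by doubling them before accumulation, which is the "addl %eax,%eax / adcl %edx,%edx" pair in the squaring code.)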
diff --git a/gen/bcm/ghash-armv4-linux.S b/gen/bcm/ghash-armv4-linux.S
new file mode 100644
index 0000000..7c04f89
--- /dev/null
+++ b/gen/bcm/ghash-armv4-linux.S
@@ -0,0 +1,244 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL
+@ instructions are in aesv8-armx.pl.)
+.arch armv7-a
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax unified
+#define ldrplb ldrbpl
+#define ldrneb ldrbne
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code 32
+#endif
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.globl gcm_init_neon
+.hidden gcm_init_neon
+.type gcm_init_neon,%function
+.align 4
+gcm_init_neon:
+ vld1.64 d7,[r1]! @ load H
+ vmov.i8 q8,#0xe1
+ vld1.64 d6,[r1]
+ vshl.i64 d17,#57
+ vshr.u64 d16,#63 @ t0=0xc2....01
+ vdup.8 q9,d7[7]
+ vshr.u64 d26,d6,#63
+ vshr.s8 q9,#7 @ broadcast carry bit
+ vshl.i64 q3,q3,#1
+ vand q8,q8,q9
+ vorr d7,d26 @ H<<<=1
+ veor q3,q3,q8 @ twisted H
+ vstmia r0,{q3}
+
+ bx lr @ bx lr
+.size gcm_init_neon,.-gcm_init_neon
+
+.globl gcm_gmult_neon
+.hidden gcm_gmult_neon
+.type gcm_gmult_neon,%function
+.align 4
+gcm_gmult_neon:
+ vld1.64 d7,[r0]! @ load Xi
+ vld1.64 d6,[r0]!
+ vmov.i64 d29,#0x0000ffffffffffff
+ vldmia r1,{d26,d27} @ load twisted H
+ vmov.i64 d30,#0x00000000ffffffff
+#ifdef __ARMEL__
+ vrev64.8 q3,q3
+#endif
+ vmov.i64 d31,#0x000000000000ffff
+ veor d28,d26,d27 @ Karatsuba pre-processing
+ mov r3,#16
+ b .Lgmult_neon
+.size gcm_gmult_neon,.-gcm_gmult_neon
+
+.globl gcm_ghash_neon
+.hidden gcm_ghash_neon
+.type gcm_ghash_neon,%function
+.align 4
+gcm_ghash_neon:
+ vld1.64 d1,[r0]! @ load Xi
+ vld1.64 d0,[r0]!
+ vmov.i64 d29,#0x0000ffffffffffff
+ vldmia r1,{d26,d27} @ load twisted H
+ vmov.i64 d30,#0x00000000ffffffff
+#ifdef __ARMEL__
+ vrev64.8 q0,q0
+#endif
+ vmov.i64 d31,#0x000000000000ffff
+ veor d28,d26,d27 @ Karatsuba pre-processing
+
+.Loop_neon:
+ vld1.64 d7,[r2]! @ load inp
+ vld1.64 d6,[r2]!
+#ifdef __ARMEL__
+ vrev64.8 q3,q3
+#endif
+ veor q3,q0 @ inp^=Xi
+.Lgmult_neon:
+ vext.8 d16, d26, d26, #1 @ A1
+ vmull.p8 q8, d16, d6 @ F = A1*B
+ vext.8 d0, d6, d6, #1 @ B1
+ vmull.p8 q0, d26, d0 @ E = A*B1
+ vext.8 d18, d26, d26, #2 @ A2
+ vmull.p8 q9, d18, d6 @ H = A2*B
+ vext.8 d22, d6, d6, #2 @ B2
+ vmull.p8 q11, d26, d22 @ G = A*B2
+ vext.8 d20, d26, d26, #3 @ A3
+ veor q8, q8, q0 @ L = E + F
+ vmull.p8 q10, d20, d6 @ J = A3*B
+ vext.8 d0, d6, d6, #3 @ B3
+ veor q9, q9, q11 @ M = G + H
+ vmull.p8 q0, d26, d0 @ I = A*B3
+ veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
+ vand d17, d17, d29
+ vext.8 d22, d6, d6, #4 @ B4
+ veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
+ vand d19, d19, d30
+ vmull.p8 q11, d26, d22 @ K = A*B4
+ veor q10, q10, q0 @ N = I + J
+ veor d16, d16, d17
+ veor d18, d18, d19
+ veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
+ vand d21, d21, d31
+ vext.8 q8, q8, q8, #15
+ veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 d23, #0
+ vext.8 q9, q9, q9, #14
+ veor d20, d20, d21
+ vmull.p8 q0, d26, d6 @ D = A*B
+ vext.8 q11, q11, q11, #12
+ vext.8 q10, q10, q10, #13
+ veor q8, q8, q9
+ veor q10, q10, q11
+ veor q0, q0, q8
+ veor q0, q0, q10
+ veor d6,d6,d7 @ Karatsuba pre-processing
+ vext.8 d16, d28, d28, #1 @ A1
+ vmull.p8 q8, d16, d6 @ F = A1*B
+ vext.8 d2, d6, d6, #1 @ B1
+ vmull.p8 q1, d28, d2 @ E = A*B1
+ vext.8 d18, d28, d28, #2 @ A2
+ vmull.p8 q9, d18, d6 @ H = A2*B
+ vext.8 d22, d6, d6, #2 @ B2
+ vmull.p8 q11, d28, d22 @ G = A*B2
+ vext.8 d20, d28, d28, #3 @ A3
+ veor q8, q8, q1 @ L = E + F
+ vmull.p8 q10, d20, d6 @ J = A3*B
+ vext.8 d2, d6, d6, #3 @ B3
+ veor q9, q9, q11 @ M = G + H
+ vmull.p8 q1, d28, d2 @ I = A*B3
+ veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
+ vand d17, d17, d29
+ vext.8 d22, d6, d6, #4 @ B4
+ veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
+ vand d19, d19, d30
+ vmull.p8 q11, d28, d22 @ K = A*B4
+ veor q10, q10, q1 @ N = I + J
+ veor d16, d16, d17
+ veor d18, d18, d19
+ veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
+ vand d21, d21, d31
+ vext.8 q8, q8, q8, #15
+ veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 d23, #0
+ vext.8 q9, q9, q9, #14
+ veor d20, d20, d21
+ vmull.p8 q1, d28, d6 @ D = A*B
+ vext.8 q11, q11, q11, #12
+ vext.8 q10, q10, q10, #13
+ veor q8, q8, q9
+ veor q10, q10, q11
+ veor q1, q1, q8
+ veor q1, q1, q10
+ vext.8 d16, d27, d27, #1 @ A1
+ vmull.p8 q8, d16, d7 @ F = A1*B
+ vext.8 d4, d7, d7, #1 @ B1
+ vmull.p8 q2, d27, d4 @ E = A*B1
+ vext.8 d18, d27, d27, #2 @ A2
+ vmull.p8 q9, d18, d7 @ H = A2*B
+ vext.8 d22, d7, d7, #2 @ B2
+ vmull.p8 q11, d27, d22 @ G = A*B2
+ vext.8 d20, d27, d27, #3 @ A3
+ veor q8, q8, q2 @ L = E + F
+ vmull.p8 q10, d20, d7 @ J = A3*B
+ vext.8 d4, d7, d7, #3 @ B3
+ veor q9, q9, q11 @ M = G + H
+ vmull.p8 q2, d27, d4 @ I = A*B3
+ veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
+ vand d17, d17, d29
+ vext.8 d22, d7, d7, #4 @ B4
+ veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
+ vand d19, d19, d30
+ vmull.p8 q11, d27, d22 @ K = A*B4
+ veor q10, q10, q2 @ N = I + J
+ veor d16, d16, d17
+ veor d18, d18, d19
+ veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
+ vand d21, d21, d31
+ vext.8 q8, q8, q8, #15
+ veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 d23, #0
+ vext.8 q9, q9, q9, #14
+ veor d20, d20, d21
+ vmull.p8 q2, d27, d7 @ D = A*B
+ vext.8 q11, q11, q11, #12
+ vext.8 q10, q10, q10, #13
+ veor q8, q8, q9
+ veor q10, q10, q11
+ veor q2, q2, q8
+ veor q2, q2, q10
+ veor q1,q1,q0 @ Karatsuba post-processing
+ veor q1,q1,q2
+ veor d1,d1,d2
+ veor d4,d4,d3 @ Xh|Xl - 256-bit result
+
+ @ equivalent of reduction_avx from ghash-x86_64.pl
+ vshl.i64 q9,q0,#57 @ 1st phase
+ vshl.i64 q10,q0,#62
+ veor q10,q10,q9 @
+ vshl.i64 q9,q0,#63
+ veor q10, q10, q9 @
+ veor d1,d1,d20 @
+ veor d4,d4,d21
+
+ vshr.u64 q10,q0,#1 @ 2nd phase
+ veor q2,q2,q0
+ veor q0,q0,q10 @
+ vshr.u64 q10,q10,#6
+ vshr.u64 q0,q0,#1 @
+ veor q0,q0,q2 @
+ veor q0,q0,q10 @
+
+ subs r3,#16
+ bne .Loop_neon
+
+#ifdef __ARMEL__
+ vrev64.8 q0,q0
+#endif
+ sub r0,#16
+ vst1.64 d1,[r0]! @ write out Xi
+ vst1.64 d0,[r0]
+
+ bx lr @ bx lr
+.size gcm_ghash_neon,.-gcm_ghash_neon
+#endif
+.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
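
(Editorial note on the ghash-armv4 translation above: gcm_init_neon, gcm_gmult_neon and gcm_ghash_neon compute GHASH. The hash state Xi is XORed with each 16-byte block and the result is multiplied by the key H in GF(2^128), using GCM's reflected bit order and the reduction polynomial x^128 + x^7 + x^2 + x + 1. The NEON code assembles that product from 8-bit polynomial multiplies plus a Karatsuba split; for orientation, a bit-at-a-time C sketch of the same field multiplication follows. It mirrors the textbook algorithm from the GCM specification; the function and parameter names are illustrative only and are not part of BoringSSL's API.)

#include <stdint.h>
#include <string.h>

// Reference GF(2^128) multiply for GHASH: z = x * y, with the block's most
// significant bit of byte 0 as the coefficient of x^0 (GCM's reflected order)
// and reduction by x^128 + x^7 + x^2 + x + 1 (the 0xE1 fold-in below).
static void gf128_mul_sketch(uint8_t z[16], const uint8_t x[16],
                             const uint8_t y[16]) {
  uint8_t v[16], zacc[16] = {0};
  memcpy(v, y, 16);
  for (int i = 0; i < 128; i++) {
    // Bit i of x, counting from the most significant bit of x[0].
    if ((x[i / 8] >> (7 - (i % 8))) & 1) {
      for (int j = 0; j < 16; j++) zacc[j] ^= v[j];
    }
    // v = v >> 1, folding the dropped bit back in via the constant 0xE1.
    int lsb = v[15] & 1;
    for (int j = 15; j > 0; j--) {
      v[j] = (uint8_t)((v[j] >> 1) | (v[j - 1] << 7));
    }
    v[0] >>= 1;
    if (lsb) v[0] ^= 0xE1;
  }
  memcpy(z, zacc, 16);
}

(The per-block step in .Loop_neon is then Xi = (Xi XOR inp) * H, repeated sixteen bytes at a time until the input is consumed, with the "1st phase"/"2nd phase" shifts at the end performing the same modular reduction without the bit loop.)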
diff --git a/gen/bcm/ghash-neon-armv8-apple.S b/gen/bcm/ghash-neon-armv8-apple.S
new file mode 100644
index 0000000..a76b8d1
--- /dev/null
+++ b/gen/bcm/ghash-neon-armv8-apple.S
@@ -0,0 +1,335 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl _gcm_init_neon
+.private_extern _gcm_init_neon
+
+.align 4
+_gcm_init_neon:
+ AARCH64_VALID_CALL_TARGET
+ // This function is adapted from gcm_init_v8. xC2 is t3.
+ ld1 {v17.2d}, [x1] // load H
+ movi v19.16b, #0xe1
+ shl v19.2d, v19.2d, #57 // 0xc2.0
+ ext v3.16b, v17.16b, v17.16b, #8
+ ushr v18.2d, v19.2d, #63
+ dup v17.4s, v17.s[1]
+ ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01
+ ushr v18.2d, v3.2d, #63
+ sshr v17.4s, v17.4s, #31 // broadcast carry bit
+ and v18.16b, v18.16b, v16.16b
+ shl v3.2d, v3.2d, #1
+ ext v18.16b, v18.16b, v18.16b, #8
+ and v16.16b, v16.16b, v17.16b
+ orr v3.16b, v3.16b, v18.16b // H<<<=1
+ eor v5.16b, v3.16b, v16.16b // twisted H
+ st1 {v5.2d}, [x0] // store Htable[0]
+ ret
+
+
+.globl _gcm_gmult_neon
+.private_extern _gcm_gmult_neon
+
+.align 4
+_gcm_gmult_neon:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v3.16b}, [x0] // load Xi
+ ld1 {v5.1d}, [x1], #8 // load twisted H
+ ld1 {v6.1d}, [x1]
+ adrp x9, Lmasks@PAGE // load constants
+ add x9, x9, Lmasks@PAGEOFF
+ ld1 {v24.2d, v25.2d}, [x9]
+ rev64 v3.16b, v3.16b // byteswap Xi
+ ext v3.16b, v3.16b, v3.16b, #8
+ eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
+
+ mov x3, #16
+ b Lgmult_neon
+
+
+.globl _gcm_ghash_neon
+.private_extern _gcm_ghash_neon
+
+.align 4
+_gcm_ghash_neon:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v0.16b}, [x0] // load Xi
+ ld1 {v5.1d}, [x1], #8 // load twisted H
+ ld1 {v6.1d}, [x1]
+ adrp x9, Lmasks@PAGE // load constants
+ add x9, x9, Lmasks@PAGEOFF
+ ld1 {v24.2d, v25.2d}, [x9]
+ rev64 v0.16b, v0.16b // byteswap Xi
+ ext v0.16b, v0.16b, v0.16b, #8
+ eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
+
+Loop_neon:
+ ld1 {v3.16b}, [x2], #16 // load inp
+ rev64 v3.16b, v3.16b // byteswap inp
+ ext v3.16b, v3.16b, v3.16b, #8
+ eor v3.16b, v3.16b, v0.16b // inp ^= Xi
+
+Lgmult_neon:
+ // Split the input into v3 and v4. (The upper halves are unused,
+ // so it is okay to leave them alone.)
+ ins v4.d[0], v3.d[1]
+ ext v16.8b, v5.8b, v5.8b, #1 // A1
+ pmull v16.8h, v16.8b, v3.8b // F = A1*B
+ ext v0.8b, v3.8b, v3.8b, #1 // B1
+ pmull v0.8h, v5.8b, v0.8b // E = A*B1
+ ext v17.8b, v5.8b, v5.8b, #2 // A2
+ pmull v17.8h, v17.8b, v3.8b // H = A2*B
+ ext v19.8b, v3.8b, v3.8b, #2 // B2
+ pmull v19.8h, v5.8b, v19.8b // G = A*B2
+ ext v18.8b, v5.8b, v5.8b, #3 // A3
+ eor v16.16b, v16.16b, v0.16b // L = E + F
+ pmull v18.8h, v18.8b, v3.8b // J = A3*B
+ ext v0.8b, v3.8b, v3.8b, #3 // B3
+ eor v17.16b, v17.16b, v19.16b // M = G + H
+ pmull v0.8h, v5.8b, v0.8b // I = A*B3
+
+ // Here we diverge from the 32-bit version. It computes the following
+ // (instructions reordered for clarity):
+ //
+ // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
+ // vand $t0#hi, $t0#hi, $k48
+ // veor $t0#lo, $t0#lo, $t0#hi
+ //
+ // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
+ // vand $t1#hi, $t1#hi, $k32
+ // veor $t1#lo, $t1#lo, $t1#hi
+ //
+ // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
+ // vand $t2#hi, $t2#hi, $k16
+ // veor $t2#lo, $t2#lo, $t2#hi
+ //
+ // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
+ // vmov.i64 $t3#hi, #0
+ //
+ // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+ // upper halves of SIMD registers, so we must split each half into
+ // separate registers. To compensate, we pair computations up and
+ // parallelize.
+
+ ext v19.8b, v3.8b, v3.8b, #4 // B4
+ eor v18.16b, v18.16b, v0.16b // N = I + J
+ pmull v19.8h, v5.8b, v19.8b // K = A*B4
+
+ // This can probably be scheduled more efficiently. For now, we just
+ // pair up independent instructions.
+ zip1 v20.2d, v16.2d, v17.2d
+ zip1 v22.2d, v18.2d, v19.2d
+ zip2 v21.2d, v16.2d, v17.2d
+ zip2 v23.2d, v18.2d, v19.2d
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ and v21.16b, v21.16b, v24.16b
+ and v23.16b, v23.16b, v25.16b
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ zip1 v16.2d, v20.2d, v21.2d
+ zip1 v18.2d, v22.2d, v23.2d
+ zip2 v17.2d, v20.2d, v21.2d
+ zip2 v19.2d, v22.2d, v23.2d
+
+ ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
+ ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
+ pmull v0.8h, v5.8b, v3.8b // D = A*B
+ ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
+ ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
+ eor v16.16b, v16.16b, v17.16b
+ eor v18.16b, v18.16b, v19.16b
+ eor v0.16b, v0.16b, v16.16b
+ eor v0.16b, v0.16b, v18.16b
+ eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing
+ ext v16.8b, v7.8b, v7.8b, #1 // A1
+ pmull v16.8h, v16.8b, v3.8b // F = A1*B
+ ext v1.8b, v3.8b, v3.8b, #1 // B1
+ pmull v1.8h, v7.8b, v1.8b // E = A*B1
+ ext v17.8b, v7.8b, v7.8b, #2 // A2
+ pmull v17.8h, v17.8b, v3.8b // H = A2*B
+ ext v19.8b, v3.8b, v3.8b, #2 // B2
+ pmull v19.8h, v7.8b, v19.8b // G = A*B2
+ ext v18.8b, v7.8b, v7.8b, #3 // A3
+ eor v16.16b, v16.16b, v1.16b // L = E + F
+ pmull v18.8h, v18.8b, v3.8b // J = A3*B
+ ext v1.8b, v3.8b, v3.8b, #3 // B3
+ eor v17.16b, v17.16b, v19.16b // M = G + H
+ pmull v1.8h, v7.8b, v1.8b // I = A*B3
+
+ // Here we diverge from the 32-bit version. It computes the following
+ // (instructions reordered for clarity):
+ //
+ // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
+ // vand $t0#hi, $t0#hi, $k48
+ // veor $t0#lo, $t0#lo, $t0#hi
+ //
+ // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
+ // vand $t1#hi, $t1#hi, $k32
+ // veor $t1#lo, $t1#lo, $t1#hi
+ //
+ // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
+ // vand $t2#hi, $t2#hi, $k16
+ // veor $t2#lo, $t2#lo, $t2#hi
+ //
+ // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
+ // vmov.i64 $t3#hi, #0
+ //
+ // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+ // upper halves of SIMD registers, so we must split each half into
+ // separate registers. To compensate, we pair computations up and
+ // parallelize.
+
+ ext v19.8b, v3.8b, v3.8b, #4 // B4
+ eor v18.16b, v18.16b, v1.16b // N = I + J
+ pmull v19.8h, v7.8b, v19.8b // K = A*B4
+
+ // This can probably be scheduled more efficiently. For now, we just
+ // pair up independent instructions.
+ zip1 v20.2d, v16.2d, v17.2d
+ zip1 v22.2d, v18.2d, v19.2d
+ zip2 v21.2d, v16.2d, v17.2d
+ zip2 v23.2d, v18.2d, v19.2d
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ and v21.16b, v21.16b, v24.16b
+ and v23.16b, v23.16b, v25.16b
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ zip1 v16.2d, v20.2d, v21.2d
+ zip1 v18.2d, v22.2d, v23.2d
+ zip2 v17.2d, v20.2d, v21.2d
+ zip2 v19.2d, v22.2d, v23.2d
+
+ ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
+ ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
+ pmull v1.8h, v7.8b, v3.8b // D = A*B
+ ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
+ ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
+ eor v16.16b, v16.16b, v17.16b
+ eor v18.16b, v18.16b, v19.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v1.16b, v1.16b, v18.16b
+ ext v16.8b, v6.8b, v6.8b, #1 // A1
+ pmull v16.8h, v16.8b, v4.8b // F = A1*B
+ ext v2.8b, v4.8b, v4.8b, #1 // B1
+ pmull v2.8h, v6.8b, v2.8b // E = A*B1
+ ext v17.8b, v6.8b, v6.8b, #2 // A2
+ pmull v17.8h, v17.8b, v4.8b // H = A2*B
+ ext v19.8b, v4.8b, v4.8b, #2 // B2
+ pmull v19.8h, v6.8b, v19.8b // G = A*B2
+ ext v18.8b, v6.8b, v6.8b, #3 // A3
+ eor v16.16b, v16.16b, v2.16b // L = E + F
+ pmull v18.8h, v18.8b, v4.8b // J = A3*B
+ ext v2.8b, v4.8b, v4.8b, #3 // B3
+ eor v17.16b, v17.16b, v19.16b // M = G + H
+ pmull v2.8h, v6.8b, v2.8b // I = A*B3
+
+ // Here we diverge from the 32-bit version. It computes the following
+ // (instructions reordered for clarity):
+ //
+ // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
+ // vand $t0#hi, $t0#hi, $k48
+ // veor $t0#lo, $t0#lo, $t0#hi
+ //
+ // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
+ // vand $t1#hi, $t1#hi, $k32
+ // veor $t1#lo, $t1#lo, $t1#hi
+ //
+ // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
+ // vand $t2#hi, $t2#hi, $k16
+ // veor $t2#lo, $t2#lo, $t2#hi
+ //
+ // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
+ // vmov.i64 $t3#hi, #0
+ //
+ // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+ // upper halves of SIMD registers, so we must split each half into
+ // separate registers. To compensate, we pair computations up and
+ // parallelize.
+
+ ext v19.8b, v4.8b, v4.8b, #4 // B4
+ eor v18.16b, v18.16b, v2.16b // N = I + J
+ pmull v19.8h, v6.8b, v19.8b // K = A*B4
+
+ // This can probably be scheduled more efficiently. For now, we just
+ // pair up independent instructions.
+ zip1 v20.2d, v16.2d, v17.2d
+ zip1 v22.2d, v18.2d, v19.2d
+ zip2 v21.2d, v16.2d, v17.2d
+ zip2 v23.2d, v18.2d, v19.2d
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ and v21.16b, v21.16b, v24.16b
+ and v23.16b, v23.16b, v25.16b
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ zip1 v16.2d, v20.2d, v21.2d
+ zip1 v18.2d, v22.2d, v23.2d
+ zip2 v17.2d, v20.2d, v21.2d
+ zip2 v19.2d, v22.2d, v23.2d
+
+ ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
+ ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
+ pmull v2.8h, v6.8b, v4.8b // D = A*B
+ ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
+ ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
+ eor v16.16b, v16.16b, v17.16b
+ eor v18.16b, v18.16b, v19.16b
+ eor v2.16b, v2.16b, v16.16b
+ eor v2.16b, v2.16b, v18.16b
+ ext v16.16b, v0.16b, v2.16b, #8
+ eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing
+ eor v1.16b, v1.16b, v2.16b
+ eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi
+ ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result
+ // This is a no-op due to the ins instruction below.
+ // ins v2.d[0], v1.d[1]
+
+ // equivalent of reduction_avx from ghash-x86_64.pl
+ shl v17.2d, v0.2d, #57 // 1st phase
+ shl v18.2d, v0.2d, #62
+ eor v18.16b, v18.16b, v17.16b //
+ shl v17.2d, v0.2d, #63
+ eor v18.16b, v18.16b, v17.16b //
+ // Note Xm contains {Xl.d[1], Xh.d[0]}.
+ eor v18.16b, v18.16b, v1.16b
+ ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0]
+ ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1]
+
+ ushr v18.2d, v0.2d, #1 // 2nd phase
+ eor v2.16b, v2.16b,v0.16b
+ eor v0.16b, v0.16b,v18.16b //
+ ushr v18.2d, v18.2d, #6
+ ushr v0.2d, v0.2d, #1 //
+ eor v0.16b, v0.16b, v2.16b //
+ eor v0.16b, v0.16b, v18.16b //
+
+ subs x3, x3, #16
+ bne Loop_neon
+
+ rev64 v0.16b, v0.16b // byteswap Xi and write
+ ext v0.16b, v0.16b, v0.16b, #8
+ st1 {v0.16b}, [x0]
+
+ ret
+
+
+.section __TEXT,__const
+.align 4
+Lmasks:
+.quad 0x0000ffffffffffff // k48
+.quad 0x00000000ffffffff // k32
+.quad 0x000000000000ffff // k16
+.quad 0x0000000000000000 // k0
+.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
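
(Editorial note on the "Karatsuba pre-processing" and "Karatsuba post-processing" comments in these GHASH translations: they refer to splitting the 128-bit carry-less product into three 64x64 products. Writing A = A1*x^64 + A0 and B = B1*x^64 + B0 over GF(2), the standard identity, stated here only for orientation, is

  A*B = (A1*B1)*x^128
        XOR [ (A1 XOR A0)*(B1 XOR B0) XOR A1*B1 XOR A0*B0 ]*x^64
        XOR (A0*B0)

so the code precomputes A1 XOR A0 once, here the XOR of the two halves of the twisted H loaded at entry, multiplies the three pairs, recombines them in the post-processing XORs, and only then applies the two-phase reduction modulo x^128 + x^7 + x^2 + x + 1.)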
diff --git a/gen/bcm/ghash-neon-armv8-linux.S b/gen/bcm/ghash-neon-armv8-linux.S
new file mode 100644
index 0000000..6203bc6
--- /dev/null
+++ b/gen/bcm/ghash-neon-armv8-linux.S
@@ -0,0 +1,335 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl gcm_init_neon
+.hidden gcm_init_neon
+.type gcm_init_neon,%function
+.align 4
+gcm_init_neon:
+ AARCH64_VALID_CALL_TARGET
+ // This function is adapted from gcm_init_v8. xC2 is t3.
+ ld1 {v17.2d}, [x1] // load H
+ movi v19.16b, #0xe1
+ shl v19.2d, v19.2d, #57 // 0xc2.0
+ ext v3.16b, v17.16b, v17.16b, #8
+ ushr v18.2d, v19.2d, #63
+ dup v17.4s, v17.s[1]
+ ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01
+ ushr v18.2d, v3.2d, #63
+ sshr v17.4s, v17.4s, #31 // broadcast carry bit
+ and v18.16b, v18.16b, v16.16b
+ shl v3.2d, v3.2d, #1
+ ext v18.16b, v18.16b, v18.16b, #8
+ and v16.16b, v16.16b, v17.16b
+ orr v3.16b, v3.16b, v18.16b // H<<<=1
+ eor v5.16b, v3.16b, v16.16b // twisted H
+ st1 {v5.2d}, [x0] // store Htable[0]
+ ret
+.size gcm_init_neon,.-gcm_init_neon
+
+.globl gcm_gmult_neon
+.hidden gcm_gmult_neon
+.type gcm_gmult_neon,%function
+.align 4
+gcm_gmult_neon:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v3.16b}, [x0] // load Xi
+ ld1 {v5.1d}, [x1], #8 // load twisted H
+ ld1 {v6.1d}, [x1]
+ adrp x9, .Lmasks // load constants
+ add x9, x9, :lo12:.Lmasks
+ ld1 {v24.2d, v25.2d}, [x9]
+ rev64 v3.16b, v3.16b // byteswap Xi
+ ext v3.16b, v3.16b, v3.16b, #8
+ eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
+
+ mov x3, #16
+ b .Lgmult_neon
+.size gcm_gmult_neon,.-gcm_gmult_neon
+
+.globl gcm_ghash_neon
+.hidden gcm_ghash_neon
+.type gcm_ghash_neon,%function
+.align 4
+gcm_ghash_neon:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v0.16b}, [x0] // load Xi
+ ld1 {v5.1d}, [x1], #8 // load twisted H
+ ld1 {v6.1d}, [x1]
+ adrp x9, .Lmasks // load constants
+ add x9, x9, :lo12:.Lmasks
+ ld1 {v24.2d, v25.2d}, [x9]
+ rev64 v0.16b, v0.16b // byteswap Xi
+ ext v0.16b, v0.16b, v0.16b, #8
+ eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
+
+.Loop_neon:
+ ld1 {v3.16b}, [x2], #16 // load inp
+ rev64 v3.16b, v3.16b // byteswap inp
+ ext v3.16b, v3.16b, v3.16b, #8
+ eor v3.16b, v3.16b, v0.16b // inp ^= Xi
+
+.Lgmult_neon:
+ // Split the input into v3 and v4. (The upper halves are unused,
+ // so it is okay to leave them alone.)
+ ins v4.d[0], v3.d[1]
+ ext v16.8b, v5.8b, v5.8b, #1 // A1
+ pmull v16.8h, v16.8b, v3.8b // F = A1*B
+ ext v0.8b, v3.8b, v3.8b, #1 // B1
+ pmull v0.8h, v5.8b, v0.8b // E = A*B1
+ ext v17.8b, v5.8b, v5.8b, #2 // A2
+ pmull v17.8h, v17.8b, v3.8b // H = A2*B
+ ext v19.8b, v3.8b, v3.8b, #2 // B2
+ pmull v19.8h, v5.8b, v19.8b // G = A*B2
+ ext v18.8b, v5.8b, v5.8b, #3 // A3
+ eor v16.16b, v16.16b, v0.16b // L = E + F
+ pmull v18.8h, v18.8b, v3.8b // J = A3*B
+ ext v0.8b, v3.8b, v3.8b, #3 // B3
+ eor v17.16b, v17.16b, v19.16b // M = G + H
+ pmull v0.8h, v5.8b, v0.8b // I = A*B3
+
+ // Here we diverge from the 32-bit version. It computes the following
+ // (instructions reordered for clarity):
+ //
+ // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
+ // vand $t0#hi, $t0#hi, $k48
+ // veor $t0#lo, $t0#lo, $t0#hi
+ //
+ // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
+ // vand $t1#hi, $t1#hi, $k32
+ // veor $t1#lo, $t1#lo, $t1#hi
+ //
+ // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
+ // vand $t2#hi, $t2#hi, $k16
+ // veor $t2#lo, $t2#lo, $t2#hi
+ //
+ // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
+ // vmov.i64 $t3#hi, #0
+ //
+ // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+ // upper halves of SIMD registers, so we must split each half into
+ // separate registers. To compensate, we pair computations up and
+ // parallelize.
+
+ ext v19.8b, v3.8b, v3.8b, #4 // B4
+ eor v18.16b, v18.16b, v0.16b // N = I + J
+ pmull v19.8h, v5.8b, v19.8b // K = A*B4
+
+ // This can probably be scheduled more efficiently. For now, we just
+ // pair up independent instructions.
+ zip1 v20.2d, v16.2d, v17.2d
+ zip1 v22.2d, v18.2d, v19.2d
+ zip2 v21.2d, v16.2d, v17.2d
+ zip2 v23.2d, v18.2d, v19.2d
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ and v21.16b, v21.16b, v24.16b
+ and v23.16b, v23.16b, v25.16b
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ zip1 v16.2d, v20.2d, v21.2d
+ zip1 v18.2d, v22.2d, v23.2d
+ zip2 v17.2d, v20.2d, v21.2d
+ zip2 v19.2d, v22.2d, v23.2d
+
+ ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
+ ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
+ pmull v0.8h, v5.8b, v3.8b // D = A*B
+ ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
+ ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
+ eor v16.16b, v16.16b, v17.16b
+ eor v18.16b, v18.16b, v19.16b
+ eor v0.16b, v0.16b, v16.16b
+ eor v0.16b, v0.16b, v18.16b
+ eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing
+ ext v16.8b, v7.8b, v7.8b, #1 // A1
+ pmull v16.8h, v16.8b, v3.8b // F = A1*B
+ ext v1.8b, v3.8b, v3.8b, #1 // B1
+ pmull v1.8h, v7.8b, v1.8b // E = A*B1
+ ext v17.8b, v7.8b, v7.8b, #2 // A2
+ pmull v17.8h, v17.8b, v3.8b // H = A2*B
+ ext v19.8b, v3.8b, v3.8b, #2 // B2
+ pmull v19.8h, v7.8b, v19.8b // G = A*B2
+ ext v18.8b, v7.8b, v7.8b, #3 // A3
+ eor v16.16b, v16.16b, v1.16b // L = E + F
+ pmull v18.8h, v18.8b, v3.8b // J = A3*B
+ ext v1.8b, v3.8b, v3.8b, #3 // B3
+ eor v17.16b, v17.16b, v19.16b // M = G + H
+ pmull v1.8h, v7.8b, v1.8b // I = A*B3
+
+ // Here we diverge from the 32-bit version. It computes the following
+ // (instructions reordered for clarity):
+ //
+ // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
+ // vand $t0#hi, $t0#hi, $k48
+ // veor $t0#lo, $t0#lo, $t0#hi
+ //
+ // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
+ // vand $t1#hi, $t1#hi, $k32
+ // veor $t1#lo, $t1#lo, $t1#hi
+ //
+ // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
+ // vand $t2#hi, $t2#hi, $k16
+ // veor $t2#lo, $t2#lo, $t2#hi
+ //
+ // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
+ // vmov.i64 $t3#hi, #0
+ //
+ // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+ // upper halves of SIMD registers, so we must split each half into
+ // separate registers. To compensate, we pair computations up and
+ // parallelize.
+
+ ext v19.8b, v3.8b, v3.8b, #4 // B4
+ eor v18.16b, v18.16b, v1.16b // N = I + J
+ pmull v19.8h, v7.8b, v19.8b // K = A*B4
+
+ // This can probably be scheduled more efficiently. For now, we just
+ // pair up independent instructions.
+ zip1 v20.2d, v16.2d, v17.2d
+ zip1 v22.2d, v18.2d, v19.2d
+ zip2 v21.2d, v16.2d, v17.2d
+ zip2 v23.2d, v18.2d, v19.2d
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ and v21.16b, v21.16b, v24.16b
+ and v23.16b, v23.16b, v25.16b
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ zip1 v16.2d, v20.2d, v21.2d
+ zip1 v18.2d, v22.2d, v23.2d
+ zip2 v17.2d, v20.2d, v21.2d
+ zip2 v19.2d, v22.2d, v23.2d
+
+ ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
+ ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
+ pmull v1.8h, v7.8b, v3.8b // D = A*B
+ ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
+ ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
+ eor v16.16b, v16.16b, v17.16b
+ eor v18.16b, v18.16b, v19.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v1.16b, v1.16b, v18.16b
+ ext v16.8b, v6.8b, v6.8b, #1 // A1
+ pmull v16.8h, v16.8b, v4.8b // F = A1*B
+ ext v2.8b, v4.8b, v4.8b, #1 // B1
+ pmull v2.8h, v6.8b, v2.8b // E = A*B1
+ ext v17.8b, v6.8b, v6.8b, #2 // A2
+ pmull v17.8h, v17.8b, v4.8b // H = A2*B
+ ext v19.8b, v4.8b, v4.8b, #2 // B2
+ pmull v19.8h, v6.8b, v19.8b // G = A*B2
+ ext v18.8b, v6.8b, v6.8b, #3 // A3
+ eor v16.16b, v16.16b, v2.16b // L = E + F
+ pmull v18.8h, v18.8b, v4.8b // J = A3*B
+ ext v2.8b, v4.8b, v4.8b, #3 // B3
+ eor v17.16b, v17.16b, v19.16b // M = G + H
+ pmull v2.8h, v6.8b, v2.8b // I = A*B3
+
+ // Here we diverge from the 32-bit version. It computes the following
+ // (instructions reordered for clarity):
+ //
+ // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
+ // vand $t0#hi, $t0#hi, $k48
+ // veor $t0#lo, $t0#lo, $t0#hi
+ //
+ // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
+ // vand $t1#hi, $t1#hi, $k32
+ // veor $t1#lo, $t1#lo, $t1#hi
+ //
+ // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
+ // vand $t2#hi, $t2#hi, $k16
+ // veor $t2#lo, $t2#lo, $t2#hi
+ //
+ // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
+ // vmov.i64 $t3#hi, #0
+ //
+ // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+ // upper halves of SIMD registers, so we must split each half into
+ // separate registers. To compensate, we pair computations up and
+ // parallelize.
+
+ ext v19.8b, v4.8b, v4.8b, #4 // B4
+ eor v18.16b, v18.16b, v2.16b // N = I + J
+ pmull v19.8h, v6.8b, v19.8b // K = A*B4
+
+ // This can probably be scheduled more efficiently. For now, we just
+ // pair up independent instructions.
+ zip1 v20.2d, v16.2d, v17.2d
+ zip1 v22.2d, v18.2d, v19.2d
+ zip2 v21.2d, v16.2d, v17.2d
+ zip2 v23.2d, v18.2d, v19.2d
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ and v21.16b, v21.16b, v24.16b
+ and v23.16b, v23.16b, v25.16b
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ zip1 v16.2d, v20.2d, v21.2d
+ zip1 v18.2d, v22.2d, v23.2d
+ zip2 v17.2d, v20.2d, v21.2d
+ zip2 v19.2d, v22.2d, v23.2d
+
+ ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
+ ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
+ pmull v2.8h, v6.8b, v4.8b // D = A*B
+ ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
+ ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
+ eor v16.16b, v16.16b, v17.16b
+ eor v18.16b, v18.16b, v19.16b
+ eor v2.16b, v2.16b, v16.16b
+ eor v2.16b, v2.16b, v18.16b
+ ext v16.16b, v0.16b, v2.16b, #8
+ eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing
+ eor v1.16b, v1.16b, v2.16b
+ eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi
+ ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result
+ // This is a no-op due to the ins instruction below.
+ // ins v2.d[0], v1.d[1]
+
+ // equivalent of reduction_avx from ghash-x86_64.pl
+ shl v17.2d, v0.2d, #57 // 1st phase
+ shl v18.2d, v0.2d, #62
+ eor v18.16b, v18.16b, v17.16b //
+ shl v17.2d, v0.2d, #63
+ eor v18.16b, v18.16b, v17.16b //
+ // Note Xm contains {Xl.d[1], Xh.d[0]}.
+ eor v18.16b, v18.16b, v1.16b
+ ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0]
+ ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1]
+
+ ushr v18.2d, v0.2d, #1 // 2nd phase
+ eor v2.16b, v2.16b,v0.16b
+ eor v0.16b, v0.16b,v18.16b //
+ ushr v18.2d, v18.2d, #6
+ ushr v0.2d, v0.2d, #1 //
+ eor v0.16b, v0.16b, v2.16b //
+ eor v0.16b, v0.16b, v18.16b //
+
+ subs x3, x3, #16
+ bne .Loop_neon
+
+ rev64 v0.16b, v0.16b // byteswap Xi and write
+ ext v0.16b, v0.16b, v0.16b, #8
+ st1 {v0.16b}, [x0]
+
+ ret
+.size gcm_ghash_neon,.-gcm_ghash_neon
+
+.section .rodata
+.align 4
+.Lmasks:
+.quad 0x0000ffffffffffff // k48
+.quad 0x00000000ffffffff // k32
+.quad 0x000000000000ffff // k16
+.quad 0x0000000000000000 // k0
+.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/ghash-neon-armv8-win.S b/gen/bcm/ghash-neon-armv8-win.S
new file mode 100644
index 0000000..d968893
--- /dev/null
+++ b/gen/bcm/ghash-neon-armv8-win.S
@@ -0,0 +1,341 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl gcm_init_neon
+
+.def gcm_init_neon
+ .type 32
+.endef
+.align 4
+gcm_init_neon:
+ AARCH64_VALID_CALL_TARGET
+ // This function is adapted from gcm_init_v8. xC2 is t3.
+ ld1 {v17.2d}, [x1] // load H
+ movi v19.16b, #0xe1
+ shl v19.2d, v19.2d, #57 // 0xc2.0
+ ext v3.16b, v17.16b, v17.16b, #8
+ ushr v18.2d, v19.2d, #63
+ dup v17.4s, v17.s[1]
+ ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01
+ ushr v18.2d, v3.2d, #63
+ sshr v17.4s, v17.4s, #31 // broadcast carry bit
+ and v18.16b, v18.16b, v16.16b
+ shl v3.2d, v3.2d, #1
+ ext v18.16b, v18.16b, v18.16b, #8
+ and v16.16b, v16.16b, v17.16b
+ orr v3.16b, v3.16b, v18.16b // H<<<=1
+ eor v5.16b, v3.16b, v16.16b // twisted H
+ st1 {v5.2d}, [x0] // store Htable[0]
+ ret
+
+
+.globl gcm_gmult_neon
+
+.def gcm_gmult_neon
+ .type 32
+.endef
+.align 4
+gcm_gmult_neon:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v3.16b}, [x0] // load Xi
+ ld1 {v5.1d}, [x1], #8 // load twisted H
+ ld1 {v6.1d}, [x1]
+ adrp x9, Lmasks // load constants
+ add x9, x9, :lo12:Lmasks
+ ld1 {v24.2d, v25.2d}, [x9]
+ rev64 v3.16b, v3.16b // byteswap Xi
+ ext v3.16b, v3.16b, v3.16b, #8
+ eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
+
+ mov x3, #16
+ b Lgmult_neon
+
+
+.globl gcm_ghash_neon
+
+.def gcm_ghash_neon
+ .type 32
+.endef
+.align 4
+gcm_ghash_neon:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v0.16b}, [x0] // load Xi
+ ld1 {v5.1d}, [x1], #8 // load twisted H
+ ld1 {v6.1d}, [x1]
+ adrp x9, Lmasks // load constants
+ add x9, x9, :lo12:Lmasks
+ ld1 {v24.2d, v25.2d}, [x9]
+ rev64 v0.16b, v0.16b // byteswap Xi
+ ext v0.16b, v0.16b, v0.16b, #8
+ eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
+
+Loop_neon:
+ ld1 {v3.16b}, [x2], #16 // load inp
+ rev64 v3.16b, v3.16b // byteswap inp
+ ext v3.16b, v3.16b, v3.16b, #8
+ eor v3.16b, v3.16b, v0.16b // inp ^= Xi
+
+Lgmult_neon:
+ // Split the input into v3 and v4. (The upper halves are unused,
+ // so it is okay to leave them alone.)
+ ins v4.d[0], v3.d[1]
+ ext v16.8b, v5.8b, v5.8b, #1 // A1
+ pmull v16.8h, v16.8b, v3.8b // F = A1*B
+ ext v0.8b, v3.8b, v3.8b, #1 // B1
+ pmull v0.8h, v5.8b, v0.8b // E = A*B1
+ ext v17.8b, v5.8b, v5.8b, #2 // A2
+ pmull v17.8h, v17.8b, v3.8b // H = A2*B
+ ext v19.8b, v3.8b, v3.8b, #2 // B2
+ pmull v19.8h, v5.8b, v19.8b // G = A*B2
+ ext v18.8b, v5.8b, v5.8b, #3 // A3
+ eor v16.16b, v16.16b, v0.16b // L = E + F
+ pmull v18.8h, v18.8b, v3.8b // J = A3*B
+ ext v0.8b, v3.8b, v3.8b, #3 // B3
+ eor v17.16b, v17.16b, v19.16b // M = G + H
+ pmull v0.8h, v5.8b, v0.8b // I = A*B3
+
+ // Here we diverge from the 32-bit version. It computes the following
+ // (instructions reordered for clarity):
+ //
+ // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
+ // vand $t0#hi, $t0#hi, $k48
+ // veor $t0#lo, $t0#lo, $t0#hi
+ //
+ // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
+ // vand $t1#hi, $t1#hi, $k32
+ // veor $t1#lo, $t1#lo, $t1#hi
+ //
+ // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
+ // vand $t2#hi, $t2#hi, $k16
+ // veor $t2#lo, $t2#lo, $t2#hi
+ //
+ // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
+ // vmov.i64 $t3#hi, #0
+ //
+ // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+ // upper halves of SIMD registers, so we must split each half into
+ // separate registers. To compensate, we pair computations up and
+ // parallelize.
+
+ ext v19.8b, v3.8b, v3.8b, #4 // B4
+ eor v18.16b, v18.16b, v0.16b // N = I + J
+ pmull v19.8h, v5.8b, v19.8b // K = A*B4
+
+ // This can probably be scheduled more efficiently. For now, we just
+ // pair up independent instructions.
+ zip1 v20.2d, v16.2d, v17.2d
+ zip1 v22.2d, v18.2d, v19.2d
+ zip2 v21.2d, v16.2d, v17.2d
+ zip2 v23.2d, v18.2d, v19.2d
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ and v21.16b, v21.16b, v24.16b
+ and v23.16b, v23.16b, v25.16b
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ zip1 v16.2d, v20.2d, v21.2d
+ zip1 v18.2d, v22.2d, v23.2d
+ zip2 v17.2d, v20.2d, v21.2d
+ zip2 v19.2d, v22.2d, v23.2d
+
+ ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
+ ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
+ pmull v0.8h, v5.8b, v3.8b // D = A*B
+ ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
+ ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
+ eor v16.16b, v16.16b, v17.16b
+ eor v18.16b, v18.16b, v19.16b
+ eor v0.16b, v0.16b, v16.16b
+ eor v0.16b, v0.16b, v18.16b
+ eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing
+ ext v16.8b, v7.8b, v7.8b, #1 // A1
+ pmull v16.8h, v16.8b, v3.8b // F = A1*B
+ ext v1.8b, v3.8b, v3.8b, #1 // B1
+ pmull v1.8h, v7.8b, v1.8b // E = A*B1
+ ext v17.8b, v7.8b, v7.8b, #2 // A2
+ pmull v17.8h, v17.8b, v3.8b // H = A2*B
+ ext v19.8b, v3.8b, v3.8b, #2 // B2
+ pmull v19.8h, v7.8b, v19.8b // G = A*B2
+ ext v18.8b, v7.8b, v7.8b, #3 // A3
+ eor v16.16b, v16.16b, v1.16b // L = E + F
+ pmull v18.8h, v18.8b, v3.8b // J = A3*B
+ ext v1.8b, v3.8b, v3.8b, #3 // B3
+ eor v17.16b, v17.16b, v19.16b // M = G + H
+ pmull v1.8h, v7.8b, v1.8b // I = A*B3
+
+ // Here we diverge from the 32-bit version. It computes the following
+ // (instructions reordered for clarity):
+ //
+ // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
+ // vand $t0#hi, $t0#hi, $k48
+ // veor $t0#lo, $t0#lo, $t0#hi
+ //
+ // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
+ // vand $t1#hi, $t1#hi, $k32
+ // veor $t1#lo, $t1#lo, $t1#hi
+ //
+ // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
+ // vand $t2#hi, $t2#hi, $k16
+ // veor $t2#lo, $t2#lo, $t2#hi
+ //
+ // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
+ // vmov.i64 $t3#hi, #0
+ //
+ // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+ // upper halves of SIMD registers, so we must split each half into
+ // separate registers. To compensate, we pair computations up and
+ // parallelize.
+
+ ext v19.8b, v3.8b, v3.8b, #4 // B4
+ eor v18.16b, v18.16b, v1.16b // N = I + J
+ pmull v19.8h, v7.8b, v19.8b // K = A*B4
+
+ // This can probably be scheduled more efficiently. For now, we just
+ // pair up independent instructions.
+ zip1 v20.2d, v16.2d, v17.2d
+ zip1 v22.2d, v18.2d, v19.2d
+ zip2 v21.2d, v16.2d, v17.2d
+ zip2 v23.2d, v18.2d, v19.2d
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ and v21.16b, v21.16b, v24.16b
+ and v23.16b, v23.16b, v25.16b
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ zip1 v16.2d, v20.2d, v21.2d
+ zip1 v18.2d, v22.2d, v23.2d
+ zip2 v17.2d, v20.2d, v21.2d
+ zip2 v19.2d, v22.2d, v23.2d
+
+ ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
+ ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
+ pmull v1.8h, v7.8b, v3.8b // D = A*B
+ ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
+ ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
+ eor v16.16b, v16.16b, v17.16b
+ eor v18.16b, v18.16b, v19.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v1.16b, v1.16b, v18.16b
+ ext v16.8b, v6.8b, v6.8b, #1 // A1
+ pmull v16.8h, v16.8b, v4.8b // F = A1*B
+ ext v2.8b, v4.8b, v4.8b, #1 // B1
+ pmull v2.8h, v6.8b, v2.8b // E = A*B1
+ ext v17.8b, v6.8b, v6.8b, #2 // A2
+ pmull v17.8h, v17.8b, v4.8b // H = A2*B
+ ext v19.8b, v4.8b, v4.8b, #2 // B2
+ pmull v19.8h, v6.8b, v19.8b // G = A*B2
+ ext v18.8b, v6.8b, v6.8b, #3 // A3
+ eor v16.16b, v16.16b, v2.16b // L = E + F
+ pmull v18.8h, v18.8b, v4.8b // J = A3*B
+ ext v2.8b, v4.8b, v4.8b, #3 // B3
+ eor v17.16b, v17.16b, v19.16b // M = G + H
+ pmull v2.8h, v6.8b, v2.8b // I = A*B3
+
+ // Here we diverge from the 32-bit version. It computes the following
+ // (instructions reordered for clarity):
+ //
+ // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
+ // vand $t0#hi, $t0#hi, $k48
+ // veor $t0#lo, $t0#lo, $t0#hi
+ //
+ // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
+ // vand $t1#hi, $t1#hi, $k32
+ // veor $t1#lo, $t1#lo, $t1#hi
+ //
+ // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
+ // vand $t2#hi, $t2#hi, $k16
+ // veor $t2#lo, $t2#lo, $t2#hi
+ //
+ // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
+ // vmov.i64 $t3#hi, #0
+ //
+ // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+ // upper halves of SIMD registers, so we must split each half into
+ // separate registers. To compensate, we pair computations up and
+ // parallelize.
+
+ ext v19.8b, v4.8b, v4.8b, #4 // B4
+ eor v18.16b, v18.16b, v2.16b // N = I + J
+ pmull v19.8h, v6.8b, v19.8b // K = A*B4
+
+ // This can probably be scheduled more efficiently. For now, we just
+ // pair up independent instructions.
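+ // Same interleaved mask-and-fold as in the first half above.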
+ zip1 v20.2d, v16.2d, v17.2d
+ zip1 v22.2d, v18.2d, v19.2d
+ zip2 v21.2d, v16.2d, v17.2d
+ zip2 v23.2d, v18.2d, v19.2d
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ and v21.16b, v21.16b, v24.16b
+ and v23.16b, v23.16b, v25.16b
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ zip1 v16.2d, v20.2d, v21.2d
+ zip1 v18.2d, v22.2d, v23.2d
+ zip2 v17.2d, v20.2d, v21.2d
+ zip2 v19.2d, v22.2d, v23.2d
+
+ ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
+ ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
+ pmull v2.8h, v6.8b, v4.8b // D = A*B
+ ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
+ ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
+ eor v16.16b, v16.16b, v17.16b
+ eor v18.16b, v18.16b, v19.16b
+ eor v2.16b, v2.16b, v16.16b
+ eor v2.16b, v2.16b, v18.16b
+ ext v16.16b, v0.16b, v2.16b, #8
+ eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing
+ eor v1.16b, v1.16b, v2.16b
+ eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi
+ ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result
+ // This is a no-op due to the ins instruction below.
+ // ins v2.d[0], v1.d[1]
+
+ // equivalent of reduction_avx from ghash-x86_64.pl
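+ // The left shifts by 57, 62 and 63 (64-7, 64-2 and 64-1) correspond
+ // to the x^7, x^2 and x terms of the GHASH polynomial in this bit
+ // order, so the result can be folded back into Xl.d[1] and Xh.d[0].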
+ shl v17.2d, v0.2d, #57 // 1st phase
+ shl v18.2d, v0.2d, #62
+ eor v18.16b, v18.16b, v17.16b //
+ shl v17.2d, v0.2d, #63
+ eor v18.16b, v18.16b, v17.16b //
+ // Note Xm contains {Xl.d[1], Xh.d[0]}.
+ eor v18.16b, v18.16b, v1.16b
+ ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0]
+ ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1]
+
+ ushr v18.2d, v0.2d, #1 // 2nd phase
+ eor v2.16b, v2.16b,v0.16b
+ eor v0.16b, v0.16b,v18.16b //
+ ushr v18.2d, v18.2d, #6
+ ushr v0.2d, v0.2d, #1 //
+ eor v0.16b, v0.16b, v2.16b //
+ eor v0.16b, v0.16b, v18.16b //
+
+ subs x3, x3, #16
+ bne Loop_neon
+
+ rev64 v0.16b, v0.16b // byteswap Xi and write
+ ext v0.16b, v0.16b, v0.16b, #8
+ st1 {v0.16b}, [x0]
+
+ ret
+
+
+.section .rodata
+.align 4
+Lmasks:
+.quad 0x0000ffffffffffff // k48
+.quad 0x00000000ffffffff // k32
+.quad 0x000000000000ffff // k16
+.quad 0x0000000000000000 // k0
+.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/ghash-ssse3-x86-apple.S b/gen/bcm/ghash-ssse3-x86-apple.S
new file mode 100644
index 0000000..24b1f2f
--- /dev/null
+++ b/gen/bcm/ghash-ssse3-x86-apple.S
@@ -0,0 +1,288 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl _gcm_gmult_ssse3
+.private_extern _gcm_gmult_ssse3
+.align 4
+_gcm_gmult_ssse3:
+L_gcm_gmult_ssse3_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movdqu (%edi),%xmm0
+ call L000pic_point
+L000pic_point:
+ popl %eax
+ movdqa Lreverse_bytes-L000pic_point(%eax),%xmm7
+ movdqa Llow4_mask-L000pic_point(%eax),%xmm2
+.byte 102,15,56,0,199
+ movdqa %xmm2,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm0
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ movl $5,%eax
+L001loop_row_1:
+ movdqa (%esi),%xmm4
+ leal 16(%esi),%esi
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+ pxor %xmm5,%xmm2
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+ subl $1,%eax
+ jnz L001loop_row_1
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movl $5,%eax
+L002loop_row_2:
+ movdqa (%esi),%xmm4
+ leal 16(%esi),%esi
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+ pxor %xmm5,%xmm2
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+ subl $1,%eax
+ jnz L002loop_row_2
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movl $6,%eax
+L003loop_row_3:
+ movdqa (%esi),%xmm4
+ leal 16(%esi),%esi
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+ pxor %xmm5,%xmm2
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+ subl $1,%eax
+ jnz L003loop_row_3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,0,215
+ movdqu %xmm2,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _gcm_ghash_ssse3
+.private_extern _gcm_ghash_ssse3
+.align 4
+_gcm_ghash_ssse3:
+L_gcm_ghash_ssse3_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%edx
+ movl 32(%esp),%ecx
+ movdqu (%edi),%xmm0
+ call L004pic_point
+L004pic_point:
+ popl %ebx
+ movdqa Lreverse_bytes-L004pic_point(%ebx),%xmm7
+ andl $-16,%ecx
+.byte 102,15,56,0,199
+ pxor %xmm3,%xmm3
+L005loop_ghash:
+ movdqa Llow4_mask-L004pic_point(%ebx),%xmm2
+ movdqu (%edx),%xmm1
+.byte 102,15,56,0,207
+ pxor %xmm1,%xmm0
+ movdqa %xmm2,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm0
+ pxor %xmm2,%xmm2
+ movl $5,%eax
+L006loop_row_4:
+ movdqa (%esi),%xmm4
+ leal 16(%esi),%esi
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+ pxor %xmm5,%xmm2
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+ subl $1,%eax
+ jnz L006loop_row_4
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movl $5,%eax
+L007loop_row_5:
+ movdqa (%esi),%xmm4
+ leal 16(%esi),%esi
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+ pxor %xmm5,%xmm2
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+ subl $1,%eax
+ jnz L007loop_row_5
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movl $6,%eax
+L008loop_row_6:
+ movdqa (%esi),%xmm4
+ leal 16(%esi),%esi
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+ pxor %xmm5,%xmm2
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+ subl $1,%eax
+ jnz L008loop_row_6
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movdqa %xmm2,%xmm0
+ leal -256(%esi),%esi
+ leal 16(%edx),%edx
+ subl $16,%ecx
+ jnz L005loop_ghash
+.byte 102,15,56,0,199
+ movdqu %xmm0,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 4,0x90
+Lreverse_bytes:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.align 4,0x90
+Llow4_mask:
+.long 252645135,252645135,252645135,252645135
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/ghash-ssse3-x86-linux.S b/gen/bcm/ghash-ssse3-x86-linux.S
new file mode 100644
index 0000000..445db3b
--- /dev/null
+++ b/gen/bcm/ghash-ssse3-x86-linux.S
@@ -0,0 +1,292 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl gcm_gmult_ssse3
+.hidden gcm_gmult_ssse3
+.type gcm_gmult_ssse3,@function
+.align 16
+gcm_gmult_ssse3:
+.L_gcm_gmult_ssse3_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movdqu (%edi),%xmm0
+ call .L000pic_point
+.L000pic_point:
+ popl %eax
+ movdqa .Lreverse_bytes-.L000pic_point(%eax),%xmm7
+ movdqa .Llow4_mask-.L000pic_point(%eax),%xmm2
+.byte 102,15,56,0,199
+ movdqa %xmm2,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm0
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ movl $5,%eax
+.L001loop_row_1:
+ movdqa (%esi),%xmm4
+ leal 16(%esi),%esi
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+ pxor %xmm5,%xmm2
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+ subl $1,%eax
+ jnz .L001loop_row_1
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movl $5,%eax
+.L002loop_row_2:
+ movdqa (%esi),%xmm4
+ leal 16(%esi),%esi
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+ pxor %xmm5,%xmm2
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+ subl $1,%eax
+ jnz .L002loop_row_2
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movl $6,%eax
+.L003loop_row_3:
+ movdqa (%esi),%xmm4
+ leal 16(%esi),%esi
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+ pxor %xmm5,%xmm2
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+ subl $1,%eax
+ jnz .L003loop_row_3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,0,215
+ movdqu %xmm2,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size gcm_gmult_ssse3,.-.L_gcm_gmult_ssse3_begin
+.globl gcm_ghash_ssse3
+.hidden gcm_ghash_ssse3
+.type gcm_ghash_ssse3,@function
+.align 16
+gcm_ghash_ssse3:
+.L_gcm_ghash_ssse3_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%edx
+ movl 32(%esp),%ecx
+ movdqu (%edi),%xmm0
+ call .L004pic_point
+.L004pic_point:
+ popl %ebx
+ movdqa .Lreverse_bytes-.L004pic_point(%ebx),%xmm7
+ andl $-16,%ecx
+.byte 102,15,56,0,199
+ pxor %xmm3,%xmm3
+.L005loop_ghash:
+ movdqa .Llow4_mask-.L004pic_point(%ebx),%xmm2
+ movdqu (%edx),%xmm1
+.byte 102,15,56,0,207
+ pxor %xmm1,%xmm0
+ movdqa %xmm2,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm0
+ pxor %xmm2,%xmm2
+ movl $5,%eax
+.L006loop_row_4:
+ movdqa (%esi),%xmm4
+ leal 16(%esi),%esi
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+ pxor %xmm5,%xmm2
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+ subl $1,%eax
+ jnz .L006loop_row_4
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movl $5,%eax
+.L007loop_row_5:
+ movdqa (%esi),%xmm4
+ leal 16(%esi),%esi
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+ pxor %xmm5,%xmm2
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+ subl $1,%eax
+ jnz .L007loop_row_5
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movl $6,%eax
+.L008loop_row_6:
+ movdqa (%esi),%xmm4
+ leal 16(%esi),%esi
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+ pxor %xmm5,%xmm2
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+ subl $1,%eax
+ jnz .L008loop_row_6
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movdqa %xmm2,%xmm0
+ leal -256(%esi),%esi
+ leal 16(%edx),%edx
+ subl $16,%ecx
+ jnz .L005loop_ghash
+.byte 102,15,56,0,199
+ movdqu %xmm0,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size gcm_ghash_ssse3,.-.L_gcm_ghash_ssse3_begin
+.align 16
+.Lreverse_bytes:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.align 16
+.Llow4_mask:
+.long 252645135,252645135,252645135,252645135
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/ghash-ssse3-x86-win.asm b/gen/bcm/ghash-ssse3-x86-win.asm
new file mode 100644
index 0000000..52108aa
--- /dev/null
+++ b/gen/bcm/ghash-ssse3-x86-win.asm
@@ -0,0 +1,297 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section code use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section .text code align=64
+%else
+section .text code
+%endif
+global _gcm_gmult_ssse3
+align 16
+_gcm_gmult_ssse3:
+L$_gcm_gmult_ssse3_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov edi,DWORD [20+esp]
+ mov esi,DWORD [24+esp]
+ movdqu xmm0,[edi]
+ call L$000pic_point
+L$000pic_point:
+ pop eax
+ movdqa xmm7,[(L$reverse_bytes-L$000pic_point)+eax]
+ movdqa xmm2,[(L$low4_mask-L$000pic_point)+eax]
+db 102,15,56,0,199
+ movdqa xmm1,xmm2
+ pandn xmm1,xmm0
+ psrld xmm1,4
+ pand xmm0,xmm2
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ mov eax,5
+L$001loop_row_1:
+ movdqa xmm4,[esi]
+ lea esi,[16+esi]
+ movdqa xmm6,xmm2
+db 102,15,58,15,243,1
+ movdqa xmm3,xmm6
+ psrldq xmm2,1
+ movdqa xmm5,xmm4
+db 102,15,56,0,224
+db 102,15,56,0,233
+ pxor xmm2,xmm5
+ movdqa xmm5,xmm4
+ psllq xmm5,60
+ movdqa xmm6,xmm5
+ pslldq xmm6,8
+ pxor xmm3,xmm6
+ psrldq xmm5,8
+ pxor xmm2,xmm5
+ psrlq xmm4,4
+ pxor xmm2,xmm4
+ sub eax,1
+ jnz NEAR L$001loop_row_1
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,5
+ pxor xmm2,xmm3
+ pxor xmm3,xmm3
+ mov eax,5
+L$002loop_row_2:
+ movdqa xmm4,[esi]
+ lea esi,[16+esi]
+ movdqa xmm6,xmm2
+db 102,15,58,15,243,1
+ movdqa xmm3,xmm6
+ psrldq xmm2,1
+ movdqa xmm5,xmm4
+db 102,15,56,0,224
+db 102,15,56,0,233
+ pxor xmm2,xmm5
+ movdqa xmm5,xmm4
+ psllq xmm5,60
+ movdqa xmm6,xmm5
+ pslldq xmm6,8
+ pxor xmm3,xmm6
+ psrldq xmm5,8
+ pxor xmm2,xmm5
+ psrlq xmm4,4
+ pxor xmm2,xmm4
+ sub eax,1
+ jnz NEAR L$002loop_row_2
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,5
+ pxor xmm2,xmm3
+ pxor xmm3,xmm3
+ mov eax,6
+L$003loop_row_3:
+ movdqa xmm4,[esi]
+ lea esi,[16+esi]
+ movdqa xmm6,xmm2
+db 102,15,58,15,243,1
+ movdqa xmm3,xmm6
+ psrldq xmm2,1
+ movdqa xmm5,xmm4
+db 102,15,56,0,224
+db 102,15,56,0,233
+ pxor xmm2,xmm5
+ movdqa xmm5,xmm4
+ psllq xmm5,60
+ movdqa xmm6,xmm5
+ pslldq xmm6,8
+ pxor xmm3,xmm6
+ psrldq xmm5,8
+ pxor xmm2,xmm5
+ psrlq xmm4,4
+ pxor xmm2,xmm4
+ sub eax,1
+ jnz NEAR L$003loop_row_3
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,5
+ pxor xmm2,xmm3
+ pxor xmm3,xmm3
+db 102,15,56,0,215
+ movdqu [edi],xmm2
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ pxor xmm6,xmm6
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _gcm_ghash_ssse3
+align 16
+_gcm_ghash_ssse3:
+L$_gcm_ghash_ssse3_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov edi,DWORD [20+esp]
+ mov esi,DWORD [24+esp]
+ mov edx,DWORD [28+esp]
+ mov ecx,DWORD [32+esp]
+ movdqu xmm0,[edi]
+ call L$004pic_point
+L$004pic_point:
+ pop ebx
+ movdqa xmm7,[(L$reverse_bytes-L$004pic_point)+ebx]
+ and ecx,-16
+db 102,15,56,0,199
+ pxor xmm3,xmm3
+L$005loop_ghash:
+ movdqa xmm2,[(L$low4_mask-L$004pic_point)+ebx]
+ movdqu xmm1,[edx]
+db 102,15,56,0,207
+ pxor xmm0,xmm1
+ movdqa xmm1,xmm2
+ pandn xmm1,xmm0
+ psrld xmm1,4
+ pand xmm0,xmm2
+ pxor xmm2,xmm2
+ mov eax,5
+L$006loop_row_4:
+ movdqa xmm4,[esi]
+ lea esi,[16+esi]
+ movdqa xmm6,xmm2
+db 102,15,58,15,243,1
+ movdqa xmm3,xmm6
+ psrldq xmm2,1
+ movdqa xmm5,xmm4
+db 102,15,56,0,224
+db 102,15,56,0,233
+ pxor xmm2,xmm5
+ movdqa xmm5,xmm4
+ psllq xmm5,60
+ movdqa xmm6,xmm5
+ pslldq xmm6,8
+ pxor xmm3,xmm6
+ psrldq xmm5,8
+ pxor xmm2,xmm5
+ psrlq xmm4,4
+ pxor xmm2,xmm4
+ sub eax,1
+ jnz NEAR L$006loop_row_4
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,5
+ pxor xmm2,xmm3
+ pxor xmm3,xmm3
+ mov eax,5
+L$007loop_row_5:
+ movdqa xmm4,[esi]
+ lea esi,[16+esi]
+ movdqa xmm6,xmm2
+db 102,15,58,15,243,1
+ movdqa xmm3,xmm6
+ psrldq xmm2,1
+ movdqa xmm5,xmm4
+db 102,15,56,0,224
+db 102,15,56,0,233
+ pxor xmm2,xmm5
+ movdqa xmm5,xmm4
+ psllq xmm5,60
+ movdqa xmm6,xmm5
+ pslldq xmm6,8
+ pxor xmm3,xmm6
+ psrldq xmm5,8
+ pxor xmm2,xmm5
+ psrlq xmm4,4
+ pxor xmm2,xmm4
+ sub eax,1
+ jnz NEAR L$007loop_row_5
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,5
+ pxor xmm2,xmm3
+ pxor xmm3,xmm3
+ mov eax,6
+L$008loop_row_6:
+ movdqa xmm4,[esi]
+ lea esi,[16+esi]
+ movdqa xmm6,xmm2
+db 102,15,58,15,243,1
+ movdqa xmm3,xmm6
+ psrldq xmm2,1
+ movdqa xmm5,xmm4
+db 102,15,56,0,224
+db 102,15,56,0,233
+ pxor xmm2,xmm5
+ movdqa xmm5,xmm4
+ psllq xmm5,60
+ movdqa xmm6,xmm5
+ pslldq xmm6,8
+ pxor xmm3,xmm6
+ psrldq xmm5,8
+ pxor xmm2,xmm5
+ psrlq xmm4,4
+ pxor xmm2,xmm4
+ sub eax,1
+ jnz NEAR L$008loop_row_6
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,5
+ pxor xmm2,xmm3
+ pxor xmm3,xmm3
+ movdqa xmm0,xmm2
+ lea esi,[esi-256]
+ lea edx,[16+edx]
+ sub ecx,16
+ jnz NEAR L$005loop_ghash
+db 102,15,56,0,199
+ movdqu [edi],xmm0
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ pxor xmm6,xmm6
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+align 16
+L$reverse_bytes:
+db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+align 16
+L$low4_mask:
+dd 252645135,252645135,252645135,252645135
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/ghash-ssse3-x86_64-apple.S b/gen/bcm/ghash-ssse3-x86_64-apple.S
new file mode 100644
index 0000000..bcbf824
--- /dev/null
+++ b/gen/bcm/ghash-ssse3-x86_64-apple.S
@@ -0,0 +1,423 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text
+
+
+
+
+
+
+.globl _gcm_gmult_ssse3
+.private_extern _gcm_gmult_ssse3
+.p2align 4
+_gcm_gmult_ssse3:
+
+
+_CET_ENDBR
+ movdqu (%rdi),%xmm0
+ movdqa L$reverse_bytes(%rip),%xmm10
+ movdqa L$low4_mask(%rip),%xmm2
+
+
+.byte 102,65,15,56,0,194
+
+
+ movdqa %xmm2,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm0
+
+
+
+
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ movq $5,%rax
+L$oop_row_1:
+ movdqa (%rsi),%xmm4
+ leaq 16(%rsi),%rsi
+
+
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+
+
+
+
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+
+
+ pxor %xmm5,%xmm2
+
+
+
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+
+
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+
+ subq $1,%rax
+ jnz L$oop_row_1
+
+
+
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movq $5,%rax
+L$oop_row_2:
+ movdqa (%rsi),%xmm4
+ leaq 16(%rsi),%rsi
+
+
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+
+
+
+
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+
+
+ pxor %xmm5,%xmm2
+
+
+
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+
+
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+
+ subq $1,%rax
+ jnz L$oop_row_2
+
+
+
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movq $6,%rax
+L$oop_row_3:
+ movdqa (%rsi),%xmm4
+ leaq 16(%rsi),%rsi
+
+
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+
+
+
+
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+
+
+ pxor %xmm5,%xmm2
+
+
+
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+
+
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+
+ subq $1,%rax
+ jnz L$oop_row_3
+
+
+
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+
+.byte 102,65,15,56,0,210
+ movdqu %xmm2,(%rdi)
+
+
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ ret
+
+
+
+
+
+
+
+
+
+.globl _gcm_ghash_ssse3
+.private_extern _gcm_ghash_ssse3
+.p2align 4
+_gcm_ghash_ssse3:
+
+
+_CET_ENDBR
+ movdqu (%rdi),%xmm0
+ movdqa L$reverse_bytes(%rip),%xmm10
+ movdqa L$low4_mask(%rip),%xmm11
+
+
+ andq $-16,%rcx
+
+
+
+.byte 102,65,15,56,0,194
+
+
+ pxor %xmm3,%xmm3
+L$oop_ghash:
+
+ movdqu (%rdx),%xmm1
+.byte 102,65,15,56,0,202
+ pxor %xmm1,%xmm0
+
+
+ movdqa %xmm11,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm11,%xmm0
+
+
+
+
+ pxor %xmm2,%xmm2
+
+ movq $5,%rax
+L$oop_row_4:
+ movdqa (%rsi),%xmm4
+ leaq 16(%rsi),%rsi
+
+
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+
+
+
+
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+
+
+ pxor %xmm5,%xmm2
+
+
+
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+
+
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+
+ subq $1,%rax
+ jnz L$oop_row_4
+
+
+
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movq $5,%rax
+L$oop_row_5:
+ movdqa (%rsi),%xmm4
+ leaq 16(%rsi),%rsi
+
+
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+
+
+
+
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+
+
+ pxor %xmm5,%xmm2
+
+
+
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+
+
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+
+ subq $1,%rax
+ jnz L$oop_row_5
+
+
+
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movq $6,%rax
+L$oop_row_6:
+ movdqa (%rsi),%xmm4
+ leaq 16(%rsi),%rsi
+
+
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+
+
+
+
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+
+
+ pxor %xmm5,%xmm2
+
+
+
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+
+
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+
+ subq $1,%rax
+ jnz L$oop_row_6
+
+
+
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movdqa %xmm2,%xmm0
+
+
+ leaq -256(%rsi),%rsi
+
+
+ leaq 16(%rdx),%rdx
+ subq $16,%rcx
+ jnz L$oop_ghash
+
+
+.byte 102,65,15,56,0,194
+ movdqu %xmm0,(%rdi)
+
+
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ ret
+
+
+
+
+.section __DATA,__const
+.p2align 4
+
+
+L$reverse_bytes:
+.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+L$low4_mask:
+.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+.text
+#endif
diff --git a/gen/bcm/ghash-ssse3-x86_64-linux.S b/gen/bcm/ghash-ssse3-x86_64-linux.S
new file mode 100644
index 0000000..2acb448
--- /dev/null
+++ b/gen/bcm/ghash-ssse3-x86_64-linux.S
@@ -0,0 +1,423 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text
+
+
+
+
+
+.type gcm_gmult_ssse3, @function
+.globl gcm_gmult_ssse3
+.hidden gcm_gmult_ssse3
+.align 16
+gcm_gmult_ssse3:
+.cfi_startproc
+
+_CET_ENDBR
+ movdqu (%rdi),%xmm0
+ movdqa .Lreverse_bytes(%rip),%xmm10
+ movdqa .Llow4_mask(%rip),%xmm2
+
+
+.byte 102,65,15,56,0,194
+
+
+ movdqa %xmm2,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm0
+
+
+
+
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ movq $5,%rax
+.Loop_row_1:
+ movdqa (%rsi),%xmm4
+ leaq 16(%rsi),%rsi
+
+
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+
+
+
+
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+
+
+ pxor %xmm5,%xmm2
+
+
+
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+
+
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+
+ subq $1,%rax
+ jnz .Loop_row_1
+
+
+
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movq $5,%rax
+.Loop_row_2:
+ movdqa (%rsi),%xmm4
+ leaq 16(%rsi),%rsi
+
+
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+
+
+
+
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+
+
+ pxor %xmm5,%xmm2
+
+
+
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+
+
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+
+ subq $1,%rax
+ jnz .Loop_row_2
+
+
+
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movq $6,%rax
+.Loop_row_3:
+ movdqa (%rsi),%xmm4
+ leaq 16(%rsi),%rsi
+
+
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+
+
+
+
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+
+
+ pxor %xmm5,%xmm2
+
+
+
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+
+
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+
+ subq $1,%rax
+ jnz .Loop_row_3
+
+
+
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+
+.byte 102,65,15,56,0,210
+ movdqu %xmm2,(%rdi)
+
+
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ ret
+.cfi_endproc
+
+.size gcm_gmult_ssse3,.-gcm_gmult_ssse3
+
+
+
+
+
+.type gcm_ghash_ssse3, @function
+.globl gcm_ghash_ssse3
+.hidden gcm_ghash_ssse3
+.align 16
+gcm_ghash_ssse3:
+.cfi_startproc
+
+_CET_ENDBR
+ movdqu (%rdi),%xmm0
+ movdqa .Lreverse_bytes(%rip),%xmm10
+ movdqa .Llow4_mask(%rip),%xmm11
+
+
+ andq $-16,%rcx
+
+
+
+.byte 102,65,15,56,0,194
+
+
+ pxor %xmm3,%xmm3
+.Loop_ghash:
+
+ movdqu (%rdx),%xmm1
+.byte 102,65,15,56,0,202
+ pxor %xmm1,%xmm0
+
+
+ movdqa %xmm11,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm11,%xmm0
+
+
+
+
+ pxor %xmm2,%xmm2
+
+ movq $5,%rax
+.Loop_row_4:
+ movdqa (%rsi),%xmm4
+ leaq 16(%rsi),%rsi
+
+
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+
+
+
+
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+
+
+ pxor %xmm5,%xmm2
+
+
+
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+
+
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+
+ subq $1,%rax
+ jnz .Loop_row_4
+
+
+
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movq $5,%rax
+.Loop_row_5:
+ movdqa (%rsi),%xmm4
+ leaq 16(%rsi),%rsi
+
+
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+
+
+
+
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+
+
+ pxor %xmm5,%xmm2
+
+
+
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+
+
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+
+ subq $1,%rax
+ jnz .Loop_row_5
+
+
+
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movq $6,%rax
+.Loop_row_6:
+ movdqa (%rsi),%xmm4
+ leaq 16(%rsi),%rsi
+
+
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+
+
+
+
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+
+
+ pxor %xmm5,%xmm2
+
+
+
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+
+
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+
+ subq $1,%rax
+ jnz .Loop_row_6
+
+
+
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movdqa %xmm2,%xmm0
+
+
+ leaq -256(%rsi),%rsi
+
+
+ leaq 16(%rdx),%rdx
+ subq $16,%rcx
+ jnz .Loop_ghash
+
+
+.byte 102,65,15,56,0,194
+ movdqu %xmm0,(%rdi)
+
+
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ ret
+.cfi_endproc
+
+.size gcm_ghash_ssse3,.-gcm_ghash_ssse3
+
+.section .rodata
+.align 16
+
+
+.Lreverse_bytes:
+.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.Llow4_mask:
+.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+.text
+#endif
diff --git a/gen/bcm/ghash-ssse3-x86_64-win.asm b/gen/bcm/ghash-ssse3-x86_64-win.asm
new file mode 100644
index 0000000..84c5d40
--- /dev/null
+++ b/gen/bcm/ghash-ssse3-x86_64-win.asm
@@ -0,0 +1,497 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .text code align=64
+
+
+
+
+
+
+
+global gcm_gmult_ssse3
+ALIGN 16
+gcm_gmult_ssse3:
+
+$L$SEH_begin_gcm_gmult_ssse3_1:
+_CET_ENDBR
+ sub rsp,40
+$L$SEH_prolog_gcm_gmult_ssse3_2:
+ movdqa XMMWORD[rsp],xmm6
+$L$SEH_prolog_gcm_gmult_ssse3_3:
+ movdqa XMMWORD[16+rsp],xmm10
+$L$SEH_prolog_gcm_gmult_ssse3_4:
+ movdqu xmm0,XMMWORD[rcx]
+ movdqa xmm10,XMMWORD[$L$reverse_bytes]
+ movdqa xmm2,XMMWORD[$L$low4_mask]
+
+
+DB 102,65,15,56,0,194
+
+
+ movdqa xmm1,xmm2
+ pandn xmm1,xmm0
+ psrld xmm1,4
+ pand xmm0,xmm2
+
+
+
+
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ mov rax,5
+$L$oop_row_1:
+ movdqa xmm4,XMMWORD[rdx]
+ lea rdx,[16+rdx]
+
+
+ movdqa xmm6,xmm2
+DB 102,15,58,15,243,1
+ movdqa xmm3,xmm6
+ psrldq xmm2,1
+
+
+
+
+ movdqa xmm5,xmm4
+DB 102,15,56,0,224
+DB 102,15,56,0,233
+
+
+ pxor xmm2,xmm5
+
+
+
+ movdqa xmm5,xmm4
+ psllq xmm5,60
+ movdqa xmm6,xmm5
+ pslldq xmm6,8
+ pxor xmm3,xmm6
+
+
+ psrldq xmm5,8
+ pxor xmm2,xmm5
+ psrlq xmm4,4
+ pxor xmm2,xmm4
+
+ sub rax,1
+ jnz NEAR $L$oop_row_1
+
+
+
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,5
+ pxor xmm2,xmm3
+ pxor xmm3,xmm3
+ mov rax,5
+$L$oop_row_2:
+ movdqa xmm4,XMMWORD[rdx]
+ lea rdx,[16+rdx]
+
+
+ movdqa xmm6,xmm2
+DB 102,15,58,15,243,1
+ movdqa xmm3,xmm6
+ psrldq xmm2,1
+
+
+
+
+ movdqa xmm5,xmm4
+DB 102,15,56,0,224
+DB 102,15,56,0,233
+
+
+ pxor xmm2,xmm5
+
+
+
+ movdqa xmm5,xmm4
+ psllq xmm5,60
+ movdqa xmm6,xmm5
+ pslldq xmm6,8
+ pxor xmm3,xmm6
+
+
+ psrldq xmm5,8
+ pxor xmm2,xmm5
+ psrlq xmm4,4
+ pxor xmm2,xmm4
+
+ sub rax,1
+ jnz NEAR $L$oop_row_2
+
+
+
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,5
+ pxor xmm2,xmm3
+ pxor xmm3,xmm3
+ mov rax,6
+$L$oop_row_3:
+ movdqa xmm4,XMMWORD[rdx]
+ lea rdx,[16+rdx]
+
+
+ movdqa xmm6,xmm2
+DB 102,15,58,15,243,1
+ movdqa xmm3,xmm6
+ psrldq xmm2,1
+
+
+
+
+ movdqa xmm5,xmm4
+DB 102,15,56,0,224
+DB 102,15,56,0,233
+
+
+ pxor xmm2,xmm5
+
+
+
+ movdqa xmm5,xmm4
+ psllq xmm5,60
+ movdqa xmm6,xmm5
+ pslldq xmm6,8
+ pxor xmm3,xmm6
+
+
+ psrldq xmm5,8
+ pxor xmm2,xmm5
+ psrlq xmm4,4
+ pxor xmm2,xmm4
+
+ sub rax,1
+ jnz NEAR $L$oop_row_3
+
+
+
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,5
+ pxor xmm2,xmm3
+ pxor xmm3,xmm3
+
+DB 102,65,15,56,0,210
+ movdqu XMMWORD[rcx],xmm2
+
+
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ pxor xmm6,xmm6
+ movdqa xmm6,XMMWORD[rsp]
+ movdqa xmm10,XMMWORD[16+rsp]
+ add rsp,40
+ ret
+
+$L$SEH_end_gcm_gmult_ssse3_5:
+
+
+
+
+
+
+
+global gcm_ghash_ssse3
+ALIGN 16
+gcm_ghash_ssse3:
+
+$L$SEH_begin_gcm_ghash_ssse3_1:
+_CET_ENDBR
+ sub rsp,56
+$L$SEH_prolog_gcm_ghash_ssse3_2:
+ movdqa XMMWORD[rsp],xmm6
+$L$SEH_prolog_gcm_ghash_ssse3_3:
+ movdqa XMMWORD[16+rsp],xmm10
+$L$SEH_prolog_gcm_ghash_ssse3_4:
+ movdqa XMMWORD[32+rsp],xmm11
+$L$SEH_prolog_gcm_ghash_ssse3_5:
+ movdqu xmm0,XMMWORD[rcx]
+ movdqa xmm10,XMMWORD[$L$reverse_bytes]
+ movdqa xmm11,XMMWORD[$L$low4_mask]
+
+
+ and r9,-16
+
+
+
+DB 102,65,15,56,0,194
+
+
+ pxor xmm3,xmm3
+$L$oop_ghash:
+
+ movdqu xmm1,XMMWORD[r8]
+DB 102,65,15,56,0,202
+ pxor xmm0,xmm1
+
+
+ movdqa xmm1,xmm11
+ pandn xmm1,xmm0
+ psrld xmm1,4
+ pand xmm0,xmm11
+
+
+
+
+ pxor xmm2,xmm2
+
+ mov rax,5
+$L$oop_row_4:
+ movdqa xmm4,XMMWORD[rdx]
+ lea rdx,[16+rdx]
+
+
+ movdqa xmm6,xmm2
+DB 102,15,58,15,243,1
+ movdqa xmm3,xmm6
+ psrldq xmm2,1
+
+
+
+
+ movdqa xmm5,xmm4
+DB 102,15,56,0,224
+DB 102,15,56,0,233
+
+
+ pxor xmm2,xmm5
+
+
+
+ movdqa xmm5,xmm4
+ psllq xmm5,60
+ movdqa xmm6,xmm5
+ pslldq xmm6,8
+ pxor xmm3,xmm6
+
+
+ psrldq xmm5,8
+ pxor xmm2,xmm5
+ psrlq xmm4,4
+ pxor xmm2,xmm4
+
+ sub rax,1
+ jnz NEAR $L$oop_row_4
+
+
+
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,5
+ pxor xmm2,xmm3
+ pxor xmm3,xmm3
+ mov rax,5
+$L$oop_row_5:
+ movdqa xmm4,XMMWORD[rdx]
+ lea rdx,[16+rdx]
+
+
+ movdqa xmm6,xmm2
+DB 102,15,58,15,243,1
+ movdqa xmm3,xmm6
+ psrldq xmm2,1
+
+
+
+
+ movdqa xmm5,xmm4
+DB 102,15,56,0,224
+DB 102,15,56,0,233
+
+
+ pxor xmm2,xmm5
+
+
+
+ movdqa xmm5,xmm4
+ psllq xmm5,60
+ movdqa xmm6,xmm5
+ pslldq xmm6,8
+ pxor xmm3,xmm6
+
+
+ psrldq xmm5,8
+ pxor xmm2,xmm5
+ psrlq xmm4,4
+ pxor xmm2,xmm4
+
+ sub rax,1
+ jnz NEAR $L$oop_row_5
+
+
+
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,5
+ pxor xmm2,xmm3
+ pxor xmm3,xmm3
+ mov rax,6
+$L$oop_row_6:
+ movdqa xmm4,XMMWORD[rdx]
+ lea rdx,[16+rdx]
+
+
+ movdqa xmm6,xmm2
+DB 102,15,58,15,243,1
+ movdqa xmm3,xmm6
+ psrldq xmm2,1
+
+
+
+
+ movdqa xmm5,xmm4
+DB 102,15,56,0,224
+DB 102,15,56,0,233
+
+
+ pxor xmm2,xmm5
+
+
+
+ movdqa xmm5,xmm4
+ psllq xmm5,60
+ movdqa xmm6,xmm5
+ pslldq xmm6,8
+ pxor xmm3,xmm6
+
+
+ psrldq xmm5,8
+ pxor xmm2,xmm5
+ psrlq xmm4,4
+ pxor xmm2,xmm4
+
+ sub rax,1
+ jnz NEAR $L$oop_row_6
+
+
+
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,1
+ pxor xmm2,xmm3
+ psrlq xmm3,5
+ pxor xmm2,xmm3
+ pxor xmm3,xmm3
+ movdqa xmm0,xmm2
+
+
+ lea rdx,[((-256))+rdx]
+
+
+ lea r8,[16+r8]
+ sub r9,16
+ jnz NEAR $L$oop_ghash
+
+
+DB 102,65,15,56,0,194
+ movdqu XMMWORD[rcx],xmm0
+
+
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ pxor xmm6,xmm6
+ movdqa xmm6,XMMWORD[rsp]
+ movdqa xmm10,XMMWORD[16+rsp]
+ movdqa xmm11,XMMWORD[32+rsp]
+ add rsp,56
+ ret
+
+$L$SEH_end_gcm_ghash_ssse3_6:
+
+
+section .rdata rdata align=8
+ALIGN 16
+
+
+$L$reverse_bytes:
+ DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+
+$L$low4_mask:
+ DQ 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f
+section .text
+
+section .pdata rdata align=4
+ALIGN 4
+ DD $L$SEH_begin_gcm_gmult_ssse3_1 wrt ..imagebase
+ DD $L$SEH_end_gcm_gmult_ssse3_5 wrt ..imagebase
+ DD $L$SEH_info_gcm_gmult_ssse3_0 wrt ..imagebase
+
+ DD $L$SEH_begin_gcm_ghash_ssse3_1 wrt ..imagebase
+ DD $L$SEH_end_gcm_ghash_ssse3_6 wrt ..imagebase
+ DD $L$SEH_info_gcm_ghash_ssse3_0 wrt ..imagebase
+
+
+section .xdata rdata align=8
+ALIGN 4
+$L$SEH_info_gcm_gmult_ssse3_0:
+ DB 1
+ DB $L$SEH_prolog_gcm_gmult_ssse3_4-$L$SEH_begin_gcm_gmult_ssse3_1
+ DB 5
+ DB 0
+ DB $L$SEH_prolog_gcm_gmult_ssse3_4-$L$SEH_begin_gcm_gmult_ssse3_1
+ DB 168
+ DW 1
+ DB $L$SEH_prolog_gcm_gmult_ssse3_3-$L$SEH_begin_gcm_gmult_ssse3_1
+ DB 104
+ DW 0
+ DB $L$SEH_prolog_gcm_gmult_ssse3_2-$L$SEH_begin_gcm_gmult_ssse3_1
+ DB 66
+
+$L$SEH_info_gcm_ghash_ssse3_0:
+ DB 1
+ DB $L$SEH_prolog_gcm_ghash_ssse3_5-$L$SEH_begin_gcm_ghash_ssse3_1
+ DB 7
+ DB 0
+ DB $L$SEH_prolog_gcm_ghash_ssse3_5-$L$SEH_begin_gcm_ghash_ssse3_1
+ DB 184
+ DW 2
+ DB $L$SEH_prolog_gcm_ghash_ssse3_4-$L$SEH_begin_gcm_ghash_ssse3_1
+ DB 168
+ DW 1
+ DB $L$SEH_prolog_gcm_ghash_ssse3_3-$L$SEH_begin_gcm_ghash_ssse3_1
+ DB 104
+ DW 0
+ DB $L$SEH_prolog_gcm_ghash_ssse3_2-$L$SEH_begin_gcm_ghash_ssse3_1
+ DB 98
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/ghash-x86-apple.S b/gen/bcm/ghash-x86-apple.S
new file mode 100644
index 0000000..a178b74
--- /dev/null
+++ b/gen/bcm/ghash-x86-apple.S
@@ -0,0 +1,322 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl _gcm_init_clmul
+.private_extern _gcm_init_clmul
+.align 4
+_gcm_init_clmul:
+L_gcm_init_clmul_begin:
+ movl 4(%esp),%edx
+ movl 8(%esp),%eax
+ call L000pic
+L000pic:
+ popl %ecx
+ leal Lbswap-L000pic(%ecx),%ecx
+ movdqu (%eax),%xmm2
+ pshufd $78,%xmm2,%xmm2
+ pshufd $255,%xmm2,%xmm4
+ movdqa %xmm2,%xmm3
+ psllq $1,%xmm2
+ pxor %xmm5,%xmm5
+ psrlq $63,%xmm3
+ pcmpgtd %xmm4,%xmm5
+ pslldq $8,%xmm3
+ por %xmm3,%xmm2
+ pand 16(%ecx),%xmm5
+ pxor %xmm5,%xmm2
+ movdqa %xmm2,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ xorps %xmm0,%xmm3
+ xorps %xmm1,%xmm3
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm2,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm2,%xmm3
+ movdqu %xmm2,(%edx)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,16(%edx)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,32(%edx)
+ ret
+.globl _gcm_gmult_clmul
+.private_extern _gcm_gmult_clmul
+.align 4
+_gcm_gmult_clmul:
+L_gcm_gmult_clmul_begin:
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ call L001pic
+L001pic:
+ popl %ecx
+ leal Lbswap-L001pic(%ecx),%ecx
+ movdqu (%eax),%xmm0
+ movdqa (%ecx),%xmm5
+ movups (%edx),%xmm2
+.byte 102,15,56,0,197
+ movups 32(%edx),%xmm4
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ xorps %xmm0,%xmm3
+ xorps %xmm1,%xmm3
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,197
+ movdqu %xmm0,(%eax)
+ ret
+.globl _gcm_ghash_clmul
+.private_extern _gcm_ghash_clmul
+.align 4
+_gcm_ghash_clmul:
+L_gcm_ghash_clmul_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%eax
+ movl 24(%esp),%edx
+ movl 28(%esp),%esi
+ movl 32(%esp),%ebx
+ call L002pic
+L002pic:
+ popl %ecx
+ leal Lbswap-L002pic(%ecx),%ecx
+ movdqu (%eax),%xmm0
+ movdqa (%ecx),%xmm5
+ movdqu (%edx),%xmm2
+.byte 102,15,56,0,197
+ subl $16,%ebx
+ jz L003odd_tail
+ movdqu (%esi),%xmm3
+ movdqu 16(%esi),%xmm6
+.byte 102,15,56,0,221
+.byte 102,15,56,0,245
+ movdqu 32(%edx),%xmm5
+ pxor %xmm3,%xmm0
+ pshufd $78,%xmm6,%xmm3
+ movdqa %xmm6,%xmm7
+ pxor %xmm6,%xmm3
+ leal 32(%esi),%esi
+.byte 102,15,58,68,242,0
+.byte 102,15,58,68,250,17
+.byte 102,15,58,68,221,0
+ movups 16(%edx),%xmm2
+ nop
+ subl $32,%ebx
+ jbe L004even_tail
+ jmp L005mod_loop
+.align 5,0x90
+L005mod_loop:
+ pshufd $78,%xmm0,%xmm4
+ movdqa %xmm0,%xmm1
+ pxor %xmm0,%xmm4
+ nop
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,229,16
+ movups (%edx),%xmm2
+ xorps %xmm6,%xmm0
+ movdqa (%ecx),%xmm5
+ xorps %xmm7,%xmm1
+ movdqu (%esi),%xmm7
+ pxor %xmm0,%xmm3
+ movdqu 16(%esi),%xmm6
+ pxor %xmm1,%xmm3
+.byte 102,15,56,0,253
+ pxor %xmm3,%xmm4
+ movdqa %xmm4,%xmm3
+ psrldq $8,%xmm4
+ pslldq $8,%xmm3
+ pxor %xmm4,%xmm1
+ pxor %xmm3,%xmm0
+.byte 102,15,56,0,245
+ pxor %xmm7,%xmm1
+ movdqa %xmm6,%xmm7
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+.byte 102,15,58,68,242,0
+ movups 32(%edx),%xmm5
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ pshufd $78,%xmm7,%xmm3
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm7,%xmm3
+ pxor %xmm4,%xmm1
+.byte 102,15,58,68,250,17
+ movups 16(%edx),%xmm2
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+.byte 102,15,58,68,221,0
+ leal 32(%esi),%esi
+ subl $32,%ebx
+ ja L005mod_loop
+L004even_tail:
+ pshufd $78,%xmm0,%xmm4
+ movdqa %xmm0,%xmm1
+ pxor %xmm0,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,229,16
+ movdqa (%ecx),%xmm5
+ xorps %xmm6,%xmm0
+ xorps %xmm7,%xmm1
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pxor %xmm3,%xmm4
+ movdqa %xmm4,%xmm3
+ psrldq $8,%xmm4
+ pslldq $8,%xmm3
+ pxor %xmm4,%xmm1
+ pxor %xmm3,%xmm0
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ testl %ebx,%ebx
+ jnz L006done
+ movups (%edx),%xmm2
+L003odd_tail:
+ movdqu (%esi),%xmm3
+.byte 102,15,56,0,221
+ pxor %xmm3,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ xorps %xmm0,%xmm3
+ xorps %xmm1,%xmm3
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+L006done:
+.byte 102,15,56,0,197
+ movdqu %xmm0,(%eax)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 6,0x90
+Lbswap:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
+.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
+.byte 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
+.byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
+.byte 0
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/ghash-x86-linux.S b/gen/bcm/ghash-x86-linux.S
new file mode 100644
index 0000000..c897efc
--- /dev/null
+++ b/gen/bcm/ghash-x86-linux.S
@@ -0,0 +1,328 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl gcm_init_clmul
+.hidden gcm_init_clmul
+.type gcm_init_clmul,@function
+.align 16
+gcm_init_clmul:
+.L_gcm_init_clmul_begin:
+ movl 4(%esp),%edx
+ movl 8(%esp),%eax
+ call .L000pic
+.L000pic:
+ popl %ecx
+ leal .Lbswap-.L000pic(%ecx),%ecx
+ movdqu (%eax),%xmm2
+ pshufd $78,%xmm2,%xmm2
+ pshufd $255,%xmm2,%xmm4
+ movdqa %xmm2,%xmm3
+ psllq $1,%xmm2
+ pxor %xmm5,%xmm5
+ psrlq $63,%xmm3
+ pcmpgtd %xmm4,%xmm5
+ pslldq $8,%xmm3
+ por %xmm3,%xmm2
+ pand 16(%ecx),%xmm5
+ pxor %xmm5,%xmm2
+ movdqa %xmm2,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ xorps %xmm0,%xmm3
+ xorps %xmm1,%xmm3
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm2,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm2,%xmm3
+ movdqu %xmm2,(%edx)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,16(%edx)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,32(%edx)
+ ret
+.size gcm_init_clmul,.-.L_gcm_init_clmul_begin
+.globl gcm_gmult_clmul
+.hidden gcm_gmult_clmul
+.type gcm_gmult_clmul,@function
+.align 16
+gcm_gmult_clmul:
+.L_gcm_gmult_clmul_begin:
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ call .L001pic
+.L001pic:
+ popl %ecx
+ leal .Lbswap-.L001pic(%ecx),%ecx
+ movdqu (%eax),%xmm0
+ movdqa (%ecx),%xmm5
+ movups (%edx),%xmm2
+.byte 102,15,56,0,197
+ movups 32(%edx),%xmm4
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ xorps %xmm0,%xmm3
+ xorps %xmm1,%xmm3
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,197
+ movdqu %xmm0,(%eax)
+ ret
+.size gcm_gmult_clmul,.-.L_gcm_gmult_clmul_begin
+.globl gcm_ghash_clmul
+.hidden gcm_ghash_clmul
+.type gcm_ghash_clmul,@function
+.align 16
+gcm_ghash_clmul:
+.L_gcm_ghash_clmul_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%eax
+ movl 24(%esp),%edx
+ movl 28(%esp),%esi
+ movl 32(%esp),%ebx
+ call .L002pic
+.L002pic:
+ popl %ecx
+ leal .Lbswap-.L002pic(%ecx),%ecx
+ movdqu (%eax),%xmm0
+ movdqa (%ecx),%xmm5
+ movdqu (%edx),%xmm2
+.byte 102,15,56,0,197
+ subl $16,%ebx
+ jz .L003odd_tail
+ movdqu (%esi),%xmm3
+ movdqu 16(%esi),%xmm6
+.byte 102,15,56,0,221
+.byte 102,15,56,0,245
+ movdqu 32(%edx),%xmm5
+ pxor %xmm3,%xmm0
+ pshufd $78,%xmm6,%xmm3
+ movdqa %xmm6,%xmm7
+ pxor %xmm6,%xmm3
+ leal 32(%esi),%esi
+.byte 102,15,58,68,242,0
+.byte 102,15,58,68,250,17
+.byte 102,15,58,68,221,0
+ movups 16(%edx),%xmm2
+ nop
+ subl $32,%ebx
+ jbe .L004even_tail
+ jmp .L005mod_loop
+.align 32
+.L005mod_loop:
+ pshufd $78,%xmm0,%xmm4
+ movdqa %xmm0,%xmm1
+ pxor %xmm0,%xmm4
+ nop
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,229,16
+ movups (%edx),%xmm2
+ xorps %xmm6,%xmm0
+ movdqa (%ecx),%xmm5
+ xorps %xmm7,%xmm1
+ movdqu (%esi),%xmm7
+ pxor %xmm0,%xmm3
+ movdqu 16(%esi),%xmm6
+ pxor %xmm1,%xmm3
+.byte 102,15,56,0,253
+ pxor %xmm3,%xmm4
+ movdqa %xmm4,%xmm3
+ psrldq $8,%xmm4
+ pslldq $8,%xmm3
+ pxor %xmm4,%xmm1
+ pxor %xmm3,%xmm0
+.byte 102,15,56,0,245
+ pxor %xmm7,%xmm1
+ movdqa %xmm6,%xmm7
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+.byte 102,15,58,68,242,0
+ movups 32(%edx),%xmm5
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ pshufd $78,%xmm7,%xmm3
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm7,%xmm3
+ pxor %xmm4,%xmm1
+.byte 102,15,58,68,250,17
+ movups 16(%edx),%xmm2
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+.byte 102,15,58,68,221,0
+ leal 32(%esi),%esi
+ subl $32,%ebx
+ ja .L005mod_loop
+.L004even_tail:
+ pshufd $78,%xmm0,%xmm4
+ movdqa %xmm0,%xmm1
+ pxor %xmm0,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,229,16
+ movdqa (%ecx),%xmm5
+ xorps %xmm6,%xmm0
+ xorps %xmm7,%xmm1
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pxor %xmm3,%xmm4
+ movdqa %xmm4,%xmm3
+ psrldq $8,%xmm4
+ pslldq $8,%xmm3
+ pxor %xmm4,%xmm1
+ pxor %xmm3,%xmm0
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ testl %ebx,%ebx
+ jnz .L006done
+ movups (%edx),%xmm2
+.L003odd_tail:
+ movdqu (%esi),%xmm3
+.byte 102,15,56,0,221
+ pxor %xmm3,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ xorps %xmm0,%xmm3
+ xorps %xmm1,%xmm3
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+.L006done:
+.byte 102,15,56,0,197
+ movdqu %xmm0,(%eax)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size gcm_ghash_clmul,.-.L_gcm_ghash_clmul_begin
+.align 64
+.Lbswap:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
+.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
+.byte 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
+.byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
+.byte 0
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/ghash-x86-win.asm b/gen/bcm/ghash-x86-win.asm
new file mode 100644
index 0000000..3f6c707
--- /dev/null
+++ b/gen/bcm/ghash-x86-win.asm
@@ -0,0 +1,330 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section code use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section .text code align=64
+%else
+section .text code
+%endif
+global _gcm_init_clmul
+align 16
+_gcm_init_clmul:
+L$_gcm_init_clmul_begin:
+ mov edx,DWORD [4+esp]
+ mov eax,DWORD [8+esp]
+ call L$000pic
+L$000pic:
+ pop ecx
+ lea ecx,[(L$bswap-L$000pic)+ecx]
+ movdqu xmm2,[eax]
+ pshufd xmm2,xmm2,78
+ pshufd xmm4,xmm2,255
+ movdqa xmm3,xmm2
+ psllq xmm2,1
+ pxor xmm5,xmm5
+ psrlq xmm3,63
+ pcmpgtd xmm5,xmm4
+ pslldq xmm3,8
+ por xmm2,xmm3
+ pand xmm5,[16+ecx]
+ pxor xmm2,xmm5
+ movdqa xmm0,xmm2
+ movdqa xmm1,xmm0
+ pshufd xmm3,xmm0,78
+ pshufd xmm4,xmm2,78
+ pxor xmm3,xmm0
+ pxor xmm4,xmm2
+db 102,15,58,68,194,0
+db 102,15,58,68,202,17
+db 102,15,58,68,220,0
+ xorps xmm3,xmm0
+ xorps xmm3,xmm1
+ movdqa xmm4,xmm3
+ psrldq xmm3,8
+ pslldq xmm4,8
+ pxor xmm1,xmm3
+ pxor xmm0,xmm4
+ movdqa xmm4,xmm0
+ movdqa xmm3,xmm0
+ psllq xmm0,5
+ pxor xmm3,xmm0
+ psllq xmm0,1
+ pxor xmm0,xmm3
+ psllq xmm0,57
+ movdqa xmm3,xmm0
+ pslldq xmm0,8
+ psrldq xmm3,8
+ pxor xmm0,xmm4
+ pxor xmm1,xmm3
+ movdqa xmm4,xmm0
+ psrlq xmm0,1
+ pxor xmm1,xmm4
+ pxor xmm4,xmm0
+ psrlq xmm0,5
+ pxor xmm0,xmm4
+ psrlq xmm0,1
+ pxor xmm0,xmm1
+ pshufd xmm3,xmm2,78
+ pshufd xmm4,xmm0,78
+ pxor xmm3,xmm2
+ movdqu [edx],xmm2
+ pxor xmm4,xmm0
+ movdqu [16+edx],xmm0
+db 102,15,58,15,227,8
+ movdqu [32+edx],xmm4
+ ret
+global _gcm_gmult_clmul
+align 16
+_gcm_gmult_clmul:
+L$_gcm_gmult_clmul_begin:
+ mov eax,DWORD [4+esp]
+ mov edx,DWORD [8+esp]
+ call L$001pic
+L$001pic:
+ pop ecx
+ lea ecx,[(L$bswap-L$001pic)+ecx]
+ movdqu xmm0,[eax]
+ movdqa xmm5,[ecx]
+ movups xmm2,[edx]
+db 102,15,56,0,197
+ movups xmm4,[32+edx]
+ movdqa xmm1,xmm0
+ pshufd xmm3,xmm0,78
+ pxor xmm3,xmm0
+db 102,15,58,68,194,0
+db 102,15,58,68,202,17
+db 102,15,58,68,220,0
+ xorps xmm3,xmm0
+ xorps xmm3,xmm1
+ movdqa xmm4,xmm3
+ psrldq xmm3,8
+ pslldq xmm4,8
+ pxor xmm1,xmm3
+ pxor xmm0,xmm4
+ movdqa xmm4,xmm0
+ movdqa xmm3,xmm0
+ psllq xmm0,5
+ pxor xmm3,xmm0
+ psllq xmm0,1
+ pxor xmm0,xmm3
+ psllq xmm0,57
+ movdqa xmm3,xmm0
+ pslldq xmm0,8
+ psrldq xmm3,8
+ pxor xmm0,xmm4
+ pxor xmm1,xmm3
+ movdqa xmm4,xmm0
+ psrlq xmm0,1
+ pxor xmm1,xmm4
+ pxor xmm4,xmm0
+ psrlq xmm0,5
+ pxor xmm0,xmm4
+ psrlq xmm0,1
+ pxor xmm0,xmm1
+db 102,15,56,0,197
+ movdqu [eax],xmm0
+ ret
+global _gcm_ghash_clmul
+align 16
+_gcm_ghash_clmul:
+L$_gcm_ghash_clmul_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov eax,DWORD [20+esp]
+ mov edx,DWORD [24+esp]
+ mov esi,DWORD [28+esp]
+ mov ebx,DWORD [32+esp]
+ call L$002pic
+L$002pic:
+ pop ecx
+ lea ecx,[(L$bswap-L$002pic)+ecx]
+ movdqu xmm0,[eax]
+ movdqa xmm5,[ecx]
+ movdqu xmm2,[edx]
+db 102,15,56,0,197
+ sub ebx,16
+ jz NEAR L$003odd_tail
+ movdqu xmm3,[esi]
+ movdqu xmm6,[16+esi]
+db 102,15,56,0,221
+db 102,15,56,0,245
+ movdqu xmm5,[32+edx]
+ pxor xmm0,xmm3
+ pshufd xmm3,xmm6,78
+ movdqa xmm7,xmm6
+ pxor xmm3,xmm6
+ lea esi,[32+esi]
+db 102,15,58,68,242,0
+db 102,15,58,68,250,17
+db 102,15,58,68,221,0
+ movups xmm2,[16+edx]
+ nop
+ sub ebx,32
+ jbe NEAR L$004even_tail
+ jmp NEAR L$005mod_loop
+align 32
+L$005mod_loop:
+ pshufd xmm4,xmm0,78
+ movdqa xmm1,xmm0
+ pxor xmm4,xmm0
+ nop
+db 102,15,58,68,194,0
+db 102,15,58,68,202,17
+db 102,15,58,68,229,16
+ movups xmm2,[edx]
+ xorps xmm0,xmm6
+ movdqa xmm5,[ecx]
+ xorps xmm1,xmm7
+ movdqu xmm7,[esi]
+ pxor xmm3,xmm0
+ movdqu xmm6,[16+esi]
+ pxor xmm3,xmm1
+db 102,15,56,0,253
+ pxor xmm4,xmm3
+ movdqa xmm3,xmm4
+ psrldq xmm4,8
+ pslldq xmm3,8
+ pxor xmm1,xmm4
+ pxor xmm0,xmm3
+db 102,15,56,0,245
+ pxor xmm1,xmm7
+ movdqa xmm7,xmm6
+ movdqa xmm4,xmm0
+ movdqa xmm3,xmm0
+ psllq xmm0,5
+ pxor xmm3,xmm0
+ psllq xmm0,1
+ pxor xmm0,xmm3
+db 102,15,58,68,242,0
+ movups xmm5,[32+edx]
+ psllq xmm0,57
+ movdqa xmm3,xmm0
+ pslldq xmm0,8
+ psrldq xmm3,8
+ pxor xmm0,xmm4
+ pxor xmm1,xmm3
+ pshufd xmm3,xmm7,78
+ movdqa xmm4,xmm0
+ psrlq xmm0,1
+ pxor xmm3,xmm7
+ pxor xmm1,xmm4
+db 102,15,58,68,250,17
+ movups xmm2,[16+edx]
+ pxor xmm4,xmm0
+ psrlq xmm0,5
+ pxor xmm0,xmm4
+ psrlq xmm0,1
+ pxor xmm0,xmm1
+db 102,15,58,68,221,0
+ lea esi,[32+esi]
+ sub ebx,32
+ ja NEAR L$005mod_loop
+L$004even_tail:
+ pshufd xmm4,xmm0,78
+ movdqa xmm1,xmm0
+ pxor xmm4,xmm0
+db 102,15,58,68,194,0
+db 102,15,58,68,202,17
+db 102,15,58,68,229,16
+ movdqa xmm5,[ecx]
+ xorps xmm0,xmm6
+ xorps xmm1,xmm7
+ pxor xmm3,xmm0
+ pxor xmm3,xmm1
+ pxor xmm4,xmm3
+ movdqa xmm3,xmm4
+ psrldq xmm4,8
+ pslldq xmm3,8
+ pxor xmm1,xmm4
+ pxor xmm0,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm3,xmm0
+ psllq xmm0,5
+ pxor xmm3,xmm0
+ psllq xmm0,1
+ pxor xmm0,xmm3
+ psllq xmm0,57
+ movdqa xmm3,xmm0
+ pslldq xmm0,8
+ psrldq xmm3,8
+ pxor xmm0,xmm4
+ pxor xmm1,xmm3
+ movdqa xmm4,xmm0
+ psrlq xmm0,1
+ pxor xmm1,xmm4
+ pxor xmm4,xmm0
+ psrlq xmm0,5
+ pxor xmm0,xmm4
+ psrlq xmm0,1
+ pxor xmm0,xmm1
+ test ebx,ebx
+ jnz NEAR L$006done
+ movups xmm2,[edx]
+L$003odd_tail:
+ movdqu xmm3,[esi]
+db 102,15,56,0,221
+ pxor xmm0,xmm3
+ movdqa xmm1,xmm0
+ pshufd xmm3,xmm0,78
+ pshufd xmm4,xmm2,78
+ pxor xmm3,xmm0
+ pxor xmm4,xmm2
+db 102,15,58,68,194,0
+db 102,15,58,68,202,17
+db 102,15,58,68,220,0
+ xorps xmm3,xmm0
+ xorps xmm3,xmm1
+ movdqa xmm4,xmm3
+ psrldq xmm3,8
+ pslldq xmm4,8
+ pxor xmm1,xmm3
+ pxor xmm0,xmm4
+ movdqa xmm4,xmm0
+ movdqa xmm3,xmm0
+ psllq xmm0,5
+ pxor xmm3,xmm0
+ psllq xmm0,1
+ pxor xmm0,xmm3
+ psllq xmm0,57
+ movdqa xmm3,xmm0
+ pslldq xmm0,8
+ psrldq xmm3,8
+ pxor xmm0,xmm4
+ pxor xmm1,xmm3
+ movdqa xmm4,xmm0
+ psrlq xmm0,1
+ pxor xmm1,xmm4
+ pxor xmm4,xmm0
+ psrlq xmm0,5
+ pxor xmm0,xmm4
+ psrlq xmm0,1
+ pxor xmm0,xmm1
+L$006done:
+db 102,15,56,0,197
+ movdqu [eax],xmm0
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+align 64
+L$bswap:
+db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+db 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
+db 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
+db 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
+db 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
+db 0
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/ghash-x86_64-apple.S b/gen/bcm/ghash-x86_64-apple.S
new file mode 100644
index 0000000..909d659
--- /dev/null
+++ b/gen/bcm/ghash-x86_64-apple.S
@@ -0,0 +1,1125 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text
+.globl _gcm_init_clmul
+.private_extern _gcm_init_clmul
+
+.p2align 4
+_gcm_init_clmul:
+
+
+_CET_ENDBR
+L$_init_clmul:
+ movdqu (%rsi),%xmm2
+ pshufd $78,%xmm2,%xmm2
+
+
+ pshufd $255,%xmm2,%xmm4
+ movdqa %xmm2,%xmm3
+ psllq $1,%xmm2
+ pxor %xmm5,%xmm5
+ psrlq $63,%xmm3
+ pcmpgtd %xmm4,%xmm5
+ pslldq $8,%xmm3
+ por %xmm3,%xmm2
+
+
+ pand L$0x1c2_polynomial(%rip),%xmm5
+ pxor %xmm5,%xmm2
+
+
+ pshufd $78,%xmm2,%xmm6
+ movdqa %xmm2,%xmm0
+ pxor %xmm2,%xmm6
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm2,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm2,%xmm3
+ movdqu %xmm2,0(%rdi)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,16(%rdi)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,32(%rdi)
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ movdqa %xmm0,%xmm5
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm5,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm5,%xmm3
+ movdqu %xmm5,48(%rdi)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,64(%rdi)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,80(%rdi)
+ ret
+
+
+
+.globl _gcm_gmult_clmul
+.private_extern _gcm_gmult_clmul
+
+.p2align 4
+_gcm_gmult_clmul:
+
+_CET_ENDBR
+L$_gmult_clmul:
+ movdqu (%rdi),%xmm0
+ movdqa L$bswap_mask(%rip),%xmm5
+ movdqu (%rsi),%xmm2
+ movdqu 32(%rsi),%xmm4
+.byte 102,15,56,0,197
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,197
+ movdqu %xmm0,(%rdi)
+ ret
+
+
+.globl _gcm_ghash_clmul
+.private_extern _gcm_ghash_clmul
+
+.p2align 5
+_gcm_ghash_clmul:
+
+
+_CET_ENDBR
+L$_ghash_clmul:
+ movdqa L$bswap_mask(%rip),%xmm10
+
+ movdqu (%rdi),%xmm0
+ movdqu (%rsi),%xmm2
+ movdqu 32(%rsi),%xmm7
+.byte 102,65,15,56,0,194
+
+ subq $0x10,%rcx
+ jz L$odd_tail
+
+ movdqu 16(%rsi),%xmm6
+ cmpq $0x30,%rcx
+ jb L$skip4x
+
+ subq $0x30,%rcx
+ movq $0xA040608020C0E000,%rax
+ movdqu 48(%rsi),%xmm14
+ movdqu 64(%rsi),%xmm15
+
+
+
+
+ movdqu 48(%rdx),%xmm3
+ movdqu 32(%rdx),%xmm11
+.byte 102,65,15,56,0,218
+.byte 102,69,15,56,0,218
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
+
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm11,%xmm12
+.byte 102,68,15,58,68,222,0
+.byte 102,68,15,58,68,238,17
+.byte 102,68,15,58,68,231,16
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+ xorps %xmm12,%xmm4
+
+ movdqu 16(%rdx),%xmm11
+ movdqu 0(%rdx),%xmm8
+.byte 102,69,15,56,0,218
+.byte 102,69,15,56,0,194
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm8,%xmm0
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+.byte 102,69,15,58,68,238,17
+.byte 102,68,15,58,68,231,0
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
+
+ leaq 64(%rdx),%rdx
+ subq $0x40,%rcx
+ jc L$tail4x
+
+ jmp L$mod4_loop
+.p2align 5
+L$mod4_loop:
+.byte 102,65,15,58,68,199,0
+ xorps %xmm12,%xmm4
+ movdqu 48(%rdx),%xmm11
+.byte 102,69,15,56,0,218
+.byte 102,65,15,58,68,207,17
+ xorps %xmm3,%xmm0
+ movdqu 32(%rdx),%xmm3
+ movdqa %xmm11,%xmm13
+.byte 102,68,15,58,68,199,16
+ pshufd $78,%xmm11,%xmm12
+ xorps %xmm5,%xmm1
+ pxor %xmm11,%xmm12
+.byte 102,65,15,56,0,218
+ movups 32(%rsi),%xmm7
+ xorps %xmm4,%xmm8
+.byte 102,68,15,58,68,218,0
+ pshufd $78,%xmm3,%xmm4
+
+ pxor %xmm0,%xmm8
+ movdqa %xmm3,%xmm5
+ pxor %xmm1,%xmm8
+ pxor %xmm3,%xmm4
+ movdqa %xmm8,%xmm9
+.byte 102,68,15,58,68,234,17
+ pslldq $8,%xmm8
+ psrldq $8,%xmm9
+ pxor %xmm8,%xmm0
+ movdqa L$7_mask(%rip),%xmm8
+ pxor %xmm9,%xmm1
+.byte 102,76,15,110,200
+
+ pand %xmm0,%xmm8
+.byte 102,69,15,56,0,200
+ pxor %xmm0,%xmm9
+.byte 102,68,15,58,68,231,0
+ psllq $57,%xmm9
+ movdqa %xmm9,%xmm8
+ pslldq $8,%xmm9
+.byte 102,15,58,68,222,0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ movdqu 0(%rdx),%xmm8
+
+ movdqa %xmm0,%xmm9
+ psrlq $1,%xmm0
+.byte 102,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ movdqu 16(%rdx),%xmm11
+.byte 102,69,15,56,0,218
+.byte 102,15,58,68,231,16
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+.byte 102,69,15,56,0,194
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
+ psrlq $5,%xmm0
+
+ movdqa %xmm11,%xmm13
+ pxor %xmm12,%xmm4
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ movdqa %xmm0,%xmm1
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+
+.byte 102,68,15,58,68,231,0
+ xorps %xmm13,%xmm5
+
+ leaq 64(%rdx),%rdx
+ subq $0x40,%rcx
+ jnc L$mod4_loop
+
+L$tail4x:
+.byte 102,65,15,58,68,199,0
+.byte 102,65,15,58,68,207,17
+.byte 102,68,15,58,68,199,16
+ xorps %xmm12,%xmm4
+ xorps %xmm3,%xmm0
+ xorps %xmm5,%xmm1
+ pxor %xmm0,%xmm1
+ pxor %xmm4,%xmm8
+
+ pxor %xmm1,%xmm8
+ pxor %xmm0,%xmm1
+
+ movdqa %xmm8,%xmm9
+ psrldq $8,%xmm8
+ pslldq $8,%xmm9
+ pxor %xmm8,%xmm1
+ pxor %xmm9,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ addq $0x40,%rcx
+ jz L$done
+ movdqu 32(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz L$odd_tail
+L$skip4x:
+
+
+
+
+
+ movdqu (%rdx),%xmm8
+ movdqu 16(%rdx),%xmm3
+.byte 102,69,15,56,0,194
+.byte 102,65,15,56,0,218
+ pxor %xmm8,%xmm0
+
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
+
+ leaq 32(%rdx),%rdx
+ nop
+ subq $0x20,%rcx
+ jbe L$even_tail
+ nop
+ jmp L$mod_loop
+
+.p2align 5
+L$mod_loop:
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
+
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
+
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ movdqu (%rdx),%xmm9
+ pxor %xmm0,%xmm8
+.byte 102,69,15,56,0,202
+ movdqu 16(%rdx),%xmm3
+
+ pxor %xmm1,%xmm8
+ pxor %xmm9,%xmm1
+ pxor %xmm8,%xmm4
+.byte 102,65,15,56,0,218
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
+ pslldq $8,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm3,%xmm5
+
+ movdqa %xmm0,%xmm9
+ movdqa %xmm0,%xmm8
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm8
+.byte 102,15,58,68,218,0
+ psllq $1,%xmm0
+ pxor %xmm8,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm8
+ pslldq $8,%xmm0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pshufd $78,%xmm5,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm5,%xmm4
+
+ movdqa %xmm0,%xmm9
+ psrlq $1,%xmm0
+.byte 102,15,58,68,234,17
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
+ psrlq $5,%xmm0
+ pxor %xmm9,%xmm0
+ leaq 32(%rdx),%rdx
+ psrlq $1,%xmm0
+.byte 102,15,58,68,231,0
+ pxor %xmm1,%xmm0
+
+ subq $0x20,%rcx
+ ja L$mod_loop
+
+L$even_tail:
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
+
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
+
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ pxor %xmm0,%xmm8
+ pxor %xmm1,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
+ pslldq $8,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ testq %rcx,%rcx
+ jnz L$done
+
+L$odd_tail:
+ movdqu (%rdx),%xmm8
+.byte 102,69,15,56,0,194
+ pxor %xmm8,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,223,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+L$done:
+.byte 102,65,15,56,0,194
+ movdqu %xmm0,(%rdi)
+ ret
+
+
+
+.globl _gcm_init_avx
+.private_extern _gcm_init_avx
+
+.p2align 5
+_gcm_init_avx:
+
+_CET_ENDBR
+ vzeroupper
+
+ vmovdqu (%rsi),%xmm2
+ vpshufd $78,%xmm2,%xmm2
+
+
+ vpshufd $255,%xmm2,%xmm4
+ vpsrlq $63,%xmm2,%xmm3
+ vpsllq $1,%xmm2,%xmm2
+ vpxor %xmm5,%xmm5,%xmm5
+ vpcmpgtd %xmm4,%xmm5,%xmm5
+ vpslldq $8,%xmm3,%xmm3
+ vpor %xmm3,%xmm2,%xmm2
+
+
+ vpand L$0x1c2_polynomial(%rip),%xmm5,%xmm5
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vpunpckhqdq %xmm2,%xmm2,%xmm6
+ vmovdqa %xmm2,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ movq $4,%r10
+ jmp L$init_start_avx
+.p2align 5
+L$init_loop_avx:
+ vpalignr $8,%xmm3,%xmm4,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+L$init_start_avx:
+ vmovdqa %xmm0,%xmm5
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+ vpshufd $78,%xmm5,%xmm3
+ vpshufd $78,%xmm0,%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqu %xmm5,0(%rdi)
+ vpxor %xmm0,%xmm4,%xmm4
+ vmovdqu %xmm0,16(%rdi)
+ leaq 48(%rdi),%rdi
+ subq $1,%r10
+ jnz L$init_loop_avx
+
+ vpalignr $8,%xmm4,%xmm3,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+
+ vzeroupper
+ ret
+
+
+
+.globl _gcm_gmult_avx
+.private_extern _gcm_gmult_avx
+
+.p2align 5
+_gcm_gmult_avx:
+
+_CET_ENDBR
+ jmp L$_gmult_clmul
+
+
+.globl _gcm_ghash_avx
+.private_extern _gcm_ghash_avx
+
+.p2align 5
+_gcm_ghash_avx:
+
+_CET_ENDBR
+ vzeroupper
+
+ vmovdqu (%rdi),%xmm10
+ leaq L$0x1c2_polynomial(%rip),%r10
+ leaq 64(%rsi),%rsi
+ vmovdqu L$bswap_mask(%rip),%xmm13
+ vpshufb %xmm13,%xmm10,%xmm10
+ cmpq $0x80,%rcx
+ jb L$short_avx
+ subq $0x80,%rcx
+
+ vmovdqu 112(%rdx),%xmm14
+ vmovdqu 0-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vmovdqu 32-64(%rsi),%xmm7
+
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm14,%xmm9,%xmm9
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 80(%rdx),%xmm14
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 48-64(%rsi),%xmm6
+ vpxor %xmm14,%xmm9,%xmm9
+ vmovdqu 64(%rdx),%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 48(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 32(%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 16(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu (%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+
+ leaq 128(%rdx),%rdx
+ cmpq $0x80,%rcx
+ jb L$tail_avx
+
+ vpxor %xmm10,%xmm15,%xmm15
+ subq $0x80,%rcx
+ jmp L$oop8x_avx
+
+.p2align 5
+L$oop8x_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 112(%rdx),%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpxor %xmm15,%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
+ vmovdqu 0-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
+ vmovdqu 32-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm3,%xmm10,%xmm10
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vxorps %xmm4,%xmm11,%xmm11
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm5,%xmm12,%xmm12
+ vxorps %xmm15,%xmm8,%xmm8
+
+ vmovdqu 80(%rdx),%xmm14
+ vpxor %xmm10,%xmm12,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm11,%xmm12,%xmm12
+ vpslldq $8,%xmm12,%xmm9
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vpsrldq $8,%xmm12,%xmm12
+ vpxor %xmm9,%xmm10,%xmm10
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vxorps %xmm12,%xmm11,%xmm11
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 64(%rdx),%xmm15
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vxorps %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vmovdqu 48(%rdx),%xmm14
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 32(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+ vxorps %xmm12,%xmm10,%xmm10
+
+ vmovdqu 16(%rdx),%xmm14
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vxorps %xmm11,%xmm12,%xmm12
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu (%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm12,%xmm15,%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+ vpxor %xmm10,%xmm15,%xmm15
+
+ leaq 128(%rdx),%rdx
+ subq $0x80,%rcx
+ jnc L$oop8x_avx
+
+ addq $0x80,%rcx
+ jmp L$tail_no_xor_avx
+
+.p2align 5
+L$short_avx:
+ vmovdqu -16(%rdx,%rcx,1),%xmm14
+ leaq (%rdx,%rcx,1),%rdx
+ vmovdqu 0-64(%rsi),%xmm6
+ vmovdqu 32-64(%rsi),%xmm7
+ vpshufb %xmm13,%xmm14,%xmm15
+
+ vmovdqa %xmm0,%xmm3
+ vmovdqa %xmm1,%xmm4
+ vmovdqa %xmm2,%xmm5
+ subq $0x10,%rcx
+ jz L$tail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -32(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz L$tail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -48(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 80-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz L$tail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -64(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz L$tail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -80(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 96-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 128-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz L$tail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -96(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz L$tail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -112(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 144-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovq 184-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jmp L$tail_avx
+
+.p2align 5
+L$tail_avx:
+ vpxor %xmm10,%xmm15,%xmm15
+L$tail_no_xor_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+
+ vmovdqu (%r10),%xmm12
+
+ vpxor %xmm0,%xmm3,%xmm10
+ vpxor %xmm1,%xmm4,%xmm11
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vpxor %xmm10,%xmm5,%xmm5
+ vpxor %xmm11,%xmm5,%xmm5
+ vpslldq $8,%xmm5,%xmm9
+ vpsrldq $8,%xmm5,%xmm5
+ vpxor %xmm9,%xmm10,%xmm10
+ vpxor %xmm5,%xmm11,%xmm11
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm11,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ cmpq $0,%rcx
+ jne L$short_avx
+
+ vpshufb %xmm13,%xmm10,%xmm10
+ vmovdqu %xmm10,(%rdi)
+ vzeroupper
+ ret
+
+
+
+.section __DATA,__const
+.p2align 6
+L$bswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+L$0x1c2_polynomial:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+L$7_mask:
+.long 7,0,7,0
+.p2align 6
+
+.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align 6
+.text
+#endif
diff --git a/gen/bcm/ghash-x86_64-linux.S b/gen/bcm/ghash-x86_64-linux.S
new file mode 100644
index 0000000..22429a6
--- /dev/null
+++ b/gen/bcm/ghash-x86_64-linux.S
@@ -0,0 +1,1125 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text
+.globl gcm_init_clmul
+.hidden gcm_init_clmul
+.type gcm_init_clmul,@function
+.align 16
+gcm_init_clmul:
+.cfi_startproc
+
+_CET_ENDBR
+.L_init_clmul:
+ movdqu (%rsi),%xmm2
+ pshufd $78,%xmm2,%xmm2
+
+
+ pshufd $255,%xmm2,%xmm4
+ movdqa %xmm2,%xmm3
+ psllq $1,%xmm2
+ pxor %xmm5,%xmm5
+ psrlq $63,%xmm3
+ pcmpgtd %xmm4,%xmm5
+ pslldq $8,%xmm3
+ por %xmm3,%xmm2
+
+
+ pand .L0x1c2_polynomial(%rip),%xmm5
+ pxor %xmm5,%xmm2
+
+
+ pshufd $78,%xmm2,%xmm6
+ movdqa %xmm2,%xmm0
+ pxor %xmm2,%xmm6
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm2,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm2,%xmm3
+ movdqu %xmm2,0(%rdi)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,16(%rdi)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,32(%rdi)
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ movdqa %xmm0,%xmm5
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm5,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm5,%xmm3
+ movdqu %xmm5,48(%rdi)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,64(%rdi)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,80(%rdi)
+ ret
+.cfi_endproc
+
+.size gcm_init_clmul,.-gcm_init_clmul
+.globl gcm_gmult_clmul
+.hidden gcm_gmult_clmul
+.type gcm_gmult_clmul,@function
+.align 16
+gcm_gmult_clmul:
+.cfi_startproc
+_CET_ENDBR
+.L_gmult_clmul:
+ movdqu (%rdi),%xmm0
+ movdqa .Lbswap_mask(%rip),%xmm5
+ movdqu (%rsi),%xmm2
+ movdqu 32(%rsi),%xmm4
+.byte 102,15,56,0,197
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,197
+ movdqu %xmm0,(%rdi)
+ ret
+.cfi_endproc
+.size gcm_gmult_clmul,.-gcm_gmult_clmul
+.globl gcm_ghash_clmul
+.hidden gcm_ghash_clmul
+.type gcm_ghash_clmul,@function
+.align 32
+gcm_ghash_clmul:
+.cfi_startproc
+
+_CET_ENDBR
+.L_ghash_clmul:
+ movdqa .Lbswap_mask(%rip),%xmm10
+
+ movdqu (%rdi),%xmm0
+ movdqu (%rsi),%xmm2
+ movdqu 32(%rsi),%xmm7
+.byte 102,65,15,56,0,194
+
+ subq $0x10,%rcx
+ jz .Lodd_tail
+
+ movdqu 16(%rsi),%xmm6
+ cmpq $0x30,%rcx
+ jb .Lskip4x
+
+ subq $0x30,%rcx
+ movq $0xA040608020C0E000,%rax
+ movdqu 48(%rsi),%xmm14
+ movdqu 64(%rsi),%xmm15
+
+
+
+
+ movdqu 48(%rdx),%xmm3
+ movdqu 32(%rdx),%xmm11
+.byte 102,65,15,56,0,218
+.byte 102,69,15,56,0,218
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
+
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm11,%xmm12
+.byte 102,68,15,58,68,222,0
+.byte 102,68,15,58,68,238,17
+.byte 102,68,15,58,68,231,16
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+ xorps %xmm12,%xmm4
+
+ movdqu 16(%rdx),%xmm11
+ movdqu 0(%rdx),%xmm8
+.byte 102,69,15,56,0,218
+.byte 102,69,15,56,0,194
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm8,%xmm0
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+.byte 102,69,15,58,68,238,17
+.byte 102,68,15,58,68,231,0
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
+
+ leaq 64(%rdx),%rdx
+ subq $0x40,%rcx
+ jc .Ltail4x
+
+ jmp .Lmod4_loop
+.align 32
+.Lmod4_loop:
+.byte 102,65,15,58,68,199,0
+ xorps %xmm12,%xmm4
+ movdqu 48(%rdx),%xmm11
+.byte 102,69,15,56,0,218
+.byte 102,65,15,58,68,207,17
+ xorps %xmm3,%xmm0
+ movdqu 32(%rdx),%xmm3
+ movdqa %xmm11,%xmm13
+.byte 102,68,15,58,68,199,16
+ pshufd $78,%xmm11,%xmm12
+ xorps %xmm5,%xmm1
+ pxor %xmm11,%xmm12
+.byte 102,65,15,56,0,218
+ movups 32(%rsi),%xmm7
+ xorps %xmm4,%xmm8
+.byte 102,68,15,58,68,218,0
+ pshufd $78,%xmm3,%xmm4
+
+ pxor %xmm0,%xmm8
+ movdqa %xmm3,%xmm5
+ pxor %xmm1,%xmm8
+ pxor %xmm3,%xmm4
+ movdqa %xmm8,%xmm9
+.byte 102,68,15,58,68,234,17
+ pslldq $8,%xmm8
+ psrldq $8,%xmm9
+ pxor %xmm8,%xmm0
+ movdqa .L7_mask(%rip),%xmm8
+ pxor %xmm9,%xmm1
+.byte 102,76,15,110,200
+
+ pand %xmm0,%xmm8
+.byte 102,69,15,56,0,200
+ pxor %xmm0,%xmm9
+.byte 102,68,15,58,68,231,0
+ psllq $57,%xmm9
+ movdqa %xmm9,%xmm8
+ pslldq $8,%xmm9
+.byte 102,15,58,68,222,0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ movdqu 0(%rdx),%xmm8
+
+ movdqa %xmm0,%xmm9
+ psrlq $1,%xmm0
+.byte 102,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ movdqu 16(%rdx),%xmm11
+.byte 102,69,15,56,0,218
+.byte 102,15,58,68,231,16
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+.byte 102,69,15,56,0,194
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
+ psrlq $5,%xmm0
+
+ movdqa %xmm11,%xmm13
+ pxor %xmm12,%xmm4
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ movdqa %xmm0,%xmm1
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+
+.byte 102,68,15,58,68,231,0
+ xorps %xmm13,%xmm5
+
+ leaq 64(%rdx),%rdx
+ subq $0x40,%rcx
+ jnc .Lmod4_loop
+
+.Ltail4x:
+.byte 102,65,15,58,68,199,0
+.byte 102,65,15,58,68,207,17
+.byte 102,68,15,58,68,199,16
+ xorps %xmm12,%xmm4
+ xorps %xmm3,%xmm0
+ xorps %xmm5,%xmm1
+ pxor %xmm0,%xmm1
+ pxor %xmm4,%xmm8
+
+ pxor %xmm1,%xmm8
+ pxor %xmm0,%xmm1
+
+ movdqa %xmm8,%xmm9
+ psrldq $8,%xmm8
+ pslldq $8,%xmm9
+ pxor %xmm8,%xmm1
+ pxor %xmm9,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ addq $0x40,%rcx
+ jz .Ldone
+ movdqu 32(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz .Lodd_tail
+.Lskip4x:
+
+
+
+
+
+ movdqu (%rdx),%xmm8
+ movdqu 16(%rdx),%xmm3
+.byte 102,69,15,56,0,194
+.byte 102,65,15,56,0,218
+ pxor %xmm8,%xmm0
+
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
+
+ leaq 32(%rdx),%rdx
+ nop
+ subq $0x20,%rcx
+ jbe .Leven_tail
+ nop
+ jmp .Lmod_loop
+
+.align 32
+.Lmod_loop:
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
+
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
+
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ movdqu (%rdx),%xmm9
+ pxor %xmm0,%xmm8
+.byte 102,69,15,56,0,202
+ movdqu 16(%rdx),%xmm3
+
+ pxor %xmm1,%xmm8
+ pxor %xmm9,%xmm1
+ pxor %xmm8,%xmm4
+.byte 102,65,15,56,0,218
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
+ pslldq $8,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm3,%xmm5
+
+ movdqa %xmm0,%xmm9
+ movdqa %xmm0,%xmm8
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm8
+.byte 102,15,58,68,218,0
+ psllq $1,%xmm0
+ pxor %xmm8,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm8
+ pslldq $8,%xmm0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pshufd $78,%xmm5,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm5,%xmm4
+
+ movdqa %xmm0,%xmm9
+ psrlq $1,%xmm0
+.byte 102,15,58,68,234,17
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
+ psrlq $5,%xmm0
+ pxor %xmm9,%xmm0
+ leaq 32(%rdx),%rdx
+ psrlq $1,%xmm0
+.byte 102,15,58,68,231,0
+ pxor %xmm1,%xmm0
+
+ subq $0x20,%rcx
+ ja .Lmod_loop
+
+.Leven_tail:
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
+
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
+
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ pxor %xmm0,%xmm8
+ pxor %xmm1,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
+ pslldq $8,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ testq %rcx,%rcx
+ jnz .Ldone
+
+.Lodd_tail:
+ movdqu (%rdx),%xmm8
+.byte 102,69,15,56,0,194
+ pxor %xmm8,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,223,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+.Ldone:
+.byte 102,65,15,56,0,194
+ movdqu %xmm0,(%rdi)
+ ret
+.cfi_endproc
+
+.size gcm_ghash_clmul,.-gcm_ghash_clmul
+.globl gcm_init_avx
+.hidden gcm_init_avx
+.type gcm_init_avx,@function
+.align 32
+gcm_init_avx:
+.cfi_startproc
+_CET_ENDBR
+ vzeroupper
+
+ vmovdqu (%rsi),%xmm2
+ vpshufd $78,%xmm2,%xmm2
+
+
+ vpshufd $255,%xmm2,%xmm4
+ vpsrlq $63,%xmm2,%xmm3
+ vpsllq $1,%xmm2,%xmm2
+ vpxor %xmm5,%xmm5,%xmm5
+ vpcmpgtd %xmm4,%xmm5,%xmm5
+ vpslldq $8,%xmm3,%xmm3
+ vpor %xmm3,%xmm2,%xmm2
+
+
+ vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vpunpckhqdq %xmm2,%xmm2,%xmm6
+ vmovdqa %xmm2,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ movq $4,%r10
+ jmp .Linit_start_avx
+.align 32
+.Linit_loop_avx:
+ vpalignr $8,%xmm3,%xmm4,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+.Linit_start_avx:
+ vmovdqa %xmm0,%xmm5
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+ vpshufd $78,%xmm5,%xmm3
+ vpshufd $78,%xmm0,%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqu %xmm5,0(%rdi)
+ vpxor %xmm0,%xmm4,%xmm4
+ vmovdqu %xmm0,16(%rdi)
+ leaq 48(%rdi),%rdi
+ subq $1,%r10
+ jnz .Linit_loop_avx
+
+ vpalignr $8,%xmm4,%xmm3,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+
+ vzeroupper
+ ret
+
+.cfi_endproc
+.size gcm_init_avx,.-gcm_init_avx
+.globl gcm_gmult_avx
+.hidden gcm_gmult_avx
+.type gcm_gmult_avx,@function
+.align 32
+gcm_gmult_avx:
+.cfi_startproc
+_CET_ENDBR
+ jmp .L_gmult_clmul
+.cfi_endproc
+.size gcm_gmult_avx,.-gcm_gmult_avx
+.globl gcm_ghash_avx
+.hidden gcm_ghash_avx
+.type gcm_ghash_avx,@function
+.align 32
+gcm_ghash_avx:
+.cfi_startproc
+_CET_ENDBR
+ vzeroupper
+
+ vmovdqu (%rdi),%xmm10
+ leaq .L0x1c2_polynomial(%rip),%r10
+ leaq 64(%rsi),%rsi
+ vmovdqu .Lbswap_mask(%rip),%xmm13
+ vpshufb %xmm13,%xmm10,%xmm10
+ cmpq $0x80,%rcx
+ jb .Lshort_avx
+ subq $0x80,%rcx
+
+ vmovdqu 112(%rdx),%xmm14
+ vmovdqu 0-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vmovdqu 32-64(%rsi),%xmm7
+
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm14,%xmm9,%xmm9
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 80(%rdx),%xmm14
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 48-64(%rsi),%xmm6
+ vpxor %xmm14,%xmm9,%xmm9
+ vmovdqu 64(%rdx),%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 48(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 32(%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 16(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu (%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+
+ leaq 128(%rdx),%rdx
+ cmpq $0x80,%rcx
+ jb .Ltail_avx
+
+ vpxor %xmm10,%xmm15,%xmm15
+ subq $0x80,%rcx
+ jmp .Loop8x_avx
+
+.align 32
+.Loop8x_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 112(%rdx),%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpxor %xmm15,%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
+ vmovdqu 0-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
+ vmovdqu 32-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm3,%xmm10,%xmm10
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vxorps %xmm4,%xmm11,%xmm11
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm5,%xmm12,%xmm12
+ vxorps %xmm15,%xmm8,%xmm8
+
+ vmovdqu 80(%rdx),%xmm14
+ vpxor %xmm10,%xmm12,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm11,%xmm12,%xmm12
+ vpslldq $8,%xmm12,%xmm9
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vpsrldq $8,%xmm12,%xmm12
+ vpxor %xmm9,%xmm10,%xmm10
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vxorps %xmm12,%xmm11,%xmm11
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 64(%rdx),%xmm15
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vxorps %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vmovdqu 48(%rdx),%xmm14
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 32(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+ vxorps %xmm12,%xmm10,%xmm10
+
+ vmovdqu 16(%rdx),%xmm14
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vxorps %xmm11,%xmm12,%xmm12
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu (%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm12,%xmm15,%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+ vpxor %xmm10,%xmm15,%xmm15
+
+ leaq 128(%rdx),%rdx
+ subq $0x80,%rcx
+ jnc .Loop8x_avx
+
+ addq $0x80,%rcx
+ jmp .Ltail_no_xor_avx
+
+.align 32
+.Lshort_avx:
+ vmovdqu -16(%rdx,%rcx,1),%xmm14
+ leaq (%rdx,%rcx,1),%rdx
+ vmovdqu 0-64(%rsi),%xmm6
+ vmovdqu 32-64(%rsi),%xmm7
+ vpshufb %xmm13,%xmm14,%xmm15
+
+ vmovdqa %xmm0,%xmm3
+ vmovdqa %xmm1,%xmm4
+ vmovdqa %xmm2,%xmm5
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -32(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -48(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 80-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -64(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -80(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 96-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 128-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -96(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -112(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 144-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovq 184-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jmp .Ltail_avx
+
+.align 32
+.Ltail_avx:
+ vpxor %xmm10,%xmm15,%xmm15
+.Ltail_no_xor_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+
+ vmovdqu (%r10),%xmm12
+
+ vpxor %xmm0,%xmm3,%xmm10
+ vpxor %xmm1,%xmm4,%xmm11
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vpxor %xmm10,%xmm5,%xmm5
+ vpxor %xmm11,%xmm5,%xmm5
+ vpslldq $8,%xmm5,%xmm9
+ vpsrldq $8,%xmm5,%xmm5
+ vpxor %xmm9,%xmm10,%xmm10
+ vpxor %xmm5,%xmm11,%xmm11
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm11,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ cmpq $0,%rcx
+ jne .Lshort_avx
+
+ vpshufb %xmm13,%xmm10,%xmm10
+ vmovdqu %xmm10,(%rdi)
+ vzeroupper
+ ret
+.cfi_endproc
+
+.size gcm_ghash_avx,.-gcm_ghash_avx
+.section .rodata
+.align 64
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+.long 7,0,7,0
+.align 64
+
+.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+.text
+#endif
diff --git a/gen/bcm/ghash-x86_64-win.asm b/gen/bcm/ghash-x86_64-win.asm
new file mode 100644
index 0000000..41b189a
--- /dev/null
+++ b/gen/bcm/ghash-x86_64-win.asm
@@ -0,0 +1,1336 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .text code align=64
+
+global gcm_init_clmul
+
+ALIGN 16
+gcm_init_clmul:
+
+$L$SEH_begin_gcm_init_clmul_1:
+_CET_ENDBR
+$L$_init_clmul:
+ sub rsp,0x18
+$L$SEH_prolog_gcm_init_clmul_2:
+ movaps XMMWORD[rsp],xmm6
+$L$SEH_prolog_gcm_init_clmul_3:
+ movdqu xmm2,XMMWORD[rdx]
+ pshufd xmm2,xmm2,78
+
+
+ pshufd xmm4,xmm2,255
+ movdqa xmm3,xmm2
+ psllq xmm2,1
+ pxor xmm5,xmm5
+ psrlq xmm3,63
+ pcmpgtd xmm5,xmm4
+ pslldq xmm3,8
+ por xmm2,xmm3
+
+
+ pand xmm5,XMMWORD[$L$0x1c2_polynomial]
+ pxor xmm2,xmm5
+
+
+ pshufd xmm6,xmm2,78
+ movdqa xmm0,xmm2
+ pxor xmm6,xmm2
+ movdqa xmm1,xmm0
+ pshufd xmm3,xmm0,78
+ pxor xmm3,xmm0
+DB 102,15,58,68,194,0
+DB 102,15,58,68,202,17
+DB 102,15,58,68,222,0
+ pxor xmm3,xmm0
+ pxor xmm3,xmm1
+
+ movdqa xmm4,xmm3
+ psrldq xmm3,8
+ pslldq xmm4,8
+ pxor xmm1,xmm3
+ pxor xmm0,xmm4
+
+ movdqa xmm4,xmm0
+ movdqa xmm3,xmm0
+ psllq xmm0,5
+ pxor xmm3,xmm0
+ psllq xmm0,1
+ pxor xmm0,xmm3
+ psllq xmm0,57
+ movdqa xmm3,xmm0
+ pslldq xmm0,8
+ psrldq xmm3,8
+ pxor xmm0,xmm4
+ pxor xmm1,xmm3
+
+
+ movdqa xmm4,xmm0
+ psrlq xmm0,1
+ pxor xmm1,xmm4
+ pxor xmm4,xmm0
+ psrlq xmm0,5
+ pxor xmm0,xmm4
+ psrlq xmm0,1
+ pxor xmm0,xmm1
+ pshufd xmm3,xmm2,78
+ pshufd xmm4,xmm0,78
+ pxor xmm3,xmm2
+ movdqu XMMWORD[rcx],xmm2
+ pxor xmm4,xmm0
+ movdqu XMMWORD[16+rcx],xmm0
+DB 102,15,58,15,227,8
+ movdqu XMMWORD[32+rcx],xmm4
+ movdqa xmm1,xmm0
+ pshufd xmm3,xmm0,78
+ pxor xmm3,xmm0
+DB 102,15,58,68,194,0
+DB 102,15,58,68,202,17
+DB 102,15,58,68,222,0
+ pxor xmm3,xmm0
+ pxor xmm3,xmm1
+
+ movdqa xmm4,xmm3
+ psrldq xmm3,8
+ pslldq xmm4,8
+ pxor xmm1,xmm3
+ pxor xmm0,xmm4
+
+ movdqa xmm4,xmm0
+ movdqa xmm3,xmm0
+ psllq xmm0,5
+ pxor xmm3,xmm0
+ psllq xmm0,1
+ pxor xmm0,xmm3
+ psllq xmm0,57
+ movdqa xmm3,xmm0
+ pslldq xmm0,8
+ psrldq xmm3,8
+ pxor xmm0,xmm4
+ pxor xmm1,xmm3
+
+
+ movdqa xmm4,xmm0
+ psrlq xmm0,1
+ pxor xmm1,xmm4
+ pxor xmm4,xmm0
+ psrlq xmm0,5
+ pxor xmm0,xmm4
+ psrlq xmm0,1
+ pxor xmm0,xmm1
+ movdqa xmm5,xmm0
+ movdqa xmm1,xmm0
+ pshufd xmm3,xmm0,78
+ pxor xmm3,xmm0
+DB 102,15,58,68,194,0
+DB 102,15,58,68,202,17
+DB 102,15,58,68,222,0
+ pxor xmm3,xmm0
+ pxor xmm3,xmm1
+
+ movdqa xmm4,xmm3
+ psrldq xmm3,8
+ pslldq xmm4,8
+ pxor xmm1,xmm3
+ pxor xmm0,xmm4
+
+ movdqa xmm4,xmm0
+ movdqa xmm3,xmm0
+ psllq xmm0,5
+ pxor xmm3,xmm0
+ psllq xmm0,1
+ pxor xmm0,xmm3
+ psllq xmm0,57
+ movdqa xmm3,xmm0
+ pslldq xmm0,8
+ psrldq xmm3,8
+ pxor xmm0,xmm4
+ pxor xmm1,xmm3
+
+
+ movdqa xmm4,xmm0
+ psrlq xmm0,1
+ pxor xmm1,xmm4
+ pxor xmm4,xmm0
+ psrlq xmm0,5
+ pxor xmm0,xmm4
+ psrlq xmm0,1
+ pxor xmm0,xmm1
+ pshufd xmm3,xmm5,78
+ pshufd xmm4,xmm0,78
+ pxor xmm3,xmm5
+ movdqu XMMWORD[48+rcx],xmm5
+ pxor xmm4,xmm0
+ movdqu XMMWORD[64+rcx],xmm0
+DB 102,15,58,15,227,8
+ movdqu XMMWORD[80+rcx],xmm4
+ movaps xmm6,XMMWORD[rsp]
+ lea rsp,[24+rsp]
+ ret
+
+$L$SEH_end_gcm_init_clmul_4:
+
+global gcm_gmult_clmul
+
+ALIGN 16
+gcm_gmult_clmul:
+
+_CET_ENDBR
+$L$_gmult_clmul:
+ movdqu xmm0,XMMWORD[rcx]
+ movdqa xmm5,XMMWORD[$L$bswap_mask]
+ movdqu xmm2,XMMWORD[rdx]
+ movdqu xmm4,XMMWORD[32+rdx]
+DB 102,15,56,0,197
+ movdqa xmm1,xmm0
+ pshufd xmm3,xmm0,78
+ pxor xmm3,xmm0
+DB 102,15,58,68,194,0
+DB 102,15,58,68,202,17
+DB 102,15,58,68,220,0
+ pxor xmm3,xmm0
+ pxor xmm3,xmm1
+
+ movdqa xmm4,xmm3
+ psrldq xmm3,8
+ pslldq xmm4,8
+ pxor xmm1,xmm3
+ pxor xmm0,xmm4
+
+ movdqa xmm4,xmm0
+ movdqa xmm3,xmm0
+ psllq xmm0,5
+ pxor xmm3,xmm0
+ psllq xmm0,1
+ pxor xmm0,xmm3
+ psllq xmm0,57
+ movdqa xmm3,xmm0
+ pslldq xmm0,8
+ psrldq xmm3,8
+ pxor xmm0,xmm4
+ pxor xmm1,xmm3
+
+
+ movdqa xmm4,xmm0
+ psrlq xmm0,1
+ pxor xmm1,xmm4
+ pxor xmm4,xmm0
+ psrlq xmm0,5
+ pxor xmm0,xmm4
+ psrlq xmm0,1
+ pxor xmm0,xmm1
+DB 102,15,56,0,197
+ movdqu XMMWORD[rcx],xmm0
+ ret
+
+
+global gcm_ghash_clmul
+
+ALIGN 32
+gcm_ghash_clmul:
+
+$L$SEH_begin_gcm_ghash_clmul_1:
+_CET_ENDBR
+$L$_ghash_clmul:
+ lea rax,[((-136))+rsp]
+ lea rsp,[((-32))+rax]
+$L$SEH_prolog_gcm_ghash_clmul_2:
+ movaps XMMWORD[(-32)+rax],xmm6
+$L$SEH_prolog_gcm_ghash_clmul_3:
+ movaps XMMWORD[(-16)+rax],xmm7
+$L$SEH_prolog_gcm_ghash_clmul_4:
+ movaps XMMWORD[rax],xmm8
+$L$SEH_prolog_gcm_ghash_clmul_5:
+ movaps XMMWORD[16+rax],xmm9
+$L$SEH_prolog_gcm_ghash_clmul_6:
+ movaps XMMWORD[32+rax],xmm10
+$L$SEH_prolog_gcm_ghash_clmul_7:
+ movaps XMMWORD[48+rax],xmm11
+$L$SEH_prolog_gcm_ghash_clmul_8:
+ movaps XMMWORD[64+rax],xmm12
+$L$SEH_prolog_gcm_ghash_clmul_9:
+ movaps XMMWORD[80+rax],xmm13
+$L$SEH_prolog_gcm_ghash_clmul_10:
+ movaps XMMWORD[96+rax],xmm14
+$L$SEH_prolog_gcm_ghash_clmul_11:
+ movaps XMMWORD[112+rax],xmm15
+$L$SEH_prolog_gcm_ghash_clmul_12:
+ movdqa xmm10,XMMWORD[$L$bswap_mask]
+
+ movdqu xmm0,XMMWORD[rcx]
+ movdqu xmm2,XMMWORD[rdx]
+ movdqu xmm7,XMMWORD[32+rdx]
+DB 102,65,15,56,0,194
+
+ sub r9,0x10
+ jz NEAR $L$odd_tail
+
+ movdqu xmm6,XMMWORD[16+rdx]
+ cmp r9,0x30
+ jb NEAR $L$skip4x
+
+ sub r9,0x30
+ mov rax,0xA040608020C0E000
+ movdqu xmm14,XMMWORD[48+rdx]
+ movdqu xmm15,XMMWORD[64+rdx]
+
+
+
+
+ movdqu xmm3,XMMWORD[48+r8]
+ movdqu xmm11,XMMWORD[32+r8]
+DB 102,65,15,56,0,218
+DB 102,69,15,56,0,218
+ movdqa xmm5,xmm3
+ pshufd xmm4,xmm3,78
+ pxor xmm4,xmm3
+DB 102,15,58,68,218,0
+DB 102,15,58,68,234,17
+DB 102,15,58,68,231,0
+
+ movdqa xmm13,xmm11
+ pshufd xmm12,xmm11,78
+ pxor xmm12,xmm11
+DB 102,68,15,58,68,222,0
+DB 102,68,15,58,68,238,17
+DB 102,68,15,58,68,231,16
+ xorps xmm3,xmm11
+ xorps xmm5,xmm13
+ movups xmm7,XMMWORD[80+rdx]
+ xorps xmm4,xmm12
+
+ movdqu xmm11,XMMWORD[16+r8]
+ movdqu xmm8,XMMWORD[r8]
+DB 102,69,15,56,0,218
+DB 102,69,15,56,0,194
+ movdqa xmm13,xmm11
+ pshufd xmm12,xmm11,78
+ pxor xmm0,xmm8
+ pxor xmm12,xmm11
+DB 102,69,15,58,68,222,0
+ movdqa xmm1,xmm0
+ pshufd xmm8,xmm0,78
+ pxor xmm8,xmm0
+DB 102,69,15,58,68,238,17
+DB 102,68,15,58,68,231,0
+ xorps xmm3,xmm11
+ xorps xmm5,xmm13
+
+ lea r8,[64+r8]
+ sub r9,0x40
+ jc NEAR $L$tail4x
+
+ jmp NEAR $L$mod4_loop
+ALIGN 32
+$L$mod4_loop:
+DB 102,65,15,58,68,199,0
+ xorps xmm4,xmm12
+ movdqu xmm11,XMMWORD[48+r8]
+DB 102,69,15,56,0,218
+DB 102,65,15,58,68,207,17
+ xorps xmm0,xmm3
+ movdqu xmm3,XMMWORD[32+r8]
+ movdqa xmm13,xmm11
+DB 102,68,15,58,68,199,16
+ pshufd xmm12,xmm11,78
+ xorps xmm1,xmm5
+ pxor xmm12,xmm11
+DB 102,65,15,56,0,218
+ movups xmm7,XMMWORD[32+rdx]
+ xorps xmm8,xmm4
+DB 102,68,15,58,68,218,0
+ pshufd xmm4,xmm3,78
+
+ pxor xmm8,xmm0
+ movdqa xmm5,xmm3
+ pxor xmm8,xmm1
+ pxor xmm4,xmm3
+ movdqa xmm9,xmm8
+DB 102,68,15,58,68,234,17
+ pslldq xmm8,8
+ psrldq xmm9,8
+ pxor xmm0,xmm8
+ movdqa xmm8,XMMWORD[$L$7_mask]
+ pxor xmm1,xmm9
+DB 102,76,15,110,200
+
+ pand xmm8,xmm0
+DB 102,69,15,56,0,200
+ pxor xmm9,xmm0
+DB 102,68,15,58,68,231,0
+ psllq xmm9,57
+ movdqa xmm8,xmm9
+ pslldq xmm9,8
+DB 102,15,58,68,222,0
+ psrldq xmm8,8
+ pxor xmm0,xmm9
+ pxor xmm1,xmm8
+ movdqu xmm8,XMMWORD[r8]
+
+ movdqa xmm9,xmm0
+ psrlq xmm0,1
+DB 102,15,58,68,238,17
+ xorps xmm3,xmm11
+ movdqu xmm11,XMMWORD[16+r8]
+DB 102,69,15,56,0,218
+DB 102,15,58,68,231,16
+ xorps xmm5,xmm13
+ movups xmm7,XMMWORD[80+rdx]
+DB 102,69,15,56,0,194
+ pxor xmm1,xmm9
+ pxor xmm9,xmm0
+ psrlq xmm0,5
+
+ movdqa xmm13,xmm11
+ pxor xmm4,xmm12
+ pshufd xmm12,xmm11,78
+ pxor xmm0,xmm9
+ pxor xmm1,xmm8
+ pxor xmm12,xmm11
+DB 102,69,15,58,68,222,0
+ psrlq xmm0,1
+ pxor xmm0,xmm1
+ movdqa xmm1,xmm0
+DB 102,69,15,58,68,238,17
+ xorps xmm3,xmm11
+ pshufd xmm8,xmm0,78
+ pxor xmm8,xmm0
+
+DB 102,68,15,58,68,231,0
+ xorps xmm5,xmm13
+
+ lea r8,[64+r8]
+ sub r9,0x40
+ jnc NEAR $L$mod4_loop
+
+$L$tail4x:
+DB 102,65,15,58,68,199,0
+DB 102,65,15,58,68,207,17
+DB 102,68,15,58,68,199,16
+ xorps xmm4,xmm12
+ xorps xmm0,xmm3
+ xorps xmm1,xmm5
+ pxor xmm1,xmm0
+ pxor xmm8,xmm4
+
+ pxor xmm8,xmm1
+ pxor xmm1,xmm0
+
+ movdqa xmm9,xmm8
+ psrldq xmm8,8
+ pslldq xmm9,8
+ pxor xmm1,xmm8
+ pxor xmm0,xmm9
+
+ movdqa xmm4,xmm0
+ movdqa xmm3,xmm0
+ psllq xmm0,5
+ pxor xmm3,xmm0
+ psllq xmm0,1
+ pxor xmm0,xmm3
+ psllq xmm0,57
+ movdqa xmm3,xmm0
+ pslldq xmm0,8
+ psrldq xmm3,8
+ pxor xmm0,xmm4
+ pxor xmm1,xmm3
+
+
+ movdqa xmm4,xmm0
+ psrlq xmm0,1
+ pxor xmm1,xmm4
+ pxor xmm4,xmm0
+ psrlq xmm0,5
+ pxor xmm0,xmm4
+ psrlq xmm0,1
+ pxor xmm0,xmm1
+ add r9,0x40
+ jz NEAR $L$done
+ movdqu xmm7,XMMWORD[32+rdx]
+ sub r9,0x10
+ jz NEAR $L$odd_tail
+$L$skip4x:
+
+
+
+
+
+ movdqu xmm8,XMMWORD[r8]
+ movdqu xmm3,XMMWORD[16+r8]
+DB 102,69,15,56,0,194
+DB 102,65,15,56,0,218
+ pxor xmm0,xmm8
+
+ movdqa xmm5,xmm3
+ pshufd xmm4,xmm3,78
+ pxor xmm4,xmm3
+DB 102,15,58,68,218,0
+DB 102,15,58,68,234,17
+DB 102,15,58,68,231,0
+
+ lea r8,[32+r8]
+ nop
+ sub r9,0x20
+ jbe NEAR $L$even_tail
+ nop
+ jmp NEAR $L$mod_loop
+
+ALIGN 32
+$L$mod_loop:
+ movdqa xmm1,xmm0
+ movdqa xmm8,xmm4
+ pshufd xmm4,xmm0,78
+ pxor xmm4,xmm0
+
+DB 102,15,58,68,198,0
+DB 102,15,58,68,206,17
+DB 102,15,58,68,231,16
+
+ pxor xmm0,xmm3
+ pxor xmm1,xmm5
+ movdqu xmm9,XMMWORD[r8]
+ pxor xmm8,xmm0
+DB 102,69,15,56,0,202
+ movdqu xmm3,XMMWORD[16+r8]
+
+ pxor xmm8,xmm1
+ pxor xmm1,xmm9
+ pxor xmm4,xmm8
+DB 102,65,15,56,0,218
+ movdqa xmm8,xmm4
+ psrldq xmm8,8
+ pslldq xmm4,8
+ pxor xmm1,xmm8
+ pxor xmm0,xmm4
+
+ movdqa xmm5,xmm3
+
+ movdqa xmm9,xmm0
+ movdqa xmm8,xmm0
+ psllq xmm0,5
+ pxor xmm8,xmm0
+DB 102,15,58,68,218,0
+ psllq xmm0,1
+ pxor xmm0,xmm8
+ psllq xmm0,57
+ movdqa xmm8,xmm0
+ pslldq xmm0,8
+ psrldq xmm8,8
+ pxor xmm0,xmm9
+ pshufd xmm4,xmm5,78
+ pxor xmm1,xmm8
+ pxor xmm4,xmm5
+
+ movdqa xmm9,xmm0
+ psrlq xmm0,1
+DB 102,15,58,68,234,17
+ pxor xmm1,xmm9
+ pxor xmm9,xmm0
+ psrlq xmm0,5
+ pxor xmm0,xmm9
+ lea r8,[32+r8]
+ psrlq xmm0,1
+DB 102,15,58,68,231,0
+ pxor xmm0,xmm1
+
+ sub r9,0x20
+ ja NEAR $L$mod_loop
+
+$L$even_tail:
+ movdqa xmm1,xmm0
+ movdqa xmm8,xmm4
+ pshufd xmm4,xmm0,78
+ pxor xmm4,xmm0
+
+DB 102,15,58,68,198,0
+DB 102,15,58,68,206,17
+DB 102,15,58,68,231,16
+
+ pxor xmm0,xmm3
+ pxor xmm1,xmm5
+ pxor xmm8,xmm0
+ pxor xmm8,xmm1
+ pxor xmm4,xmm8
+ movdqa xmm8,xmm4
+ psrldq xmm8,8
+ pslldq xmm4,8
+ pxor xmm1,xmm8
+ pxor xmm0,xmm4
+
+ movdqa xmm4,xmm0
+ movdqa xmm3,xmm0
+ psllq xmm0,5
+ pxor xmm3,xmm0
+ psllq xmm0,1
+ pxor xmm0,xmm3
+ psllq xmm0,57
+ movdqa xmm3,xmm0
+ pslldq xmm0,8
+ psrldq xmm3,8
+ pxor xmm0,xmm4
+ pxor xmm1,xmm3
+
+
+ movdqa xmm4,xmm0
+ psrlq xmm0,1
+ pxor xmm1,xmm4
+ pxor xmm4,xmm0
+ psrlq xmm0,5
+ pxor xmm0,xmm4
+ psrlq xmm0,1
+ pxor xmm0,xmm1
+ test r9,r9
+ jnz NEAR $L$done
+
+$L$odd_tail:
+ movdqu xmm8,XMMWORD[r8]
+DB 102,69,15,56,0,194
+ pxor xmm0,xmm8
+ movdqa xmm1,xmm0
+ pshufd xmm3,xmm0,78
+ pxor xmm3,xmm0
+DB 102,15,58,68,194,0
+DB 102,15,58,68,202,17
+DB 102,15,58,68,223,0
+ pxor xmm3,xmm0
+ pxor xmm3,xmm1
+
+ movdqa xmm4,xmm3
+ psrldq xmm3,8
+ pslldq xmm4,8
+ pxor xmm1,xmm3
+ pxor xmm0,xmm4
+
+ movdqa xmm4,xmm0
+ movdqa xmm3,xmm0
+ psllq xmm0,5
+ pxor xmm3,xmm0
+ psllq xmm0,1
+ pxor xmm0,xmm3
+ psllq xmm0,57
+ movdqa xmm3,xmm0
+ pslldq xmm0,8
+ psrldq xmm3,8
+ pxor xmm0,xmm4
+ pxor xmm1,xmm3
+
+
+ movdqa xmm4,xmm0
+ psrlq xmm0,1
+ pxor xmm1,xmm4
+ pxor xmm4,xmm0
+ psrlq xmm0,5
+ pxor xmm0,xmm4
+ psrlq xmm0,1
+ pxor xmm0,xmm1
+$L$done:
+DB 102,65,15,56,0,194
+ movdqu XMMWORD[rcx],xmm0
+ movaps xmm6,XMMWORD[rsp]
+ movaps xmm7,XMMWORD[16+rsp]
+ movaps xmm8,XMMWORD[32+rsp]
+ movaps xmm9,XMMWORD[48+rsp]
+ movaps xmm10,XMMWORD[64+rsp]
+ movaps xmm11,XMMWORD[80+rsp]
+ movaps xmm12,XMMWORD[96+rsp]
+ movaps xmm13,XMMWORD[112+rsp]
+ movaps xmm14,XMMWORD[128+rsp]
+ movaps xmm15,XMMWORD[144+rsp]
+ lea rsp,[168+rsp]
+ ret
+
+$L$SEH_end_gcm_ghash_clmul_13:
+
+global gcm_init_avx
+
+ALIGN 32
+gcm_init_avx:
+
+_CET_ENDBR
+$L$SEH_begin_gcm_init_avx_1:
+ sub rsp,0x18
+$L$SEH_prolog_gcm_init_avx_2:
+ movaps XMMWORD[rsp],xmm6
+$L$SEH_prolog_gcm_init_avx_3:
+ vzeroupper
+
+ vmovdqu xmm2,XMMWORD[rdx]
+ vpshufd xmm2,xmm2,78
+
+
+ vpshufd xmm4,xmm2,255
+ vpsrlq xmm3,xmm2,63
+ vpsllq xmm2,xmm2,1
+ vpxor xmm5,xmm5,xmm5
+ vpcmpgtd xmm5,xmm5,xmm4
+ vpslldq xmm3,xmm3,8
+ vpor xmm2,xmm2,xmm3
+
+
+ vpand xmm5,xmm5,XMMWORD[$L$0x1c2_polynomial]
+ vpxor xmm2,xmm2,xmm5
+
+ vpunpckhqdq xmm6,xmm2,xmm2
+ vmovdqa xmm0,xmm2
+ vpxor xmm6,xmm6,xmm2
+ mov r10,4
+ jmp NEAR $L$init_start_avx
+ALIGN 32
+$L$init_loop_avx:
+ vpalignr xmm5,xmm4,xmm3,8
+ vmovdqu XMMWORD[(-16)+rcx],xmm5
+ vpunpckhqdq xmm3,xmm0,xmm0
+ vpxor xmm3,xmm3,xmm0
+ vpclmulqdq xmm1,xmm0,xmm2,0x11
+ vpclmulqdq xmm0,xmm0,xmm2,0x00
+ vpclmulqdq xmm3,xmm3,xmm6,0x00
+ vpxor xmm4,xmm1,xmm0
+ vpxor xmm3,xmm3,xmm4
+
+ vpslldq xmm4,xmm3,8
+ vpsrldq xmm3,xmm3,8
+ vpxor xmm0,xmm0,xmm4
+ vpxor xmm1,xmm1,xmm3
+ vpsllq xmm3,xmm0,57
+ vpsllq xmm4,xmm0,62
+ vpxor xmm4,xmm4,xmm3
+ vpsllq xmm3,xmm0,63
+ vpxor xmm4,xmm4,xmm3
+ vpslldq xmm3,xmm4,8
+ vpsrldq xmm4,xmm4,8
+ vpxor xmm0,xmm0,xmm3
+ vpxor xmm1,xmm1,xmm4
+
+ vpsrlq xmm4,xmm0,1
+ vpxor xmm1,xmm1,xmm0
+ vpxor xmm0,xmm0,xmm4
+ vpsrlq xmm4,xmm4,5
+ vpxor xmm0,xmm0,xmm4
+ vpsrlq xmm0,xmm0,1
+ vpxor xmm0,xmm0,xmm1
+$L$init_start_avx:
+ vmovdqa xmm5,xmm0
+ vpunpckhqdq xmm3,xmm0,xmm0
+ vpxor xmm3,xmm3,xmm0
+ vpclmulqdq xmm1,xmm0,xmm2,0x11
+ vpclmulqdq xmm0,xmm0,xmm2,0x00
+ vpclmulqdq xmm3,xmm3,xmm6,0x00
+ vpxor xmm4,xmm1,xmm0
+ vpxor xmm3,xmm3,xmm4
+
+ vpslldq xmm4,xmm3,8
+ vpsrldq xmm3,xmm3,8
+ vpxor xmm0,xmm0,xmm4
+ vpxor xmm1,xmm1,xmm3
+ vpsllq xmm3,xmm0,57
+ vpsllq xmm4,xmm0,62
+ vpxor xmm4,xmm4,xmm3
+ vpsllq xmm3,xmm0,63
+ vpxor xmm4,xmm4,xmm3
+ vpslldq xmm3,xmm4,8
+ vpsrldq xmm4,xmm4,8
+ vpxor xmm0,xmm0,xmm3
+ vpxor xmm1,xmm1,xmm4
+
+ vpsrlq xmm4,xmm0,1
+ vpxor xmm1,xmm1,xmm0
+ vpxor xmm0,xmm0,xmm4
+ vpsrlq xmm4,xmm4,5
+ vpxor xmm0,xmm0,xmm4
+ vpsrlq xmm0,xmm0,1
+ vpxor xmm0,xmm0,xmm1
+ vpshufd xmm3,xmm5,78
+ vpshufd xmm4,xmm0,78
+ vpxor xmm3,xmm3,xmm5
+ vmovdqu XMMWORD[rcx],xmm5
+ vpxor xmm4,xmm4,xmm0
+ vmovdqu XMMWORD[16+rcx],xmm0
+ lea rcx,[48+rcx]
+ sub r10,1
+ jnz NEAR $L$init_loop_avx
+
+ vpalignr xmm5,xmm3,xmm4,8
+ vmovdqu XMMWORD[(-16)+rcx],xmm5
+
+ vzeroupper
+ movaps xmm6,XMMWORD[rsp]
+ lea rsp,[24+rsp]
+ ret
+$L$SEH_end_gcm_init_avx_4:
+
+
+global gcm_gmult_avx
+
+ALIGN 32
+gcm_gmult_avx:
+
+_CET_ENDBR
+ jmp NEAR $L$_gmult_clmul
+
+
+global gcm_ghash_avx
+
+ALIGN 32
+gcm_ghash_avx:
+
+_CET_ENDBR
+$L$SEH_begin_gcm_ghash_avx_1:
+ lea rax,[((-136))+rsp]
+ lea rsp,[((-32))+rax]
+$L$SEH_prolog_gcm_ghash_avx_2:
+ movaps XMMWORD[(-32)+rax],xmm6
+$L$SEH_prolog_gcm_ghash_avx_3:
+ movaps XMMWORD[(-16)+rax],xmm7
+$L$SEH_prolog_gcm_ghash_avx_4:
+ movaps XMMWORD[rax],xmm8
+$L$SEH_prolog_gcm_ghash_avx_5:
+ movaps XMMWORD[16+rax],xmm9
+$L$SEH_prolog_gcm_ghash_avx_6:
+ movaps XMMWORD[32+rax],xmm10
+$L$SEH_prolog_gcm_ghash_avx_7:
+ movaps XMMWORD[48+rax],xmm11
+$L$SEH_prolog_gcm_ghash_avx_8:
+ movaps XMMWORD[64+rax],xmm12
+$L$SEH_prolog_gcm_ghash_avx_9:
+ movaps XMMWORD[80+rax],xmm13
+$L$SEH_prolog_gcm_ghash_avx_10:
+ movaps XMMWORD[96+rax],xmm14
+$L$SEH_prolog_gcm_ghash_avx_11:
+ movaps XMMWORD[112+rax],xmm15
+$L$SEH_prolog_gcm_ghash_avx_12:
+ vzeroupper
+
+ vmovdqu xmm10,XMMWORD[rcx]
+ lea r10,[$L$0x1c2_polynomial]
+ lea rdx,[64+rdx]
+ vmovdqu xmm13,XMMWORD[$L$bswap_mask]
+ vpshufb xmm10,xmm10,xmm13
+ cmp r9,0x80
+ jb NEAR $L$short_avx
+ sub r9,0x80
+
+ vmovdqu xmm14,XMMWORD[112+r8]
+ vmovdqu xmm6,XMMWORD[((0-64))+rdx]
+ vpshufb xmm14,xmm14,xmm13
+ vmovdqu xmm7,XMMWORD[((32-64))+rdx]
+
+ vpunpckhqdq xmm9,xmm14,xmm14
+ vmovdqu xmm15,XMMWORD[96+r8]
+ vpclmulqdq xmm0,xmm14,xmm6,0x00
+ vpxor xmm9,xmm9,xmm14
+ vpshufb xmm15,xmm15,xmm13
+ vpclmulqdq xmm1,xmm14,xmm6,0x11
+ vmovdqu xmm6,XMMWORD[((16-64))+rdx]
+ vpunpckhqdq xmm8,xmm15,xmm15
+ vmovdqu xmm14,XMMWORD[80+r8]
+ vpclmulqdq xmm2,xmm9,xmm7,0x00
+ vpxor xmm8,xmm8,xmm15
+
+ vpshufb xmm14,xmm14,xmm13
+ vpclmulqdq xmm3,xmm15,xmm6,0x00
+ vpunpckhqdq xmm9,xmm14,xmm14
+ vpclmulqdq xmm4,xmm15,xmm6,0x11
+ vmovdqu xmm6,XMMWORD[((48-64))+rdx]
+ vpxor xmm9,xmm9,xmm14
+ vmovdqu xmm15,XMMWORD[64+r8]
+ vpclmulqdq xmm5,xmm8,xmm7,0x10
+ vmovdqu xmm7,XMMWORD[((80-64))+rdx]
+
+ vpshufb xmm15,xmm15,xmm13
+ vpxor xmm3,xmm3,xmm0
+ vpclmulqdq xmm0,xmm14,xmm6,0x00
+ vpxor xmm4,xmm4,xmm1
+ vpunpckhqdq xmm8,xmm15,xmm15
+ vpclmulqdq xmm1,xmm14,xmm6,0x11
+ vmovdqu xmm6,XMMWORD[((64-64))+rdx]
+ vpxor xmm5,xmm5,xmm2
+ vpclmulqdq xmm2,xmm9,xmm7,0x00
+ vpxor xmm8,xmm8,xmm15
+
+ vmovdqu xmm14,XMMWORD[48+r8]
+ vpxor xmm0,xmm0,xmm3
+ vpclmulqdq xmm3,xmm15,xmm6,0x00
+ vpxor xmm1,xmm1,xmm4
+ vpshufb xmm14,xmm14,xmm13
+ vpclmulqdq xmm4,xmm15,xmm6,0x11
+ vmovdqu xmm6,XMMWORD[((96-64))+rdx]
+ vpxor xmm2,xmm2,xmm5
+ vpunpckhqdq xmm9,xmm14,xmm14
+ vpclmulqdq xmm5,xmm8,xmm7,0x10
+ vmovdqu xmm7,XMMWORD[((128-64))+rdx]
+ vpxor xmm9,xmm9,xmm14
+
+ vmovdqu xmm15,XMMWORD[32+r8]
+ vpxor xmm3,xmm3,xmm0
+ vpclmulqdq xmm0,xmm14,xmm6,0x00
+ vpxor xmm4,xmm4,xmm1
+ vpshufb xmm15,xmm15,xmm13
+ vpclmulqdq xmm1,xmm14,xmm6,0x11
+ vmovdqu xmm6,XMMWORD[((112-64))+rdx]
+ vpxor xmm5,xmm5,xmm2
+ vpunpckhqdq xmm8,xmm15,xmm15
+ vpclmulqdq xmm2,xmm9,xmm7,0x00
+ vpxor xmm8,xmm8,xmm15
+
+ vmovdqu xmm14,XMMWORD[16+r8]
+ vpxor xmm0,xmm0,xmm3
+ vpclmulqdq xmm3,xmm15,xmm6,0x00
+ vpxor xmm1,xmm1,xmm4
+ vpshufb xmm14,xmm14,xmm13
+ vpclmulqdq xmm4,xmm15,xmm6,0x11
+ vmovdqu xmm6,XMMWORD[((144-64))+rdx]
+ vpxor xmm2,xmm2,xmm5
+ vpunpckhqdq xmm9,xmm14,xmm14
+ vpclmulqdq xmm5,xmm8,xmm7,0x10
+ vmovdqu xmm7,XMMWORD[((176-64))+rdx]
+ vpxor xmm9,xmm9,xmm14
+
+ vmovdqu xmm15,XMMWORD[r8]
+ vpxor xmm3,xmm3,xmm0
+ vpclmulqdq xmm0,xmm14,xmm6,0x00
+ vpxor xmm4,xmm4,xmm1
+ vpshufb xmm15,xmm15,xmm13
+ vpclmulqdq xmm1,xmm14,xmm6,0x11
+ vmovdqu xmm6,XMMWORD[((160-64))+rdx]
+ vpxor xmm5,xmm5,xmm2
+ vpclmulqdq xmm2,xmm9,xmm7,0x10
+
+ lea r8,[128+r8]
+ cmp r9,0x80
+ jb NEAR $L$tail_avx
+
+ vpxor xmm15,xmm15,xmm10
+ sub r9,0x80
+ jmp NEAR $L$oop8x_avx
+
+ALIGN 32
+$L$oop8x_avx:
+ vpunpckhqdq xmm8,xmm15,xmm15
+ vmovdqu xmm14,XMMWORD[112+r8]
+ vpxor xmm3,xmm3,xmm0
+ vpxor xmm8,xmm8,xmm15
+ vpclmulqdq xmm10,xmm15,xmm6,0x00
+ vpshufb xmm14,xmm14,xmm13
+ vpxor xmm4,xmm4,xmm1
+ vpclmulqdq xmm11,xmm15,xmm6,0x11
+ vmovdqu xmm6,XMMWORD[((0-64))+rdx]
+ vpunpckhqdq xmm9,xmm14,xmm14
+ vpxor xmm5,xmm5,xmm2
+ vpclmulqdq xmm12,xmm8,xmm7,0x00
+ vmovdqu xmm7,XMMWORD[((32-64))+rdx]
+ vpxor xmm9,xmm9,xmm14
+
+ vmovdqu xmm15,XMMWORD[96+r8]
+ vpclmulqdq xmm0,xmm14,xmm6,0x00
+ vpxor xmm10,xmm10,xmm3
+ vpshufb xmm15,xmm15,xmm13
+ vpclmulqdq xmm1,xmm14,xmm6,0x11
+ vxorps xmm11,xmm11,xmm4
+ vmovdqu xmm6,XMMWORD[((16-64))+rdx]
+ vpunpckhqdq xmm8,xmm15,xmm15
+ vpclmulqdq xmm2,xmm9,xmm7,0x00
+ vpxor xmm12,xmm12,xmm5
+ vxorps xmm8,xmm8,xmm15
+
+ vmovdqu xmm14,XMMWORD[80+r8]
+ vpxor xmm12,xmm12,xmm10
+ vpclmulqdq xmm3,xmm15,xmm6,0x00
+ vpxor xmm12,xmm12,xmm11
+ vpslldq xmm9,xmm12,8
+ vpxor xmm3,xmm3,xmm0
+ vpclmulqdq xmm4,xmm15,xmm6,0x11
+ vpsrldq xmm12,xmm12,8
+ vpxor xmm10,xmm10,xmm9
+ vmovdqu xmm6,XMMWORD[((48-64))+rdx]
+ vpshufb xmm14,xmm14,xmm13
+ vxorps xmm11,xmm11,xmm12
+ vpxor xmm4,xmm4,xmm1
+ vpunpckhqdq xmm9,xmm14,xmm14
+ vpclmulqdq xmm5,xmm8,xmm7,0x10
+ vmovdqu xmm7,XMMWORD[((80-64))+rdx]
+ vpxor xmm9,xmm9,xmm14
+ vpxor xmm5,xmm5,xmm2
+
+ vmovdqu xmm15,XMMWORD[64+r8]
+ vpalignr xmm12,xmm10,xmm10,8
+ vpclmulqdq xmm0,xmm14,xmm6,0x00
+ vpshufb xmm15,xmm15,xmm13
+ vpxor xmm0,xmm0,xmm3
+ vpclmulqdq xmm1,xmm14,xmm6,0x11
+ vmovdqu xmm6,XMMWORD[((64-64))+rdx]
+ vpunpckhqdq xmm8,xmm15,xmm15
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm2,xmm9,xmm7,0x00
+ vxorps xmm8,xmm8,xmm15
+ vpxor xmm2,xmm2,xmm5
+
+ vmovdqu xmm14,XMMWORD[48+r8]
+ vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10
+ vpclmulqdq xmm3,xmm15,xmm6,0x00
+ vpshufb xmm14,xmm14,xmm13
+ vpxor xmm3,xmm3,xmm0
+ vpclmulqdq xmm4,xmm15,xmm6,0x11
+ vmovdqu xmm6,XMMWORD[((96-64))+rdx]
+ vpunpckhqdq xmm9,xmm14,xmm14
+ vpxor xmm4,xmm4,xmm1
+ vpclmulqdq xmm5,xmm8,xmm7,0x10
+ vmovdqu xmm7,XMMWORD[((128-64))+rdx]
+ vpxor xmm9,xmm9,xmm14
+ vpxor xmm5,xmm5,xmm2
+
+ vmovdqu xmm15,XMMWORD[32+r8]
+ vpclmulqdq xmm0,xmm14,xmm6,0x00
+ vpshufb xmm15,xmm15,xmm13
+ vpxor xmm0,xmm0,xmm3
+ vpclmulqdq xmm1,xmm14,xmm6,0x11
+ vmovdqu xmm6,XMMWORD[((112-64))+rdx]
+ vpunpckhqdq xmm8,xmm15,xmm15
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm2,xmm9,xmm7,0x00
+ vpxor xmm8,xmm8,xmm15
+ vpxor xmm2,xmm2,xmm5
+ vxorps xmm10,xmm10,xmm12
+
+ vmovdqu xmm14,XMMWORD[16+r8]
+ vpalignr xmm12,xmm10,xmm10,8
+ vpclmulqdq xmm3,xmm15,xmm6,0x00
+ vpshufb xmm14,xmm14,xmm13
+ vpxor xmm3,xmm3,xmm0
+ vpclmulqdq xmm4,xmm15,xmm6,0x11
+ vmovdqu xmm6,XMMWORD[((144-64))+rdx]
+ vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10
+ vxorps xmm12,xmm12,xmm11
+ vpunpckhqdq xmm9,xmm14,xmm14
+ vpxor xmm4,xmm4,xmm1
+ vpclmulqdq xmm5,xmm8,xmm7,0x10
+ vmovdqu xmm7,XMMWORD[((176-64))+rdx]
+ vpxor xmm9,xmm9,xmm14
+ vpxor xmm5,xmm5,xmm2
+
+ vmovdqu xmm15,XMMWORD[r8]
+ vpclmulqdq xmm0,xmm14,xmm6,0x00
+ vpshufb xmm15,xmm15,xmm13
+ vpclmulqdq xmm1,xmm14,xmm6,0x11
+ vmovdqu xmm6,XMMWORD[((160-64))+rdx]
+ vpxor xmm15,xmm15,xmm12
+ vpclmulqdq xmm2,xmm9,xmm7,0x10
+ vpxor xmm15,xmm15,xmm10
+
+ lea r8,[128+r8]
+ sub r9,0x80
+ jnc NEAR $L$oop8x_avx
+
+ add r9,0x80
+ jmp NEAR $L$tail_no_xor_avx
+
+ALIGN 32
+$L$short_avx:
+ vmovdqu xmm14,XMMWORD[((-16))+r9*1+r8]
+ lea r8,[r9*1+r8]
+ vmovdqu xmm6,XMMWORD[((0-64))+rdx]
+ vmovdqu xmm7,XMMWORD[((32-64))+rdx]
+ vpshufb xmm15,xmm14,xmm13
+
+ vmovdqa xmm3,xmm0
+ vmovdqa xmm4,xmm1
+ vmovdqa xmm5,xmm2
+ sub r9,0x10
+ jz NEAR $L$tail_avx
+
+ vpunpckhqdq xmm8,xmm15,xmm15
+ vpxor xmm3,xmm3,xmm0
+ vpclmulqdq xmm0,xmm15,xmm6,0x00
+ vpxor xmm8,xmm8,xmm15
+ vmovdqu xmm14,XMMWORD[((-32))+r8]
+ vpxor xmm4,xmm4,xmm1
+ vpclmulqdq xmm1,xmm15,xmm6,0x11
+ vmovdqu xmm6,XMMWORD[((16-64))+rdx]
+ vpshufb xmm15,xmm14,xmm13
+ vpxor xmm5,xmm5,xmm2
+ vpclmulqdq xmm2,xmm8,xmm7,0x00
+ vpsrldq xmm7,xmm7,8
+ sub r9,0x10
+ jz NEAR $L$tail_avx
+
+ vpunpckhqdq xmm8,xmm15,xmm15
+ vpxor xmm3,xmm3,xmm0
+ vpclmulqdq xmm0,xmm15,xmm6,0x00
+ vpxor xmm8,xmm8,xmm15
+ vmovdqu xmm14,XMMWORD[((-48))+r8]
+ vpxor xmm4,xmm4,xmm1
+ vpclmulqdq xmm1,xmm15,xmm6,0x11
+ vmovdqu xmm6,XMMWORD[((48-64))+rdx]
+ vpshufb xmm15,xmm14,xmm13
+ vpxor xmm5,xmm5,xmm2
+ vpclmulqdq xmm2,xmm8,xmm7,0x00
+ vmovdqu xmm7,XMMWORD[((80-64))+rdx]
+ sub r9,0x10
+ jz NEAR $L$tail_avx
+
+ vpunpckhqdq xmm8,xmm15,xmm15
+ vpxor xmm3,xmm3,xmm0
+ vpclmulqdq xmm0,xmm15,xmm6,0x00
+ vpxor xmm8,xmm8,xmm15
+ vmovdqu xmm14,XMMWORD[((-64))+r8]
+ vpxor xmm4,xmm4,xmm1
+ vpclmulqdq xmm1,xmm15,xmm6,0x11
+ vmovdqu xmm6,XMMWORD[((64-64))+rdx]
+ vpshufb xmm15,xmm14,xmm13
+ vpxor xmm5,xmm5,xmm2
+ vpclmulqdq xmm2,xmm8,xmm7,0x00
+ vpsrldq xmm7,xmm7,8
+ sub r9,0x10
+ jz NEAR $L$tail_avx
+
+ vpunpckhqdq xmm8,xmm15,xmm15
+ vpxor xmm3,xmm3,xmm0
+ vpclmulqdq xmm0,xmm15,xmm6,0x00
+ vpxor xmm8,xmm8,xmm15
+ vmovdqu xmm14,XMMWORD[((-80))+r8]
+ vpxor xmm4,xmm4,xmm1
+ vpclmulqdq xmm1,xmm15,xmm6,0x11
+ vmovdqu xmm6,XMMWORD[((96-64))+rdx]
+ vpshufb xmm15,xmm14,xmm13
+ vpxor xmm5,xmm5,xmm2
+ vpclmulqdq xmm2,xmm8,xmm7,0x00
+ vmovdqu xmm7,XMMWORD[((128-64))+rdx]
+ sub r9,0x10
+ jz NEAR $L$tail_avx
+
+ vpunpckhqdq xmm8,xmm15,xmm15
+ vpxor xmm3,xmm3,xmm0
+ vpclmulqdq xmm0,xmm15,xmm6,0x00
+ vpxor xmm8,xmm8,xmm15
+ vmovdqu xmm14,XMMWORD[((-96))+r8]
+ vpxor xmm4,xmm4,xmm1
+ vpclmulqdq xmm1,xmm15,xmm6,0x11
+ vmovdqu xmm6,XMMWORD[((112-64))+rdx]
+ vpshufb xmm15,xmm14,xmm13
+ vpxor xmm5,xmm5,xmm2
+ vpclmulqdq xmm2,xmm8,xmm7,0x00
+ vpsrldq xmm7,xmm7,8
+ sub r9,0x10
+ jz NEAR $L$tail_avx
+
+ vpunpckhqdq xmm8,xmm15,xmm15
+ vpxor xmm3,xmm3,xmm0
+ vpclmulqdq xmm0,xmm15,xmm6,0x00
+ vpxor xmm8,xmm8,xmm15
+ vmovdqu xmm14,XMMWORD[((-112))+r8]
+ vpxor xmm4,xmm4,xmm1
+ vpclmulqdq xmm1,xmm15,xmm6,0x11
+ vmovdqu xmm6,XMMWORD[((144-64))+rdx]
+ vpshufb xmm15,xmm14,xmm13
+ vpxor xmm5,xmm5,xmm2
+ vpclmulqdq xmm2,xmm8,xmm7,0x00
+ vmovq xmm7,QWORD[((184-64))+rdx]
+ sub r9,0x10
+ jmp NEAR $L$tail_avx
+
+ALIGN 32
+$L$tail_avx:
+ vpxor xmm15,xmm15,xmm10
+$L$tail_no_xor_avx:
+ vpunpckhqdq xmm8,xmm15,xmm15
+ vpxor xmm3,xmm3,xmm0
+ vpclmulqdq xmm0,xmm15,xmm6,0x00
+ vpxor xmm8,xmm8,xmm15
+ vpxor xmm4,xmm4,xmm1
+ vpclmulqdq xmm1,xmm15,xmm6,0x11
+ vpxor xmm5,xmm5,xmm2
+ vpclmulqdq xmm2,xmm8,xmm7,0x00
+
+ vmovdqu xmm12,XMMWORD[r10]
+
+ vpxor xmm10,xmm3,xmm0
+ vpxor xmm11,xmm4,xmm1
+ vpxor xmm5,xmm5,xmm2
+
+ vpxor xmm5,xmm5,xmm10
+ vpxor xmm5,xmm5,xmm11
+ vpslldq xmm9,xmm5,8
+ vpsrldq xmm5,xmm5,8
+ vpxor xmm10,xmm10,xmm9
+ vpxor xmm11,xmm11,xmm5
+
+ vpclmulqdq xmm9,xmm10,xmm12,0x10
+ vpalignr xmm10,xmm10,xmm10,8
+ vpxor xmm10,xmm10,xmm9
+
+ vpclmulqdq xmm9,xmm10,xmm12,0x10
+ vpalignr xmm10,xmm10,xmm10,8
+ vpxor xmm10,xmm10,xmm11
+ vpxor xmm10,xmm10,xmm9
+
+ cmp r9,0
+ jne NEAR $L$short_avx
+
+ vpshufb xmm10,xmm10,xmm13
+ vmovdqu XMMWORD[rcx],xmm10
+ vzeroupper
+ movaps xmm6,XMMWORD[rsp]
+ movaps xmm7,XMMWORD[16+rsp]
+ movaps xmm8,XMMWORD[32+rsp]
+ movaps xmm9,XMMWORD[48+rsp]
+ movaps xmm10,XMMWORD[64+rsp]
+ movaps xmm11,XMMWORD[80+rsp]
+ movaps xmm12,XMMWORD[96+rsp]
+ movaps xmm13,XMMWORD[112+rsp]
+ movaps xmm14,XMMWORD[128+rsp]
+ movaps xmm15,XMMWORD[144+rsp]
+ lea rsp,[168+rsp]
+ ret
+
+$L$SEH_end_gcm_ghash_avx_13:
+
+section .rdata rdata align=8
+ALIGN 64
+$L$bswap_mask:
+ DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+$L$0x1c2_polynomial:
+ DB 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+$L$7_mask:
+ DD 7,0,7,0
+ALIGN 64
+
+ DB 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52
+ DB 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
+ DB 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
+ DB 114,103,62,0
+ALIGN 64
+section .text
+
+section .pdata rdata align=4
+ALIGN 4
+ DD $L$SEH_begin_gcm_init_clmul_1 wrt ..imagebase
+ DD $L$SEH_end_gcm_init_clmul_4 wrt ..imagebase
+ DD $L$SEH_info_gcm_init_clmul_0 wrt ..imagebase
+
+ DD $L$SEH_begin_gcm_ghash_clmul_1 wrt ..imagebase
+ DD $L$SEH_end_gcm_ghash_clmul_13 wrt ..imagebase
+ DD $L$SEH_info_gcm_ghash_clmul_0 wrt ..imagebase
+
+ DD $L$SEH_begin_gcm_init_avx_1 wrt ..imagebase
+ DD $L$SEH_end_gcm_init_avx_4 wrt ..imagebase
+ DD $L$SEH_info_gcm_init_avx_0 wrt ..imagebase
+
+ DD $L$SEH_begin_gcm_ghash_avx_1 wrt ..imagebase
+ DD $L$SEH_end_gcm_ghash_avx_13 wrt ..imagebase
+ DD $L$SEH_info_gcm_ghash_avx_0 wrt ..imagebase
+
+
+section .xdata rdata align=8
+ALIGN 4
+$L$SEH_info_gcm_init_clmul_0:
+ DB 1
+ DB $L$SEH_prolog_gcm_init_clmul_3-$L$SEH_begin_gcm_init_clmul_1
+ DB 3
+ DB 0
+ DB $L$SEH_prolog_gcm_init_clmul_3-$L$SEH_begin_gcm_init_clmul_1
+ DB 104
+ DW 0
+ DB $L$SEH_prolog_gcm_init_clmul_2-$L$SEH_begin_gcm_init_clmul_1
+ DB 34
+
+$L$SEH_info_gcm_ghash_clmul_0:
+ DB 1
+ DB $L$SEH_prolog_gcm_ghash_clmul_12-$L$SEH_begin_gcm_ghash_clmul_1
+ DB 22
+ DB 0
+ DB $L$SEH_prolog_gcm_ghash_clmul_12-$L$SEH_begin_gcm_ghash_clmul_1
+ DB 248
+ DW 9
+ DB $L$SEH_prolog_gcm_ghash_clmul_11-$L$SEH_begin_gcm_ghash_clmul_1
+ DB 232
+ DW 8
+ DB $L$SEH_prolog_gcm_ghash_clmul_10-$L$SEH_begin_gcm_ghash_clmul_1
+ DB 216
+ DW 7
+ DB $L$SEH_prolog_gcm_ghash_clmul_9-$L$SEH_begin_gcm_ghash_clmul_1
+ DB 200
+ DW 6
+ DB $L$SEH_prolog_gcm_ghash_clmul_8-$L$SEH_begin_gcm_ghash_clmul_1
+ DB 184
+ DW 5
+ DB $L$SEH_prolog_gcm_ghash_clmul_7-$L$SEH_begin_gcm_ghash_clmul_1
+ DB 168
+ DW 4
+ DB $L$SEH_prolog_gcm_ghash_clmul_6-$L$SEH_begin_gcm_ghash_clmul_1
+ DB 152
+ DW 3
+ DB $L$SEH_prolog_gcm_ghash_clmul_5-$L$SEH_begin_gcm_ghash_clmul_1
+ DB 136
+ DW 2
+ DB $L$SEH_prolog_gcm_ghash_clmul_4-$L$SEH_begin_gcm_ghash_clmul_1
+ DB 120
+ DW 1
+ DB $L$SEH_prolog_gcm_ghash_clmul_3-$L$SEH_begin_gcm_ghash_clmul_1
+ DB 104
+ DW 0
+ DB $L$SEH_prolog_gcm_ghash_clmul_2-$L$SEH_begin_gcm_ghash_clmul_1
+ DB 1
+ DW 21
+
+$L$SEH_info_gcm_init_avx_0:
+ DB 1
+ DB $L$SEH_prolog_gcm_init_avx_3-$L$SEH_begin_gcm_init_avx_1
+ DB 3
+ DB 0
+ DB $L$SEH_prolog_gcm_init_avx_3-$L$SEH_begin_gcm_init_avx_1
+ DB 104
+ DW 0
+ DB $L$SEH_prolog_gcm_init_avx_2-$L$SEH_begin_gcm_init_avx_1
+ DB 34
+
+$L$SEH_info_gcm_ghash_avx_0:
+ DB 1
+ DB $L$SEH_prolog_gcm_ghash_avx_12-$L$SEH_begin_gcm_ghash_avx_1
+ DB 22
+ DB 0
+ DB $L$SEH_prolog_gcm_ghash_avx_12-$L$SEH_begin_gcm_ghash_avx_1
+ DB 248
+ DW 9
+ DB $L$SEH_prolog_gcm_ghash_avx_11-$L$SEH_begin_gcm_ghash_avx_1
+ DB 232
+ DW 8
+ DB $L$SEH_prolog_gcm_ghash_avx_10-$L$SEH_begin_gcm_ghash_avx_1
+ DB 216
+ DW 7
+ DB $L$SEH_prolog_gcm_ghash_avx_9-$L$SEH_begin_gcm_ghash_avx_1
+ DB 200
+ DW 6
+ DB $L$SEH_prolog_gcm_ghash_avx_8-$L$SEH_begin_gcm_ghash_avx_1
+ DB 184
+ DW 5
+ DB $L$SEH_prolog_gcm_ghash_avx_7-$L$SEH_begin_gcm_ghash_avx_1
+ DB 168
+ DW 4
+ DB $L$SEH_prolog_gcm_ghash_avx_6-$L$SEH_begin_gcm_ghash_avx_1
+ DB 152
+ DW 3
+ DB $L$SEH_prolog_gcm_ghash_avx_5-$L$SEH_begin_gcm_ghash_avx_1
+ DB 136
+ DW 2
+ DB $L$SEH_prolog_gcm_ghash_avx_4-$L$SEH_begin_gcm_ghash_avx_1
+ DB 120
+ DW 1
+ DB $L$SEH_prolog_gcm_ghash_avx_3-$L$SEH_begin_gcm_ghash_avx_1
+ DB 104
+ DW 0
+ DB $L$SEH_prolog_gcm_ghash_avx_2-$L$SEH_begin_gcm_ghash_avx_1
+ DB 1
+ DW 21
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/ghashv8-armv7-linux.S b/gen/bcm/ghashv8-armv7-linux.S
new file mode 100644
index 0000000..fab4c12
--- /dev/null
+++ b/gen/bcm/ghashv8-armv7-linux.S
@@ -0,0 +1,246 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.fpu neon
+.code 32
+#undef __thumb2__
+.globl gcm_init_v8
+.hidden gcm_init_v8
+.type gcm_init_v8,%function
+.align 4
+gcm_init_v8:
+ AARCH64_VALID_CALL_TARGET
+ vld1.64 {q9},[r1] @ load input H
+ vmov.i8 q11,#0xe1
+ vshl.i64 q11,q11,#57 @ 0xc2.0
+ vext.8 q3,q9,q9,#8
+ vshr.u64 q10,q11,#63
+ vdup.32 q9,d18[1]
+ vext.8 q8,q10,q11,#8 @ t0=0xc2....01
+ vshr.u64 q10,q3,#63
+ vshr.s32 q9,q9,#31 @ broadcast carry bit
+ vand q10,q10,q8
+ vshl.i64 q3,q3,#1
+ vext.8 q10,q10,q10,#8
+ vand q8,q8,q9
+ vorr q3,q3,q10 @ H<<<=1
+ veor q12,q3,q8 @ twisted H
+ vst1.64 {q12},[r0]! @ store Htable[0]
+
+ @ calculate H^2
+ vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing
+.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12
+ veor q8,q8,q12
+.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12
+.byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8
+
+ vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
+ veor q10,q0,q2
+ veor q1,q1,q9
+ veor q1,q1,q10
+.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase
+
+ vmov d4,d3 @ Xh|Xm - 256-bit result
+ vmov d3,d0 @ Xm is rotated Xl
+ veor q0,q1,q10
+
+ vext.8 q10,q0,q0,#8 @ 2nd phase
+.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
+ veor q10,q10,q2
+ veor q14,q0,q10
+
+ vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing
+ veor q9,q9,q14
+ vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed
+ vst1.64 {q13,q14},[r0]! @ store Htable[1..2]
+ bx lr
+.size gcm_init_v8,.-gcm_init_v8
+.globl gcm_gmult_v8
+.hidden gcm_gmult_v8
+.type gcm_gmult_v8,%function
+.align 4
+gcm_gmult_v8:
+ AARCH64_VALID_CALL_TARGET
+ vld1.64 {q9},[r0] @ load Xi
+ vmov.i8 q11,#0xe1
+ vld1.64 {q12,q13},[r1] @ load twisted H, ...
+ vshl.u64 q11,q11,#57
+#ifndef __ARMEB__
+ vrev64.8 q9,q9
+#endif
+ vext.8 q3,q9,q9,#8
+
+.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
+ veor q9,q9,q3 @ Karatsuba pre-processing
+.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
+.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
+ veor q10,q0,q2
+ veor q1,q1,q9
+ veor q1,q1,q10
+.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
+
+ vmov d4,d3 @ Xh|Xm - 256-bit result
+ vmov d3,d0 @ Xm is rotated Xl
+ veor q0,q1,q10
+
+ vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
+.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
+ veor q10,q10,q2
+ veor q0,q0,q10
+
+#ifndef __ARMEB__
+ vrev64.8 q0,q0
+#endif
+ vext.8 q0,q0,q0,#8
+ vst1.64 {q0},[r0] @ write out Xi
+
+ bx lr
+.size gcm_gmult_v8,.-gcm_gmult_v8
+.globl gcm_ghash_v8
+.hidden gcm_ghash_v8
+.type gcm_ghash_v8,%function
+.align 4
+gcm_ghash_v8:
+ AARCH64_VALID_CALL_TARGET
+ vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so
+ vld1.64 {q0},[r0] @ load [rotated] Xi
+ @ "[rotated]" means that
+ @ loaded value would have
+ @ to be rotated in order to
+ @ make it appear as in
+ @ algorithm specification
+ subs r3,r3,#32 @ see if r3 is 32 or larger
+ mov r12,#16 @ r12 is used as post-
+ @ increment for input pointer;
+ @ as loop is modulo-scheduled
+ @ r12 is zeroed just in time
+ @ to preclude overstepping
+ @ inp[len], which means that
+ @ last block[s] are actually
+ @ loaded twice, but last
+ @ copy is not processed
+ vld1.64 {q12,q13},[r1]! @ load twisted H, ..., H^2
+ vmov.i8 q11,#0xe1
+ vld1.64 {q14},[r1]
+ moveq r12,#0 @ is it time to zero r12?
+ vext.8 q0,q0,q0,#8 @ rotate Xi
+ vld1.64 {q8},[r2]! @ load [rotated] I[0]
+ vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant
+#ifndef __ARMEB__
+ vrev64.8 q8,q8
+ vrev64.8 q0,q0
+#endif
+ vext.8 q3,q8,q8,#8 @ rotate I[0]
+ blo .Lodd_tail_v8 @ r3 was less than 32
+ vld1.64 {q9},[r2],r12 @ load [rotated] I[1]
+#ifndef __ARMEB__
+ vrev64.8 q9,q9
+#endif
+ vext.8 q7,q9,q9,#8
+ veor q3,q3,q0 @ I[i]^=Xi
+.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
+ veor q9,q9,q7 @ Karatsuba pre-processing
+.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
+ b .Loop_mod2x_v8
+
+.align 4
+.Loop_mod2x_v8:
+ vext.8 q10,q3,q3,#8
+ subs r3,r3,#32 @ is there more data?
+.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo
+ movlo r12,#0 @ is it time to zero r12?
+
+.byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9
+ veor q10,q10,q3 @ Karatsuba pre-processing
+.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi
+ veor q0,q0,q4 @ accumulate
+.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+ vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2]
+
+ veor q2,q2,q6
+ moveq r12,#0 @ is it time to zero r12?
+ veor q1,q1,q5
+
+ vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
+ veor q10,q0,q2
+ veor q1,q1,q9
+ vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3]
+#ifndef __ARMEB__
+ vrev64.8 q8,q8
+#endif
+ veor q1,q1,q10
+.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
+
+#ifndef __ARMEB__
+ vrev64.8 q9,q9
+#endif
+ vmov d4,d3 @ Xh|Xm - 256-bit result
+ vmov d3,d0 @ Xm is rotated Xl
+ vext.8 q7,q9,q9,#8
+ vext.8 q3,q8,q8,#8
+ veor q0,q1,q10
+.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
+ veor q3,q3,q2 @ accumulate q3 early
+
+ vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
+.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
+ veor q3,q3,q10
+ veor q9,q9,q7 @ Karatsuba pre-processing
+ veor q3,q3,q0
+.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
+ bhs .Loop_mod2x_v8 @ there was at least 32 more bytes
+
+ veor q2,q2,q10
+ vext.8 q3,q8,q8,#8 @ re-construct q3
+ adds r3,r3,#32 @ re-construct r3
+ veor q0,q0,q2 @ re-construct q0
+ beq .Ldone_v8 @ is r3 zero?
+.Lodd_tail_v8:
+ vext.8 q10,q0,q0,#8
+ veor q3,q3,q0 @ inp^=Xi
+ veor q9,q8,q10 @ q9 is rotated inp^Xi
+
+.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
+ veor q9,q9,q3 @ Karatsuba pre-processing
+.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
+.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
+ veor q10,q0,q2
+ veor q1,q1,q9
+ veor q1,q1,q10
+.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
+
+ vmov d4,d3 @ Xh|Xm - 256-bit result
+ vmov d3,d0 @ Xm is rotated Xl
+ veor q0,q1,q10
+
+ vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
+.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
+ veor q10,q10,q2
+ veor q0,q0,q10
+
+.Ldone_v8:
+#ifndef __ARMEB__
+ vrev64.8 q0,q0
+#endif
+ vext.8 q0,q0,q0,#8
+ vst1.64 {q0},[r0] @ write out Xi
+
+ vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so
+ bx lr
+.size gcm_ghash_v8,.-gcm_ghash_v8
+.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/bcm/ghashv8-armv8-apple.S b/gen/bcm/ghashv8-armv8-apple.S
new file mode 100644
index 0000000..6bc8a4f
--- /dev/null
+++ b/gen/bcm/ghashv8-armv8-apple.S
@@ -0,0 +1,565 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+
+.globl _gcm_init_v8
+.private_extern _gcm_init_v8
+
+.align 4
+_gcm_init_v8:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v17.2d},[x1] //load input H
+ movi v19.16b,#0xe1
+ shl v19.2d,v19.2d,#57 //0xc2.0
+ ext v3.16b,v17.16b,v17.16b,#8
+ ushr v18.2d,v19.2d,#63
+ dup v17.4s,v17.s[1]
+ ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
+ ushr v18.2d,v3.2d,#63
+ sshr v17.4s,v17.4s,#31 //broadcast carry bit
+ and v18.16b,v18.16b,v16.16b
+ shl v3.2d,v3.2d,#1
+ ext v18.16b,v18.16b,v18.16b,#8
+ and v16.16b,v16.16b,v17.16b
+ orr v3.16b,v3.16b,v18.16b //H<<<=1
+ eor v20.16b,v3.16b,v16.16b //twisted H
+ st1 {v20.2d},[x0],#16 //store Htable[0]
+
+ //calculate H^2
+ ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
+ pmull v0.1q,v20.1d,v20.1d
+ eor v16.16b,v16.16b,v20.16b
+ pmull2 v2.1q,v20.2d,v20.2d
+ pmull v1.1q,v16.1d,v16.1d
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase
+
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v22.16b,v0.16b,v18.16b
+
+ ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
+ eor v17.16b,v17.16b,v22.16b
+ ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
+ st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2]
+ //calculate H^3 and H^4
+ pmull v0.1q,v20.1d, v22.1d
+ pmull v5.1q,v22.1d,v22.1d
+ pmull2 v2.1q,v20.2d, v22.2d
+ pmull2 v7.1q,v22.2d,v22.2d
+ pmull v1.1q,v16.1d,v17.1d
+ pmull v6.1q,v17.1d,v17.1d
+
+ ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ ext v17.16b,v5.16b,v7.16b,#8
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v16.16b
+ eor v4.16b,v5.16b,v7.16b
+ eor v6.16b,v6.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase
+ eor v6.16b,v6.16b,v4.16b
+ pmull v4.1q,v5.1d,v19.1d
+
+ ins v2.d[0],v1.d[1]
+ ins v7.d[0],v6.d[1]
+ ins v1.d[1],v0.d[0]
+ ins v6.d[1],v5.d[0]
+ eor v0.16b,v1.16b,v18.16b
+ eor v5.16b,v6.16b,v4.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
+ ext v4.16b,v5.16b,v5.16b,#8
+ pmull v0.1q,v0.1d,v19.1d
+ pmull v5.1q,v5.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v4.16b,v4.16b,v7.16b
+ eor v20.16b, v0.16b,v18.16b //H^3
+ eor v22.16b,v5.16b,v4.16b //H^4
+
+ ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing
+ ext v17.16b,v22.16b,v22.16b,#8
+ eor v16.16b,v16.16b,v20.16b
+ eor v17.16b,v17.16b,v22.16b
+ ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
+ st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5]
+ ret
+
+.globl _gcm_gmult_v8
+.private_extern _gcm_gmult_v8
+
+.align 4
+_gcm_gmult_v8:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v17.2d},[x0] //load Xi
+ movi v19.16b,#0xe1
+ ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
+ shl v19.2d,v19.2d,#57
+#ifndef __AARCH64EB__
+ rev64 v17.16b,v17.16b
+#endif
+ ext v3.16b,v17.16b,v17.16b,#8
+
+ pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
+ eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
+ pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
+ pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+
+#ifndef __AARCH64EB__
+ rev64 v0.16b,v0.16b
+#endif
+ ext v0.16b,v0.16b,v0.16b,#8
+ st1 {v0.2d},[x0] //write out Xi
+
+ ret
+
+.globl _gcm_ghash_v8
+.private_extern _gcm_ghash_v8
+
+.align 4
+_gcm_ghash_v8:
+ AARCH64_VALID_CALL_TARGET
+ cmp x3,#64
+ b.hs Lgcm_ghash_v8_4x
+ ld1 {v0.2d},[x0] //load [rotated] Xi
+ //"[rotated]" means that
+ //loaded value would have
+ //to be rotated in order to
+ //make it appear as in
+ //algorithm specification
+ subs x3,x3,#32 //see if x3 is 32 or larger
+ mov x12,#16 //x12 is used as post-
+ //increment for input pointer;
+ //as loop is modulo-scheduled
+ //x12 is zeroed just in time
+ //to preclude overstepping
+ //inp[len], which means that
+ //last block[s] are actually
+ //loaded twice, but last
+ //copy is not processed
+ ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2
+ movi v19.16b,#0xe1
+ ld1 {v22.2d},[x1]
+ csel x12,xzr,x12,eq //is it time to zero x12?
+ ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
+ ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
+ shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
+#ifndef __AARCH64EB__
+ rev64 v16.16b,v16.16b
+ rev64 v0.16b,v0.16b
+#endif
+ ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
+ b.lo Lodd_tail_v8 //x3 was less than 32
+ ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
+#ifndef __AARCH64EB__
+ rev64 v17.16b,v17.16b
+#endif
+ ext v7.16b,v17.16b,v17.16b,#8
+ eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
+ pmull v4.1q,v20.1d,v7.1d //H·Ii+1
+ eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
+ pmull2 v6.1q,v20.2d,v7.2d
+ b Loop_mod2x_v8
+
+.align 4
+Loop_mod2x_v8:
+ ext v18.16b,v3.16b,v3.16b,#8
+ subs x3,x3,#32 //is there more data?
+ pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
+ csel x12,xzr,x12,lo //is it time to zero x12?
+
+ pmull v5.1q,v21.1d,v17.1d
+ eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
+ pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
+ eor v0.16b,v0.16b,v4.16b //accumulate
+ pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+ ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
+
+ eor v2.16b,v2.16b,v6.16b
+ csel x12,xzr,x12,eq //is it time to zero x12?
+ eor v1.16b,v1.16b,v5.16b
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
+#ifndef __AARCH64EB__
+ rev64 v16.16b,v16.16b
+#endif
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+
+#ifndef __AARCH64EB__
+ rev64 v17.16b,v17.16b
+#endif
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ ext v7.16b,v17.16b,v17.16b,#8
+ ext v3.16b,v16.16b,v16.16b,#8
+ eor v0.16b,v1.16b,v18.16b
+ pmull v4.1q,v20.1d,v7.1d //H·Ii+1
+ eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v3.16b,v3.16b,v18.16b
+ eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
+ eor v3.16b,v3.16b,v0.16b
+ pmull2 v6.1q,v20.2d,v7.2d
+ b.hs Loop_mod2x_v8 //there was at least 32 more bytes
+
+ eor v2.16b,v2.16b,v18.16b
+ ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
+ adds x3,x3,#32 //re-construct x3
+ eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
+ b.eq Ldone_v8 //is x3 zero?
+Lodd_tail_v8:
+ ext v18.16b,v0.16b,v0.16b,#8
+ eor v3.16b,v3.16b,v0.16b //inp^=Xi
+ eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
+
+ pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
+ eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
+ pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
+ pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+
+Ldone_v8:
+#ifndef __AARCH64EB__
+ rev64 v0.16b,v0.16b
+#endif
+ ext v0.16b,v0.16b,v0.16b,#8
+ st1 {v0.2d},[x0] //write out Xi
+
+ ret
+
+
+.align 4
+gcm_ghash_v8_4x:
+Lgcm_ghash_v8_4x:
+ ld1 {v0.2d},[x0] //load [rotated] Xi
+ ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
+ movi v19.16b,#0xe1
+ ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4
+ shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
+
+ ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+#ifndef __AARCH64EB__
+ rev64 v0.16b,v0.16b
+ rev64 v5.16b,v5.16b
+ rev64 v6.16b,v6.16b
+ rev64 v7.16b,v7.16b
+ rev64 v4.16b,v4.16b
+#endif
+ ext v25.16b,v7.16b,v7.16b,#8
+ ext v24.16b,v6.16b,v6.16b,#8
+ ext v23.16b,v5.16b,v5.16b,#8
+
+ pmull v29.1q,v20.1d,v25.1d //H·Ii+3
+ eor v7.16b,v7.16b,v25.16b
+ pmull2 v31.1q,v20.2d,v25.2d
+ pmull v30.1q,v21.1d,v7.1d
+
+ pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
+ eor v6.16b,v6.16b,v24.16b
+ pmull2 v24.1q,v22.2d,v24.2d
+ pmull2 v6.1q,v21.2d,v6.2d
+
+ eor v29.16b,v29.16b,v16.16b
+ eor v31.16b,v31.16b,v24.16b
+ eor v30.16b,v30.16b,v6.16b
+
+ pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
+ eor v5.16b,v5.16b,v23.16b
+ pmull2 v23.1q,v26.2d,v23.2d
+ pmull v5.1q,v27.1d,v5.1d
+
+ eor v29.16b,v29.16b,v7.16b
+ eor v31.16b,v31.16b,v23.16b
+ eor v30.16b,v30.16b,v5.16b
+
+ subs x3,x3,#128
+ b.lo Ltail4x
+
+ b Loop4x
+
+.align 4
+Loop4x:
+ eor v16.16b,v4.16b,v0.16b
+ ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+ ext v3.16b,v16.16b,v16.16b,#8
+#ifndef __AARCH64EB__
+ rev64 v5.16b,v5.16b
+ rev64 v6.16b,v6.16b
+ rev64 v7.16b,v7.16b
+ rev64 v4.16b,v4.16b
+#endif
+
+ pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
+ eor v16.16b,v16.16b,v3.16b
+ pmull2 v2.1q,v28.2d,v3.2d
+ ext v25.16b,v7.16b,v7.16b,#8
+ pmull2 v1.1q,v27.2d,v16.2d
+
+ eor v0.16b,v0.16b,v29.16b
+ eor v2.16b,v2.16b,v31.16b
+ ext v24.16b,v6.16b,v6.16b,#8
+ eor v1.16b,v1.16b,v30.16b
+ ext v23.16b,v5.16b,v5.16b,#8
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ pmull v29.1q,v20.1d,v25.1d //H·Ii+3
+ eor v7.16b,v7.16b,v25.16b
+ eor v1.16b,v1.16b,v17.16b
+ pmull2 v31.1q,v20.2d,v25.2d
+ eor v1.16b,v1.16b,v18.16b
+ pmull v30.1q,v21.1d,v7.1d
+
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
+ eor v6.16b,v6.16b,v24.16b
+ pmull2 v24.1q,v22.2d,v24.2d
+ eor v0.16b,v1.16b,v18.16b
+ pmull2 v6.1q,v21.2d,v6.2d
+
+ eor v29.16b,v29.16b,v16.16b
+ eor v31.16b,v31.16b,v24.16b
+ eor v30.16b,v30.16b,v6.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
+ eor v5.16b,v5.16b,v23.16b
+ eor v18.16b,v18.16b,v2.16b
+ pmull2 v23.1q,v26.2d,v23.2d
+ pmull v5.1q,v27.1d,v5.1d
+
+ eor v0.16b,v0.16b,v18.16b
+ eor v29.16b,v29.16b,v7.16b
+ eor v31.16b,v31.16b,v23.16b
+ ext v0.16b,v0.16b,v0.16b,#8
+ eor v30.16b,v30.16b,v5.16b
+
+ subs x3,x3,#64
+ b.hs Loop4x
+
+Ltail4x:
+ eor v16.16b,v4.16b,v0.16b
+ ext v3.16b,v16.16b,v16.16b,#8
+
+ pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
+ eor v16.16b,v16.16b,v3.16b
+ pmull2 v2.1q,v28.2d,v3.2d
+ pmull2 v1.1q,v27.2d,v16.2d
+
+ eor v0.16b,v0.16b,v29.16b
+ eor v2.16b,v2.16b,v31.16b
+ eor v1.16b,v1.16b,v30.16b
+
+ adds x3,x3,#64
+ b.eq Ldone4x
+
+ cmp x3,#32
+ b.lo Lone
+ b.eq Ltwo
+Lthree:
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ ld1 {v4.2d,v5.2d,v6.2d},[x2]
+ eor v1.16b,v1.16b,v18.16b
+#ifndef __AARCH64EB__
+ rev64 v5.16b,v5.16b
+ rev64 v6.16b,v6.16b
+ rev64 v4.16b,v4.16b
+#endif
+
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ ext v24.16b,v6.16b,v6.16b,#8
+ ext v23.16b,v5.16b,v5.16b,#8
+ eor v0.16b,v1.16b,v18.16b
+
+ pmull v29.1q,v20.1d,v24.1d //H·Ii+2
+ eor v6.16b,v6.16b,v24.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ pmull2 v31.1q,v20.2d,v24.2d
+ pmull v30.1q,v21.1d,v6.1d
+ eor v0.16b,v0.16b,v18.16b
+ pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1
+ eor v5.16b,v5.16b,v23.16b
+ ext v0.16b,v0.16b,v0.16b,#8
+
+ pmull2 v23.1q,v22.2d,v23.2d
+ eor v16.16b,v4.16b,v0.16b
+ pmull2 v5.1q,v21.2d,v5.2d
+ ext v3.16b,v16.16b,v16.16b,#8
+
+ eor v29.16b,v29.16b,v7.16b
+ eor v31.16b,v31.16b,v23.16b
+ eor v30.16b,v30.16b,v5.16b
+
+ pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii)
+ eor v16.16b,v16.16b,v3.16b
+ pmull2 v2.1q,v26.2d,v3.2d
+ pmull v1.1q,v27.1d,v16.1d
+
+ eor v0.16b,v0.16b,v29.16b
+ eor v2.16b,v2.16b,v31.16b
+ eor v1.16b,v1.16b,v30.16b
+ b Ldone4x
+
+.align 4
+Ltwo:
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ ld1 {v4.2d,v5.2d},[x2]
+ eor v1.16b,v1.16b,v18.16b
+#ifndef __AARCH64EB__
+ rev64 v5.16b,v5.16b
+ rev64 v4.16b,v4.16b
+#endif
+
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ ext v23.16b,v5.16b,v5.16b,#8
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+ ext v0.16b,v0.16b,v0.16b,#8
+
+ pmull v29.1q,v20.1d,v23.1d //H·Ii+1
+ eor v5.16b,v5.16b,v23.16b
+
+ eor v16.16b,v4.16b,v0.16b
+ ext v3.16b,v16.16b,v16.16b,#8
+
+ pmull2 v31.1q,v20.2d,v23.2d
+ pmull v30.1q,v21.1d,v5.1d
+
+ pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii)
+ eor v16.16b,v16.16b,v3.16b
+ pmull2 v2.1q,v22.2d,v3.2d
+ pmull2 v1.1q,v21.2d,v16.2d
+
+ eor v0.16b,v0.16b,v29.16b
+ eor v2.16b,v2.16b,v31.16b
+ eor v1.16b,v1.16b,v30.16b
+ b Ldone4x
+
+.align 4
+Lone:
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ ld1 {v4.2d},[x2]
+ eor v1.16b,v1.16b,v18.16b
+#ifndef __AARCH64EB__
+ rev64 v4.16b,v4.16b
+#endif
+
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+ ext v0.16b,v0.16b,v0.16b,#8
+
+ eor v16.16b,v4.16b,v0.16b
+ ext v3.16b,v16.16b,v16.16b,#8
+
+ pmull v0.1q,v20.1d,v3.1d
+ eor v16.16b,v16.16b,v3.16b
+ pmull2 v2.1q,v20.2d,v3.2d
+ pmull v1.1q,v21.1d,v16.1d
+
+Ldone4x:
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+ ext v0.16b,v0.16b,v0.16b,#8
+
+#ifndef __AARCH64EB__
+ rev64 v0.16b,v0.16b
+#endif
+ st1 {v0.2d},[x0] //write out Xi
+
+ ret
+
+.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/bcm/ghashv8-armv8-linux.S b/gen/bcm/ghashv8-armv8-linux.S
new file mode 100644
index 0000000..de6f712
--- /dev/null
+++ b/gen/bcm/ghashv8-armv8-linux.S
@@ -0,0 +1,565 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.arch armv8-a+crypto
+.globl gcm_init_v8
+.hidden gcm_init_v8
+.type gcm_init_v8,%function
+.align 4
+gcm_init_v8:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v17.2d},[x1] //load input H
+ movi v19.16b,#0xe1
+ shl v19.2d,v19.2d,#57 //0xc2.0
+ ext v3.16b,v17.16b,v17.16b,#8
+ ushr v18.2d,v19.2d,#63
+ dup v17.4s,v17.s[1]
+ ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
+ ushr v18.2d,v3.2d,#63
+ sshr v17.4s,v17.4s,#31 //broadcast carry bit
+ and v18.16b,v18.16b,v16.16b
+ shl v3.2d,v3.2d,#1
+ ext v18.16b,v18.16b,v18.16b,#8
+ and v16.16b,v16.16b,v17.16b
+ orr v3.16b,v3.16b,v18.16b //H<<<=1
+ eor v20.16b,v3.16b,v16.16b //twisted H
+ st1 {v20.2d},[x0],#16 //store Htable[0]
+
+ //calculate H^2
+ ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
+ pmull v0.1q,v20.1d,v20.1d
+ eor v16.16b,v16.16b,v20.16b
+ pmull2 v2.1q,v20.2d,v20.2d
+ pmull v1.1q,v16.1d,v16.1d
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase
+
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v22.16b,v0.16b,v18.16b
+
+ ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
+ eor v17.16b,v17.16b,v22.16b
+ ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
+ st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2]
+ //calculate H^3 and H^4
+ pmull v0.1q,v20.1d, v22.1d
+ pmull v5.1q,v22.1d,v22.1d
+ pmull2 v2.1q,v20.2d, v22.2d
+ pmull2 v7.1q,v22.2d,v22.2d
+ pmull v1.1q,v16.1d,v17.1d
+ pmull v6.1q,v17.1d,v17.1d
+
+ ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ ext v17.16b,v5.16b,v7.16b,#8
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v16.16b
+ eor v4.16b,v5.16b,v7.16b
+ eor v6.16b,v6.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase
+ eor v6.16b,v6.16b,v4.16b
+ pmull v4.1q,v5.1d,v19.1d
+
+ ins v2.d[0],v1.d[1]
+ ins v7.d[0],v6.d[1]
+ ins v1.d[1],v0.d[0]
+ ins v6.d[1],v5.d[0]
+ eor v0.16b,v1.16b,v18.16b
+ eor v5.16b,v6.16b,v4.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
+ ext v4.16b,v5.16b,v5.16b,#8
+ pmull v0.1q,v0.1d,v19.1d
+ pmull v5.1q,v5.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v4.16b,v4.16b,v7.16b
+ eor v20.16b, v0.16b,v18.16b //H^3
+ eor v22.16b,v5.16b,v4.16b //H^4
+
+ ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing
+ ext v17.16b,v22.16b,v22.16b,#8
+ eor v16.16b,v16.16b,v20.16b
+ eor v17.16b,v17.16b,v22.16b
+ ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
+ st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5]
+ ret
+.size gcm_init_v8,.-gcm_init_v8
+.globl gcm_gmult_v8
+.hidden gcm_gmult_v8
+.type gcm_gmult_v8,%function
+.align 4
+gcm_gmult_v8:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v17.2d},[x0] //load Xi
+ movi v19.16b,#0xe1
+ ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
+ shl v19.2d,v19.2d,#57
+#ifndef __AARCH64EB__
+ rev64 v17.16b,v17.16b
+#endif
+ ext v3.16b,v17.16b,v17.16b,#8
+
+ pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
+ eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
+ pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
+ pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+
+#ifndef __AARCH64EB__
+ rev64 v0.16b,v0.16b
+#endif
+ ext v0.16b,v0.16b,v0.16b,#8
+ st1 {v0.2d},[x0] //write out Xi
+
+ ret
+.size gcm_gmult_v8,.-gcm_gmult_v8
+.globl gcm_ghash_v8
+.hidden gcm_ghash_v8
+.type gcm_ghash_v8,%function
+.align 4
+gcm_ghash_v8:
+ AARCH64_VALID_CALL_TARGET
+ cmp x3,#64
+ b.hs .Lgcm_ghash_v8_4x
+ ld1 {v0.2d},[x0] //load [rotated] Xi
+ //"[rotated]" means that
+ //loaded value would have
+ //to be rotated in order to
+ //make it appear as in
+ //algorithm specification
+ subs x3,x3,#32 //see if x3 is 32 or larger
+ mov x12,#16 //x12 is used as post-
+ //increment for input pointer;
+ //as loop is modulo-scheduled
+ //x12 is zeroed just in time
+ //to preclude overstepping
+ //inp[len], which means that
+ //last block[s] are actually
+ //loaded twice, but last
+ //copy is not processed
+ ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2
+ movi v19.16b,#0xe1
+ ld1 {v22.2d},[x1]
+ csel x12,xzr,x12,eq //is it time to zero x12?
+ ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
+ ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
+ shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
+#ifndef __AARCH64EB__
+ rev64 v16.16b,v16.16b
+ rev64 v0.16b,v0.16b
+#endif
+ ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
+ b.lo .Lodd_tail_v8 //x3 was less than 32
+ ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
+#ifndef __AARCH64EB__
+ rev64 v17.16b,v17.16b
+#endif
+ ext v7.16b,v17.16b,v17.16b,#8
+ eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
+ pmull v4.1q,v20.1d,v7.1d //H·Ii+1
+ eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
+ pmull2 v6.1q,v20.2d,v7.2d
+ b .Loop_mod2x_v8
+
+.align 4
+.Loop_mod2x_v8:
+ ext v18.16b,v3.16b,v3.16b,#8
+ subs x3,x3,#32 //is there more data?
+ pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
+ csel x12,xzr,x12,lo //is it time to zero x12?
+
+ pmull v5.1q,v21.1d,v17.1d
+ eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
+ pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
+ eor v0.16b,v0.16b,v4.16b //accumulate
+ pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+ ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
+
+ eor v2.16b,v2.16b,v6.16b
+ csel x12,xzr,x12,eq //is it time to zero x12?
+ eor v1.16b,v1.16b,v5.16b
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
+#ifndef __AARCH64EB__
+ rev64 v16.16b,v16.16b
+#endif
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+
+#ifndef __AARCH64EB__
+ rev64 v17.16b,v17.16b
+#endif
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ ext v7.16b,v17.16b,v17.16b,#8
+ ext v3.16b,v16.16b,v16.16b,#8
+ eor v0.16b,v1.16b,v18.16b
+ pmull v4.1q,v20.1d,v7.1d //H·Ii+1
+ eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v3.16b,v3.16b,v18.16b
+ eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
+ eor v3.16b,v3.16b,v0.16b
+ pmull2 v6.1q,v20.2d,v7.2d
+ b.hs .Loop_mod2x_v8 //there was at least 32 more bytes
+
+ eor v2.16b,v2.16b,v18.16b
+ ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
+ adds x3,x3,#32 //re-construct x3
+ eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
+ b.eq .Ldone_v8 //is x3 zero?
+.Lodd_tail_v8:
+ ext v18.16b,v0.16b,v0.16b,#8
+ eor v3.16b,v3.16b,v0.16b //inp^=Xi
+ eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
+
+ pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
+ eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
+ pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
+ pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+
+.Ldone_v8:
+#ifndef __AARCH64EB__
+ rev64 v0.16b,v0.16b
+#endif
+ ext v0.16b,v0.16b,v0.16b,#8
+ st1 {v0.2d},[x0] //write out Xi
+
+ ret
+.size gcm_ghash_v8,.-gcm_ghash_v8
+.type gcm_ghash_v8_4x,%function
+.align 4
+gcm_ghash_v8_4x:
+.Lgcm_ghash_v8_4x:
+ ld1 {v0.2d},[x0] //load [rotated] Xi
+ ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
+ movi v19.16b,#0xe1
+ ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4
+ shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
+
+ ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+#ifndef __AARCH64EB__
+ rev64 v0.16b,v0.16b
+ rev64 v5.16b,v5.16b
+ rev64 v6.16b,v6.16b
+ rev64 v7.16b,v7.16b
+ rev64 v4.16b,v4.16b
+#endif
+ ext v25.16b,v7.16b,v7.16b,#8
+ ext v24.16b,v6.16b,v6.16b,#8
+ ext v23.16b,v5.16b,v5.16b,#8
+
+ pmull v29.1q,v20.1d,v25.1d //H·Ii+3
+ eor v7.16b,v7.16b,v25.16b
+ pmull2 v31.1q,v20.2d,v25.2d
+ pmull v30.1q,v21.1d,v7.1d
+
+ pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
+ eor v6.16b,v6.16b,v24.16b
+ pmull2 v24.1q,v22.2d,v24.2d
+ pmull2 v6.1q,v21.2d,v6.2d
+
+ eor v29.16b,v29.16b,v16.16b
+ eor v31.16b,v31.16b,v24.16b
+ eor v30.16b,v30.16b,v6.16b
+
+ pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
+ eor v5.16b,v5.16b,v23.16b
+ pmull2 v23.1q,v26.2d,v23.2d
+ pmull v5.1q,v27.1d,v5.1d
+
+ eor v29.16b,v29.16b,v7.16b
+ eor v31.16b,v31.16b,v23.16b
+ eor v30.16b,v30.16b,v5.16b
+
+ subs x3,x3,#128
+ b.lo .Ltail4x
+
+ b .Loop4x
+
+.align 4
+.Loop4x:
+ eor v16.16b,v4.16b,v0.16b
+ ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+ ext v3.16b,v16.16b,v16.16b,#8
+#ifndef __AARCH64EB__
+ rev64 v5.16b,v5.16b
+ rev64 v6.16b,v6.16b
+ rev64 v7.16b,v7.16b
+ rev64 v4.16b,v4.16b
+#endif
+
+ pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
+ eor v16.16b,v16.16b,v3.16b
+ pmull2 v2.1q,v28.2d,v3.2d
+ ext v25.16b,v7.16b,v7.16b,#8
+ pmull2 v1.1q,v27.2d,v16.2d
+
+ eor v0.16b,v0.16b,v29.16b
+ eor v2.16b,v2.16b,v31.16b
+ ext v24.16b,v6.16b,v6.16b,#8
+ eor v1.16b,v1.16b,v30.16b
+ ext v23.16b,v5.16b,v5.16b,#8
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ pmull v29.1q,v20.1d,v25.1d //H·Ii+3
+ eor v7.16b,v7.16b,v25.16b
+ eor v1.16b,v1.16b,v17.16b
+ pmull2 v31.1q,v20.2d,v25.2d
+ eor v1.16b,v1.16b,v18.16b
+ pmull v30.1q,v21.1d,v7.1d
+
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
+ eor v6.16b,v6.16b,v24.16b
+ pmull2 v24.1q,v22.2d,v24.2d
+ eor v0.16b,v1.16b,v18.16b
+ pmull2 v6.1q,v21.2d,v6.2d
+
+ eor v29.16b,v29.16b,v16.16b
+ eor v31.16b,v31.16b,v24.16b
+ eor v30.16b,v30.16b,v6.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
+ eor v5.16b,v5.16b,v23.16b
+ eor v18.16b,v18.16b,v2.16b
+ pmull2 v23.1q,v26.2d,v23.2d
+ pmull v5.1q,v27.1d,v5.1d
+
+ eor v0.16b,v0.16b,v18.16b
+ eor v29.16b,v29.16b,v7.16b
+ eor v31.16b,v31.16b,v23.16b
+ ext v0.16b,v0.16b,v0.16b,#8
+ eor v30.16b,v30.16b,v5.16b
+
+ subs x3,x3,#64
+ b.hs .Loop4x
+
+.Ltail4x:
+ eor v16.16b,v4.16b,v0.16b
+ ext v3.16b,v16.16b,v16.16b,#8
+
+ pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
+ eor v16.16b,v16.16b,v3.16b
+ pmull2 v2.1q,v28.2d,v3.2d
+ pmull2 v1.1q,v27.2d,v16.2d
+
+ eor v0.16b,v0.16b,v29.16b
+ eor v2.16b,v2.16b,v31.16b
+ eor v1.16b,v1.16b,v30.16b
+
+ adds x3,x3,#64
+ b.eq .Ldone4x
+
+ cmp x3,#32
+ b.lo .Lone
+ b.eq .Ltwo
+.Lthree:
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ ld1 {v4.2d,v5.2d,v6.2d},[x2]
+ eor v1.16b,v1.16b,v18.16b
+#ifndef __AARCH64EB__
+ rev64 v5.16b,v5.16b
+ rev64 v6.16b,v6.16b
+ rev64 v4.16b,v4.16b
+#endif
+
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ ext v24.16b,v6.16b,v6.16b,#8
+ ext v23.16b,v5.16b,v5.16b,#8
+ eor v0.16b,v1.16b,v18.16b
+
+ pmull v29.1q,v20.1d,v24.1d //H·Ii+2
+ eor v6.16b,v6.16b,v24.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ pmull2 v31.1q,v20.2d,v24.2d
+ pmull v30.1q,v21.1d,v6.1d
+ eor v0.16b,v0.16b,v18.16b
+ pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1
+ eor v5.16b,v5.16b,v23.16b
+ ext v0.16b,v0.16b,v0.16b,#8
+
+ pmull2 v23.1q,v22.2d,v23.2d
+ eor v16.16b,v4.16b,v0.16b
+ pmull2 v5.1q,v21.2d,v5.2d
+ ext v3.16b,v16.16b,v16.16b,#8
+
+ eor v29.16b,v29.16b,v7.16b
+ eor v31.16b,v31.16b,v23.16b
+ eor v30.16b,v30.16b,v5.16b
+
+ pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii)
+ eor v16.16b,v16.16b,v3.16b
+ pmull2 v2.1q,v26.2d,v3.2d
+ pmull v1.1q,v27.1d,v16.1d
+
+ eor v0.16b,v0.16b,v29.16b
+ eor v2.16b,v2.16b,v31.16b
+ eor v1.16b,v1.16b,v30.16b
+ b .Ldone4x
+
+.align 4
+.Ltwo:
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ ld1 {v4.2d,v5.2d},[x2]
+ eor v1.16b,v1.16b,v18.16b
+#ifndef __AARCH64EB__
+ rev64 v5.16b,v5.16b
+ rev64 v4.16b,v4.16b
+#endif
+
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ ext v23.16b,v5.16b,v5.16b,#8
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+ ext v0.16b,v0.16b,v0.16b,#8
+
+ pmull v29.1q,v20.1d,v23.1d //H·Ii+1
+ eor v5.16b,v5.16b,v23.16b
+
+ eor v16.16b,v4.16b,v0.16b
+ ext v3.16b,v16.16b,v16.16b,#8
+
+ pmull2 v31.1q,v20.2d,v23.2d
+ pmull v30.1q,v21.1d,v5.1d
+
+ pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii)
+ eor v16.16b,v16.16b,v3.16b
+ pmull2 v2.1q,v22.2d,v3.2d
+ pmull2 v1.1q,v21.2d,v16.2d
+
+ eor v0.16b,v0.16b,v29.16b
+ eor v2.16b,v2.16b,v31.16b
+ eor v1.16b,v1.16b,v30.16b
+ b .Ldone4x
+
+.align 4
+.Lone:
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ ld1 {v4.2d},[x2]
+ eor v1.16b,v1.16b,v18.16b
+#ifndef __AARCH64EB__
+ rev64 v4.16b,v4.16b
+#endif
+
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+ ext v0.16b,v0.16b,v0.16b,#8
+
+ eor v16.16b,v4.16b,v0.16b
+ ext v3.16b,v16.16b,v16.16b,#8
+
+ pmull v0.1q,v20.1d,v3.1d
+ eor v16.16b,v16.16b,v3.16b
+ pmull2 v2.1q,v20.2d,v3.2d
+ pmull v1.1q,v21.1d,v16.1d
+
+.Ldone4x:
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+ ext v0.16b,v0.16b,v0.16b,#8
+
+#ifndef __AARCH64EB__
+ rev64 v0.16b,v0.16b
+#endif
+ st1 {v0.2d},[x0] //write out Xi
+
+ ret
+.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
+.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/ghashv8-armv8-win.S b/gen/bcm/ghashv8-armv8-win.S
new file mode 100644
index 0000000..0be9ac6
--- /dev/null
+++ b/gen/bcm/ghashv8-armv8-win.S
@@ -0,0 +1,573 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.arch armv8-a+crypto
+.globl gcm_init_v8
+
+.def gcm_init_v8
+ .type 32
+.endef
+.align 4
+gcm_init_v8:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v17.2d},[x1] //load input H
+ movi v19.16b,#0xe1
+ shl v19.2d,v19.2d,#57 //0xc2.0
+ ext v3.16b,v17.16b,v17.16b,#8
+ ushr v18.2d,v19.2d,#63
+ dup v17.4s,v17.s[1]
+ ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
+ ushr v18.2d,v3.2d,#63
+ sshr v17.4s,v17.4s,#31 //broadcast carry bit
+ and v18.16b,v18.16b,v16.16b
+ shl v3.2d,v3.2d,#1
+ ext v18.16b,v18.16b,v18.16b,#8
+ and v16.16b,v16.16b,v17.16b
+ orr v3.16b,v3.16b,v18.16b //H<<<=1
+ eor v20.16b,v3.16b,v16.16b //twisted H
+ st1 {v20.2d},[x0],#16 //store Htable[0]
+
+ //calculate H^2
+ ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
+ pmull v0.1q,v20.1d,v20.1d
+ eor v16.16b,v16.16b,v20.16b
+ pmull2 v2.1q,v20.2d,v20.2d
+ pmull v1.1q,v16.1d,v16.1d
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase
+
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v22.16b,v0.16b,v18.16b
+
+ ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
+ eor v17.16b,v17.16b,v22.16b
+ ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
+ st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2]
+ //calculate H^3 and H^4
+ pmull v0.1q,v20.1d, v22.1d
+ pmull v5.1q,v22.1d,v22.1d
+ pmull2 v2.1q,v20.2d, v22.2d
+ pmull2 v7.1q,v22.2d,v22.2d
+ pmull v1.1q,v16.1d,v17.1d
+ pmull v6.1q,v17.1d,v17.1d
+
+ ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ ext v17.16b,v5.16b,v7.16b,#8
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v16.16b
+ eor v4.16b,v5.16b,v7.16b
+ eor v6.16b,v6.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase
+ eor v6.16b,v6.16b,v4.16b
+ pmull v4.1q,v5.1d,v19.1d
+
+ ins v2.d[0],v1.d[1]
+ ins v7.d[0],v6.d[1]
+ ins v1.d[1],v0.d[0]
+ ins v6.d[1],v5.d[0]
+ eor v0.16b,v1.16b,v18.16b
+ eor v5.16b,v6.16b,v4.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
+ ext v4.16b,v5.16b,v5.16b,#8
+ pmull v0.1q,v0.1d,v19.1d
+ pmull v5.1q,v5.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v4.16b,v4.16b,v7.16b
+ eor v20.16b, v0.16b,v18.16b //H^3
+ eor v22.16b,v5.16b,v4.16b //H^4
+
+ ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing
+ ext v17.16b,v22.16b,v22.16b,#8
+ eor v16.16b,v16.16b,v20.16b
+ eor v17.16b,v17.16b,v22.16b
+ ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
+ st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5]
+ ret
+
+.globl gcm_gmult_v8
+
+.def gcm_gmult_v8
+ .type 32
+.endef
+.align 4
+gcm_gmult_v8:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v17.2d},[x0] //load Xi
+ movi v19.16b,#0xe1
+ ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
+ shl v19.2d,v19.2d,#57
+#ifndef __AARCH64EB__
+ rev64 v17.16b,v17.16b
+#endif
+ ext v3.16b,v17.16b,v17.16b,#8
+
+ pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
+ eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
+ pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
+ pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+
+#ifndef __AARCH64EB__
+ rev64 v0.16b,v0.16b
+#endif
+ ext v0.16b,v0.16b,v0.16b,#8
+ st1 {v0.2d},[x0] //write out Xi
+
+ ret
+
+.globl gcm_ghash_v8
+
+.def gcm_ghash_v8
+ .type 32
+.endef
+.align 4
+gcm_ghash_v8:
+ AARCH64_VALID_CALL_TARGET
+ cmp x3,#64
+ b.hs Lgcm_ghash_v8_4x
+ ld1 {v0.2d},[x0] //load [rotated] Xi
+ //"[rotated]" means that
+ //loaded value would have
+ //to be rotated in order to
+ //make it appear as in
+ //algorithm specification
+ subs x3,x3,#32 //see if x3 is 32 or larger
+ mov x12,#16 //x12 is used as post-
+ //increment for input pointer;
+ //as loop is modulo-scheduled
+ //x12 is zeroed just in time
+ //to preclude overstepping
+ //inp[len], which means that
+ //last block[s] are actually
+ //loaded twice, but last
+ //copy is not processed
+ ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2
+ movi v19.16b,#0xe1
+ ld1 {v22.2d},[x1]
+ csel x12,xzr,x12,eq //is it time to zero x12?
+ ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
+ ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
+ shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
+#ifndef __AARCH64EB__
+ rev64 v16.16b,v16.16b
+ rev64 v0.16b,v0.16b
+#endif
+ ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
+ b.lo Lodd_tail_v8 //x3 was less than 32
+ ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
+#ifndef __AARCH64EB__
+ rev64 v17.16b,v17.16b
+#endif
+ ext v7.16b,v17.16b,v17.16b,#8
+ eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
+ pmull v4.1q,v20.1d,v7.1d //H·Ii+1
+ eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
+ pmull2 v6.1q,v20.2d,v7.2d
+ b Loop_mod2x_v8
+
+.align 4
+Loop_mod2x_v8:
+ ext v18.16b,v3.16b,v3.16b,#8
+ subs x3,x3,#32 //is there more data?
+ pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
+ csel x12,xzr,x12,lo //is it time to zero x12?
+
+ pmull v5.1q,v21.1d,v17.1d
+ eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
+ pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
+ eor v0.16b,v0.16b,v4.16b //accumulate
+ pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+ ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
+
+ eor v2.16b,v2.16b,v6.16b
+ csel x12,xzr,x12,eq //is it time to zero x12?
+ eor v1.16b,v1.16b,v5.16b
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
+#ifndef __AARCH64EB__
+ rev64 v16.16b,v16.16b
+#endif
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+
+#ifndef __AARCH64EB__
+ rev64 v17.16b,v17.16b
+#endif
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ ext v7.16b,v17.16b,v17.16b,#8
+ ext v3.16b,v16.16b,v16.16b,#8
+ eor v0.16b,v1.16b,v18.16b
+ pmull v4.1q,v20.1d,v7.1d //H·Ii+1
+ eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v3.16b,v3.16b,v18.16b
+ eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
+ eor v3.16b,v3.16b,v0.16b
+ pmull2 v6.1q,v20.2d,v7.2d
+ b.hs Loop_mod2x_v8 //there was at least 32 more bytes
+
+ eor v2.16b,v2.16b,v18.16b
+ ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
+ adds x3,x3,#32 //re-construct x3
+ eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
+ b.eq Ldone_v8 //is x3 zero?
+Lodd_tail_v8:
+ ext v18.16b,v0.16b,v0.16b,#8
+ eor v3.16b,v3.16b,v0.16b //inp^=Xi
+ eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
+
+ pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
+ eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
+ pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
+ pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+
+Ldone_v8:
+#ifndef __AARCH64EB__
+ rev64 v0.16b,v0.16b
+#endif
+ ext v0.16b,v0.16b,v0.16b,#8
+ st1 {v0.2d},[x0] //write out Xi
+
+ ret
+
+.def gcm_ghash_v8_4x
+ .type 32
+.endef
+.align 4
+gcm_ghash_v8_4x:
+Lgcm_ghash_v8_4x:
+ ld1 {v0.2d},[x0] //load [rotated] Xi
+ ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
+ movi v19.16b,#0xe1
+ ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4
+ shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
+
+ ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+#ifndef __AARCH64EB__
+ rev64 v0.16b,v0.16b
+ rev64 v5.16b,v5.16b
+ rev64 v6.16b,v6.16b
+ rev64 v7.16b,v7.16b
+ rev64 v4.16b,v4.16b
+#endif
+ ext v25.16b,v7.16b,v7.16b,#8
+ ext v24.16b,v6.16b,v6.16b,#8
+ ext v23.16b,v5.16b,v5.16b,#8
+
+ pmull v29.1q,v20.1d,v25.1d //H·Ii+3
+ eor v7.16b,v7.16b,v25.16b
+ pmull2 v31.1q,v20.2d,v25.2d
+ pmull v30.1q,v21.1d,v7.1d
+
+ pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
+ eor v6.16b,v6.16b,v24.16b
+ pmull2 v24.1q,v22.2d,v24.2d
+ pmull2 v6.1q,v21.2d,v6.2d
+
+ eor v29.16b,v29.16b,v16.16b
+ eor v31.16b,v31.16b,v24.16b
+ eor v30.16b,v30.16b,v6.16b
+
+ pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
+ eor v5.16b,v5.16b,v23.16b
+ pmull2 v23.1q,v26.2d,v23.2d
+ pmull v5.1q,v27.1d,v5.1d
+
+ eor v29.16b,v29.16b,v7.16b
+ eor v31.16b,v31.16b,v23.16b
+ eor v30.16b,v30.16b,v5.16b
+
+ subs x3,x3,#128
+ b.lo Ltail4x
+
+ b Loop4x
+
+.align 4
+Loop4x:
+ eor v16.16b,v4.16b,v0.16b
+ ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+ ext v3.16b,v16.16b,v16.16b,#8
+#ifndef __AARCH64EB__
+ rev64 v5.16b,v5.16b
+ rev64 v6.16b,v6.16b
+ rev64 v7.16b,v7.16b
+ rev64 v4.16b,v4.16b
+#endif
+
+ pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
+ eor v16.16b,v16.16b,v3.16b
+ pmull2 v2.1q,v28.2d,v3.2d
+ ext v25.16b,v7.16b,v7.16b,#8
+ pmull2 v1.1q,v27.2d,v16.2d
+
+ eor v0.16b,v0.16b,v29.16b
+ eor v2.16b,v2.16b,v31.16b
+ ext v24.16b,v6.16b,v6.16b,#8
+ eor v1.16b,v1.16b,v30.16b
+ ext v23.16b,v5.16b,v5.16b,#8
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ pmull v29.1q,v20.1d,v25.1d //H·Ii+3
+ eor v7.16b,v7.16b,v25.16b
+ eor v1.16b,v1.16b,v17.16b
+ pmull2 v31.1q,v20.2d,v25.2d
+ eor v1.16b,v1.16b,v18.16b
+ pmull v30.1q,v21.1d,v7.1d
+
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
+ eor v6.16b,v6.16b,v24.16b
+ pmull2 v24.1q,v22.2d,v24.2d
+ eor v0.16b,v1.16b,v18.16b
+ pmull2 v6.1q,v21.2d,v6.2d
+
+ eor v29.16b,v29.16b,v16.16b
+ eor v31.16b,v31.16b,v24.16b
+ eor v30.16b,v30.16b,v6.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
+ eor v5.16b,v5.16b,v23.16b
+ eor v18.16b,v18.16b,v2.16b
+ pmull2 v23.1q,v26.2d,v23.2d
+ pmull v5.1q,v27.1d,v5.1d
+
+ eor v0.16b,v0.16b,v18.16b
+ eor v29.16b,v29.16b,v7.16b
+ eor v31.16b,v31.16b,v23.16b
+ ext v0.16b,v0.16b,v0.16b,#8
+ eor v30.16b,v30.16b,v5.16b
+
+ subs x3,x3,#64
+ b.hs Loop4x
+
+Ltail4x:
+ eor v16.16b,v4.16b,v0.16b
+ ext v3.16b,v16.16b,v16.16b,#8
+
+ pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
+ eor v16.16b,v16.16b,v3.16b
+ pmull2 v2.1q,v28.2d,v3.2d
+ pmull2 v1.1q,v27.2d,v16.2d
+
+ eor v0.16b,v0.16b,v29.16b
+ eor v2.16b,v2.16b,v31.16b
+ eor v1.16b,v1.16b,v30.16b
+
+ adds x3,x3,#64
+ b.eq Ldone4x
+
+ cmp x3,#32
+ b.lo Lone
+ b.eq Ltwo
+Lthree:
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ ld1 {v4.2d,v5.2d,v6.2d},[x2]
+ eor v1.16b,v1.16b,v18.16b
+#ifndef __AARCH64EB__
+ rev64 v5.16b,v5.16b
+ rev64 v6.16b,v6.16b
+ rev64 v4.16b,v4.16b
+#endif
+
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ ext v24.16b,v6.16b,v6.16b,#8
+ ext v23.16b,v5.16b,v5.16b,#8
+ eor v0.16b,v1.16b,v18.16b
+
+ pmull v29.1q,v20.1d,v24.1d //H·Ii+2
+ eor v6.16b,v6.16b,v24.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ pmull2 v31.1q,v20.2d,v24.2d
+ pmull v30.1q,v21.1d,v6.1d
+ eor v0.16b,v0.16b,v18.16b
+ pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1
+ eor v5.16b,v5.16b,v23.16b
+ ext v0.16b,v0.16b,v0.16b,#8
+
+ pmull2 v23.1q,v22.2d,v23.2d
+ eor v16.16b,v4.16b,v0.16b
+ pmull2 v5.1q,v21.2d,v5.2d
+ ext v3.16b,v16.16b,v16.16b,#8
+
+ eor v29.16b,v29.16b,v7.16b
+ eor v31.16b,v31.16b,v23.16b
+ eor v30.16b,v30.16b,v5.16b
+
+ pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii)
+ eor v16.16b,v16.16b,v3.16b
+ pmull2 v2.1q,v26.2d,v3.2d
+ pmull v1.1q,v27.1d,v16.1d
+
+ eor v0.16b,v0.16b,v29.16b
+ eor v2.16b,v2.16b,v31.16b
+ eor v1.16b,v1.16b,v30.16b
+ b Ldone4x
+
+.align 4
+Ltwo:
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ ld1 {v4.2d,v5.2d},[x2]
+ eor v1.16b,v1.16b,v18.16b
+#ifndef __AARCH64EB__
+ rev64 v5.16b,v5.16b
+ rev64 v4.16b,v4.16b
+#endif
+
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ ext v23.16b,v5.16b,v5.16b,#8
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+ ext v0.16b,v0.16b,v0.16b,#8
+
+ pmull v29.1q,v20.1d,v23.1d //H·Ii+1
+ eor v5.16b,v5.16b,v23.16b
+
+ eor v16.16b,v4.16b,v0.16b
+ ext v3.16b,v16.16b,v16.16b,#8
+
+ pmull2 v31.1q,v20.2d,v23.2d
+ pmull v30.1q,v21.1d,v5.1d
+
+ pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii)
+ eor v16.16b,v16.16b,v3.16b
+ pmull2 v2.1q,v22.2d,v3.2d
+ pmull2 v1.1q,v21.2d,v16.2d
+
+ eor v0.16b,v0.16b,v29.16b
+ eor v2.16b,v2.16b,v31.16b
+ eor v1.16b,v1.16b,v30.16b
+ b Ldone4x
+
+.align 4
+Lone:
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ ld1 {v4.2d},[x2]
+ eor v1.16b,v1.16b,v18.16b
+#ifndef __AARCH64EB__
+ rev64 v4.16b,v4.16b
+#endif
+
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+ ext v0.16b,v0.16b,v0.16b,#8
+
+ eor v16.16b,v4.16b,v0.16b
+ ext v3.16b,v16.16b,v16.16b,#8
+
+ pmull v0.1q,v20.1d,v3.1d
+ eor v16.16b,v16.16b,v3.16b
+ pmull2 v2.1q,v20.2d,v3.2d
+ pmull v1.1q,v21.1d,v16.1d
+
+Ldone4x:
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+ ext v0.16b,v0.16b,v0.16b,#8
+
+#ifndef __AARCH64EB__
+ rev64 v0.16b,v0.16b
+#endif
+ st1 {v0.2d},[x0] //write out Xi
+
+ ret
+
+.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/md5-586-apple.S b/gen/bcm/md5-586-apple.S
new file mode 100644
index 0000000..986d590
--- /dev/null
+++ b/gen/bcm/md5-586-apple.S
@@ -0,0 +1,684 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl _md5_block_asm_data_order
+.private_extern _md5_block_asm_data_order
+.align 4
+_md5_block_asm_data_order:
+L_md5_block_asm_data_order_begin:
+ pushl %esi
+ pushl %edi
+ movl 12(%esp),%edi
+ movl 16(%esp),%esi
+ movl 20(%esp),%ecx
+ pushl %ebp
+ shll $6,%ecx
+ pushl %ebx
+ addl %esi,%ecx
+ subl $64,%ecx
+ movl (%edi),%eax
+ pushl %ecx
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+L000start:
+
+ # R0 section
+ movl %ecx,%edi
+ movl (%esi),%ebp
+ # R0 0
+ xorl %edx,%edi
+ andl %ebx,%edi
+ leal 3614090360(%eax,%ebp,1),%eax
+ xorl %edx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $7,%eax
+ movl 4(%esi),%ebp
+ addl %ebx,%eax
+ # R0 1
+ xorl %ecx,%edi
+ andl %eax,%edi
+ leal 3905402710(%edx,%ebp,1),%edx
+ xorl %ecx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $12,%edx
+ movl 8(%esi),%ebp
+ addl %eax,%edx
+ # R0 2
+ xorl %ebx,%edi
+ andl %edx,%edi
+ leal 606105819(%ecx,%ebp,1),%ecx
+ xorl %ebx,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $17,%ecx
+ movl 12(%esi),%ebp
+ addl %edx,%ecx
+ # R0 3
+ xorl %eax,%edi
+ andl %ecx,%edi
+ leal 3250441966(%ebx,%ebp,1),%ebx
+ xorl %eax,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $22,%ebx
+ movl 16(%esi),%ebp
+ addl %ecx,%ebx
+ # R0 4
+ xorl %edx,%edi
+ andl %ebx,%edi
+ leal 4118548399(%eax,%ebp,1),%eax
+ xorl %edx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $7,%eax
+ movl 20(%esi),%ebp
+ addl %ebx,%eax
+ # R0 5
+ xorl %ecx,%edi
+ andl %eax,%edi
+ leal 1200080426(%edx,%ebp,1),%edx
+ xorl %ecx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $12,%edx
+ movl 24(%esi),%ebp
+ addl %eax,%edx
+ # R0 6
+ xorl %ebx,%edi
+ andl %edx,%edi
+ leal 2821735955(%ecx,%ebp,1),%ecx
+ xorl %ebx,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $17,%ecx
+ movl 28(%esi),%ebp
+ addl %edx,%ecx
+ # R0 7
+ xorl %eax,%edi
+ andl %ecx,%edi
+ leal 4249261313(%ebx,%ebp,1),%ebx
+ xorl %eax,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $22,%ebx
+ movl 32(%esi),%ebp
+ addl %ecx,%ebx
+ # R0 8
+ xorl %edx,%edi
+ andl %ebx,%edi
+ leal 1770035416(%eax,%ebp,1),%eax
+ xorl %edx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $7,%eax
+ movl 36(%esi),%ebp
+ addl %ebx,%eax
+ # R0 9
+ xorl %ecx,%edi
+ andl %eax,%edi
+ leal 2336552879(%edx,%ebp,1),%edx
+ xorl %ecx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $12,%edx
+ movl 40(%esi),%ebp
+ addl %eax,%edx
+ # R0 10
+ xorl %ebx,%edi
+ andl %edx,%edi
+ leal 4294925233(%ecx,%ebp,1),%ecx
+ xorl %ebx,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $17,%ecx
+ movl 44(%esi),%ebp
+ addl %edx,%ecx
+ # R0 11
+ xorl %eax,%edi
+ andl %ecx,%edi
+ leal 2304563134(%ebx,%ebp,1),%ebx
+ xorl %eax,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $22,%ebx
+ movl 48(%esi),%ebp
+ addl %ecx,%ebx
+ # R0 12
+ xorl %edx,%edi
+ andl %ebx,%edi
+ leal 1804603682(%eax,%ebp,1),%eax
+ xorl %edx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $7,%eax
+ movl 52(%esi),%ebp
+ addl %ebx,%eax
+ # R0 13
+ xorl %ecx,%edi
+ andl %eax,%edi
+ leal 4254626195(%edx,%ebp,1),%edx
+ xorl %ecx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $12,%edx
+ movl 56(%esi),%ebp
+ addl %eax,%edx
+ # R0 14
+ xorl %ebx,%edi
+ andl %edx,%edi
+ leal 2792965006(%ecx,%ebp,1),%ecx
+ xorl %ebx,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $17,%ecx
+ movl 60(%esi),%ebp
+ addl %edx,%ecx
+ # R0 15
+ xorl %eax,%edi
+ andl %ecx,%edi
+ leal 1236535329(%ebx,%ebp,1),%ebx
+ xorl %eax,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $22,%ebx
+ movl 4(%esi),%ebp
+ addl %ecx,%ebx
+
+ # R1 section
+ # R1 16
+ leal 4129170786(%eax,%ebp,1),%eax
+ xorl %ebx,%edi
+ andl %edx,%edi
+ movl 24(%esi),%ebp
+ xorl %ecx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%eax
+ addl %ebx,%eax
+ # R1 17
+ leal 3225465664(%edx,%ebp,1),%edx
+ xorl %eax,%edi
+ andl %ecx,%edi
+ movl 44(%esi),%ebp
+ xorl %ebx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $9,%edx
+ addl %eax,%edx
+ # R1 18
+ leal 643717713(%ecx,%ebp,1),%ecx
+ xorl %edx,%edi
+ andl %ebx,%edi
+ movl (%esi),%ebp
+ xorl %eax,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $14,%ecx
+ addl %edx,%ecx
+ # R1 19
+ leal 3921069994(%ebx,%ebp,1),%ebx
+ xorl %ecx,%edi
+ andl %eax,%edi
+ movl 20(%esi),%ebp
+ xorl %edx,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $20,%ebx
+ addl %ecx,%ebx
+ # R1 20
+ leal 3593408605(%eax,%ebp,1),%eax
+ xorl %ebx,%edi
+ andl %edx,%edi
+ movl 40(%esi),%ebp
+ xorl %ecx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%eax
+ addl %ebx,%eax
+ # R1 21
+ leal 38016083(%edx,%ebp,1),%edx
+ xorl %eax,%edi
+ andl %ecx,%edi
+ movl 60(%esi),%ebp
+ xorl %ebx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $9,%edx
+ addl %eax,%edx
+ # R1 22
+ leal 3634488961(%ecx,%ebp,1),%ecx
+ xorl %edx,%edi
+ andl %ebx,%edi
+ movl 16(%esi),%ebp
+ xorl %eax,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $14,%ecx
+ addl %edx,%ecx
+ # R1 23
+ leal 3889429448(%ebx,%ebp,1),%ebx
+ xorl %ecx,%edi
+ andl %eax,%edi
+ movl 36(%esi),%ebp
+ xorl %edx,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $20,%ebx
+ addl %ecx,%ebx
+ # R1 24
+ leal 568446438(%eax,%ebp,1),%eax
+ xorl %ebx,%edi
+ andl %edx,%edi
+ movl 56(%esi),%ebp
+ xorl %ecx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%eax
+ addl %ebx,%eax
+ # R1 25
+ leal 3275163606(%edx,%ebp,1),%edx
+ xorl %eax,%edi
+ andl %ecx,%edi
+ movl 12(%esi),%ebp
+ xorl %ebx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $9,%edx
+ addl %eax,%edx
+ # R1 26
+ leal 4107603335(%ecx,%ebp,1),%ecx
+ xorl %edx,%edi
+ andl %ebx,%edi
+ movl 32(%esi),%ebp
+ xorl %eax,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $14,%ecx
+ addl %edx,%ecx
+ # R1 27
+ leal 1163531501(%ebx,%ebp,1),%ebx
+ xorl %ecx,%edi
+ andl %eax,%edi
+ movl 52(%esi),%ebp
+ xorl %edx,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $20,%ebx
+ addl %ecx,%ebx
+ # R1 28
+ leal 2850285829(%eax,%ebp,1),%eax
+ xorl %ebx,%edi
+ andl %edx,%edi
+ movl 8(%esi),%ebp
+ xorl %ecx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%eax
+ addl %ebx,%eax
+ # R1 29
+ leal 4243563512(%edx,%ebp,1),%edx
+ xorl %eax,%edi
+ andl %ecx,%edi
+ movl 28(%esi),%ebp
+ xorl %ebx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $9,%edx
+ addl %eax,%edx
+ # R1 30
+ leal 1735328473(%ecx,%ebp,1),%ecx
+ xorl %edx,%edi
+ andl %ebx,%edi
+ movl 48(%esi),%ebp
+ xorl %eax,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $14,%ecx
+ addl %edx,%ecx
+ # R1 31
+ leal 2368359562(%ebx,%ebp,1),%ebx
+ xorl %ecx,%edi
+ andl %eax,%edi
+ movl 20(%esi),%ebp
+ xorl %edx,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $20,%ebx
+ addl %ecx,%ebx
+
+ # R2 section
+ # R2 32
+ xorl %edx,%edi
+ xorl %ebx,%edi
+ leal 4294588738(%eax,%ebp,1),%eax
+ addl %edi,%eax
+ roll $4,%eax
+ movl 32(%esi),%ebp
+ movl %ebx,%edi
+ # R2 33
+ leal 2272392833(%edx,%ebp,1),%edx
+ addl %ebx,%eax
+ xorl %ecx,%edi
+ xorl %eax,%edi
+ movl 44(%esi),%ebp
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $11,%edx
+ addl %eax,%edx
+ # R2 34
+ xorl %ebx,%edi
+ xorl %edx,%edi
+ leal 1839030562(%ecx,%ebp,1),%ecx
+ addl %edi,%ecx
+ roll $16,%ecx
+ movl 56(%esi),%ebp
+ movl %edx,%edi
+ # R2 35
+ leal 4259657740(%ebx,%ebp,1),%ebx
+ addl %edx,%ecx
+ xorl %eax,%edi
+ xorl %ecx,%edi
+ movl 4(%esi),%ebp
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $23,%ebx
+ addl %ecx,%ebx
+ # R2 36
+ xorl %edx,%edi
+ xorl %ebx,%edi
+ leal 2763975236(%eax,%ebp,1),%eax
+ addl %edi,%eax
+ roll $4,%eax
+ movl 16(%esi),%ebp
+ movl %ebx,%edi
+ # R2 37
+ leal 1272893353(%edx,%ebp,1),%edx
+ addl %ebx,%eax
+ xorl %ecx,%edi
+ xorl %eax,%edi
+ movl 28(%esi),%ebp
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $11,%edx
+ addl %eax,%edx
+ # R2 38
+ xorl %ebx,%edi
+ xorl %edx,%edi
+ leal 4139469664(%ecx,%ebp,1),%ecx
+ addl %edi,%ecx
+ roll $16,%ecx
+ movl 40(%esi),%ebp
+ movl %edx,%edi
+ # R2 39
+ leal 3200236656(%ebx,%ebp,1),%ebx
+ addl %edx,%ecx
+ xorl %eax,%edi
+ xorl %ecx,%edi
+ movl 52(%esi),%ebp
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $23,%ebx
+ addl %ecx,%ebx
+ # R2 40
+ xorl %edx,%edi
+ xorl %ebx,%edi
+ leal 681279174(%eax,%ebp,1),%eax
+ addl %edi,%eax
+ roll $4,%eax
+ movl (%esi),%ebp
+ movl %ebx,%edi
+ # R2 41
+ leal 3936430074(%edx,%ebp,1),%edx
+ addl %ebx,%eax
+ xorl %ecx,%edi
+ xorl %eax,%edi
+ movl 12(%esi),%ebp
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $11,%edx
+ addl %eax,%edx
+ # R2 42
+ xorl %ebx,%edi
+ xorl %edx,%edi
+ leal 3572445317(%ecx,%ebp,1),%ecx
+ addl %edi,%ecx
+ roll $16,%ecx
+ movl 24(%esi),%ebp
+ movl %edx,%edi
+ # R2 43
+ leal 76029189(%ebx,%ebp,1),%ebx
+ addl %edx,%ecx
+ xorl %eax,%edi
+ xorl %ecx,%edi
+ movl 36(%esi),%ebp
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $23,%ebx
+ addl %ecx,%ebx
+ # R2 44
+ xorl %edx,%edi
+ xorl %ebx,%edi
+ leal 3654602809(%eax,%ebp,1),%eax
+ addl %edi,%eax
+ roll $4,%eax
+ movl 48(%esi),%ebp
+ movl %ebx,%edi
+ # R2 45
+ leal 3873151461(%edx,%ebp,1),%edx
+ addl %ebx,%eax
+ xorl %ecx,%edi
+ xorl %eax,%edi
+ movl 60(%esi),%ebp
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $11,%edx
+ addl %eax,%edx
+ # R2 46
+ xorl %ebx,%edi
+ xorl %edx,%edi
+ leal 530742520(%ecx,%ebp,1),%ecx
+ addl %edi,%ecx
+ roll $16,%ecx
+ movl 8(%esi),%ebp
+ movl %edx,%edi
+ # R2 47
+ leal 3299628645(%ebx,%ebp,1),%ebx
+ addl %edx,%ecx
+ xorl %eax,%edi
+ xorl %ecx,%edi
+ movl (%esi),%ebp
+ addl %edi,%ebx
+ movl $-1,%edi
+ roll $23,%ebx
+ addl %ecx,%ebx
+
+ # R3 section
+ # R3 48
+ xorl %edx,%edi
+ orl %ebx,%edi
+ leal 4096336452(%eax,%ebp,1),%eax
+ xorl %ecx,%edi
+ movl 28(%esi),%ebp
+ addl %edi,%eax
+ movl $-1,%edi
+ roll $6,%eax
+ xorl %ecx,%edi
+ addl %ebx,%eax
+ # R3 49
+ orl %eax,%edi
+ leal 1126891415(%edx,%ebp,1),%edx
+ xorl %ebx,%edi
+ movl 56(%esi),%ebp
+ addl %edi,%edx
+ movl $-1,%edi
+ roll $10,%edx
+ xorl %ebx,%edi
+ addl %eax,%edx
+ # R3 50
+ orl %edx,%edi
+ leal 2878612391(%ecx,%ebp,1),%ecx
+ xorl %eax,%edi
+ movl 20(%esi),%ebp
+ addl %edi,%ecx
+ movl $-1,%edi
+ roll $15,%ecx
+ xorl %eax,%edi
+ addl %edx,%ecx
+ # R3 51
+ orl %ecx,%edi
+ leal 4237533241(%ebx,%ebp,1),%ebx
+ xorl %edx,%edi
+ movl 48(%esi),%ebp
+ addl %edi,%ebx
+ movl $-1,%edi
+ roll $21,%ebx
+ xorl %edx,%edi
+ addl %ecx,%ebx
+ # R3 52
+ orl %ebx,%edi
+ leal 1700485571(%eax,%ebp,1),%eax
+ xorl %ecx,%edi
+ movl 12(%esi),%ebp
+ addl %edi,%eax
+ movl $-1,%edi
+ roll $6,%eax
+ xorl %ecx,%edi
+ addl %ebx,%eax
+ # R3 53
+ orl %eax,%edi
+ leal 2399980690(%edx,%ebp,1),%edx
+ xorl %ebx,%edi
+ movl 40(%esi),%ebp
+ addl %edi,%edx
+ movl $-1,%edi
+ roll $10,%edx
+ xorl %ebx,%edi
+ addl %eax,%edx
+ # R3 54
+ orl %edx,%edi
+ leal 4293915773(%ecx,%ebp,1),%ecx
+ xorl %eax,%edi
+ movl 4(%esi),%ebp
+ addl %edi,%ecx
+ movl $-1,%edi
+ roll $15,%ecx
+ xorl %eax,%edi
+ addl %edx,%ecx
+ # R3 55
+ orl %ecx,%edi
+ leal 2240044497(%ebx,%ebp,1),%ebx
+ xorl %edx,%edi
+ movl 32(%esi),%ebp
+ addl %edi,%ebx
+ movl $-1,%edi
+ roll $21,%ebx
+ xorl %edx,%edi
+ addl %ecx,%ebx
+ # R3 56
+ orl %ebx,%edi
+ leal 1873313359(%eax,%ebp,1),%eax
+ xorl %ecx,%edi
+ movl 60(%esi),%ebp
+ addl %edi,%eax
+ movl $-1,%edi
+ roll $6,%eax
+ xorl %ecx,%edi
+ addl %ebx,%eax
+ # R3 57
+ orl %eax,%edi
+ leal 4264355552(%edx,%ebp,1),%edx
+ xorl %ebx,%edi
+ movl 24(%esi),%ebp
+ addl %edi,%edx
+ movl $-1,%edi
+ roll $10,%edx
+ xorl %ebx,%edi
+ addl %eax,%edx
+ # R3 58
+ orl %edx,%edi
+ leal 2734768916(%ecx,%ebp,1),%ecx
+ xorl %eax,%edi
+ movl 52(%esi),%ebp
+ addl %edi,%ecx
+ movl $-1,%edi
+ roll $15,%ecx
+ xorl %eax,%edi
+ addl %edx,%ecx
+ # R3 59
+ orl %ecx,%edi
+ leal 1309151649(%ebx,%ebp,1),%ebx
+ xorl %edx,%edi
+ movl 16(%esi),%ebp
+ addl %edi,%ebx
+ movl $-1,%edi
+ roll $21,%ebx
+ xorl %edx,%edi
+ addl %ecx,%ebx
+ # R3 60
+ orl %ebx,%edi
+ leal 4149444226(%eax,%ebp,1),%eax
+ xorl %ecx,%edi
+ movl 44(%esi),%ebp
+ addl %edi,%eax
+ movl $-1,%edi
+ roll $6,%eax
+ xorl %ecx,%edi
+ addl %ebx,%eax
+ # R3 61
+ orl %eax,%edi
+ leal 3174756917(%edx,%ebp,1),%edx
+ xorl %ebx,%edi
+ movl 8(%esi),%ebp
+ addl %edi,%edx
+ movl $-1,%edi
+ roll $10,%edx
+ xorl %ebx,%edi
+ addl %eax,%edx
+ # R3 62
+ orl %edx,%edi
+ leal 718787259(%ecx,%ebp,1),%ecx
+ xorl %eax,%edi
+ movl 36(%esi),%ebp
+ addl %edi,%ecx
+ movl $-1,%edi
+ roll $15,%ecx
+ xorl %eax,%edi
+ addl %edx,%ecx
+ # R3 63
+ orl %ecx,%edi
+ leal 3951481745(%ebx,%ebp,1),%ebx
+ xorl %edx,%edi
+ movl 24(%esp),%ebp
+ addl %edi,%ebx
+ addl $64,%esi
+ roll $21,%ebx
+ movl (%ebp),%edi
+ addl %ecx,%ebx
+ addl %edi,%eax
+ movl 4(%ebp),%edi
+ addl %edi,%ebx
+ movl 8(%ebp),%edi
+ addl %edi,%ecx
+ movl 12(%ebp),%edi
+ addl %edi,%edx
+ movl %eax,(%ebp)
+ movl %ebx,4(%ebp)
+ movl (%esp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ cmpl %esi,%edi
+ jae L000start
+ popl %eax
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/md5-586-linux.S b/gen/bcm/md5-586-linux.S
new file mode 100644
index 0000000..a297f2b
--- /dev/null
+++ b/gen/bcm/md5-586-linux.S
@@ -0,0 +1,686 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl md5_block_asm_data_order
+.hidden md5_block_asm_data_order
+.type md5_block_asm_data_order,@function
+.align 16
+md5_block_asm_data_order:
+.L_md5_block_asm_data_order_begin:
+ pushl %esi
+ pushl %edi
+ movl 12(%esp),%edi
+ movl 16(%esp),%esi
+ movl 20(%esp),%ecx
+ pushl %ebp
+ shll $6,%ecx
+ pushl %ebx
+ addl %esi,%ecx
+ subl $64,%ecx
+ movl (%edi),%eax
+ pushl %ecx
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+.L000start:
+
+
+ movl %ecx,%edi
+ movl (%esi),%ebp
+
+ xorl %edx,%edi
+ andl %ebx,%edi
+ leal 3614090360(%eax,%ebp,1),%eax
+ xorl %edx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $7,%eax
+ movl 4(%esi),%ebp
+ addl %ebx,%eax
+
+ xorl %ecx,%edi
+ andl %eax,%edi
+ leal 3905402710(%edx,%ebp,1),%edx
+ xorl %ecx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $12,%edx
+ movl 8(%esi),%ebp
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ andl %edx,%edi
+ leal 606105819(%ecx,%ebp,1),%ecx
+ xorl %ebx,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $17,%ecx
+ movl 12(%esi),%ebp
+ addl %edx,%ecx
+
+ xorl %eax,%edi
+ andl %ecx,%edi
+ leal 3250441966(%ebx,%ebp,1),%ebx
+ xorl %eax,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $22,%ebx
+ movl 16(%esi),%ebp
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ andl %ebx,%edi
+ leal 4118548399(%eax,%ebp,1),%eax
+ xorl %edx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $7,%eax
+ movl 20(%esi),%ebp
+ addl %ebx,%eax
+
+ xorl %ecx,%edi
+ andl %eax,%edi
+ leal 1200080426(%edx,%ebp,1),%edx
+ xorl %ecx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $12,%edx
+ movl 24(%esi),%ebp
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ andl %edx,%edi
+ leal 2821735955(%ecx,%ebp,1),%ecx
+ xorl %ebx,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $17,%ecx
+ movl 28(%esi),%ebp
+ addl %edx,%ecx
+
+ xorl %eax,%edi
+ andl %ecx,%edi
+ leal 4249261313(%ebx,%ebp,1),%ebx
+ xorl %eax,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $22,%ebx
+ movl 32(%esi),%ebp
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ andl %ebx,%edi
+ leal 1770035416(%eax,%ebp,1),%eax
+ xorl %edx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $7,%eax
+ movl 36(%esi),%ebp
+ addl %ebx,%eax
+
+ xorl %ecx,%edi
+ andl %eax,%edi
+ leal 2336552879(%edx,%ebp,1),%edx
+ xorl %ecx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $12,%edx
+ movl 40(%esi),%ebp
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ andl %edx,%edi
+ leal 4294925233(%ecx,%ebp,1),%ecx
+ xorl %ebx,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $17,%ecx
+ movl 44(%esi),%ebp
+ addl %edx,%ecx
+
+ xorl %eax,%edi
+ andl %ecx,%edi
+ leal 2304563134(%ebx,%ebp,1),%ebx
+ xorl %eax,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $22,%ebx
+ movl 48(%esi),%ebp
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ andl %ebx,%edi
+ leal 1804603682(%eax,%ebp,1),%eax
+ xorl %edx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $7,%eax
+ movl 52(%esi),%ebp
+ addl %ebx,%eax
+
+ xorl %ecx,%edi
+ andl %eax,%edi
+ leal 4254626195(%edx,%ebp,1),%edx
+ xorl %ecx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $12,%edx
+ movl 56(%esi),%ebp
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ andl %edx,%edi
+ leal 2792965006(%ecx,%ebp,1),%ecx
+ xorl %ebx,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $17,%ecx
+ movl 60(%esi),%ebp
+ addl %edx,%ecx
+
+ xorl %eax,%edi
+ andl %ecx,%edi
+ leal 1236535329(%ebx,%ebp,1),%ebx
+ xorl %eax,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $22,%ebx
+ movl 4(%esi),%ebp
+ addl %ecx,%ebx
+
+
+
+ leal 4129170786(%eax,%ebp,1),%eax
+ xorl %ebx,%edi
+ andl %edx,%edi
+ movl 24(%esi),%ebp
+ xorl %ecx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%eax
+ addl %ebx,%eax
+
+ leal 3225465664(%edx,%ebp,1),%edx
+ xorl %eax,%edi
+ andl %ecx,%edi
+ movl 44(%esi),%ebp
+ xorl %ebx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $9,%edx
+ addl %eax,%edx
+
+ leal 643717713(%ecx,%ebp,1),%ecx
+ xorl %edx,%edi
+ andl %ebx,%edi
+ movl (%esi),%ebp
+ xorl %eax,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $14,%ecx
+ addl %edx,%ecx
+
+ leal 3921069994(%ebx,%ebp,1),%ebx
+ xorl %ecx,%edi
+ andl %eax,%edi
+ movl 20(%esi),%ebp
+ xorl %edx,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $20,%ebx
+ addl %ecx,%ebx
+
+ leal 3593408605(%eax,%ebp,1),%eax
+ xorl %ebx,%edi
+ andl %edx,%edi
+ movl 40(%esi),%ebp
+ xorl %ecx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%eax
+ addl %ebx,%eax
+
+ leal 38016083(%edx,%ebp,1),%edx
+ xorl %eax,%edi
+ andl %ecx,%edi
+ movl 60(%esi),%ebp
+ xorl %ebx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $9,%edx
+ addl %eax,%edx
+
+ leal 3634488961(%ecx,%ebp,1),%ecx
+ xorl %edx,%edi
+ andl %ebx,%edi
+ movl 16(%esi),%ebp
+ xorl %eax,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $14,%ecx
+ addl %edx,%ecx
+
+ leal 3889429448(%ebx,%ebp,1),%ebx
+ xorl %ecx,%edi
+ andl %eax,%edi
+ movl 36(%esi),%ebp
+ xorl %edx,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $20,%ebx
+ addl %ecx,%ebx
+
+ leal 568446438(%eax,%ebp,1),%eax
+ xorl %ebx,%edi
+ andl %edx,%edi
+ movl 56(%esi),%ebp
+ xorl %ecx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%eax
+ addl %ebx,%eax
+
+ leal 3275163606(%edx,%ebp,1),%edx
+ xorl %eax,%edi
+ andl %ecx,%edi
+ movl 12(%esi),%ebp
+ xorl %ebx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $9,%edx
+ addl %eax,%edx
+
+ leal 4107603335(%ecx,%ebp,1),%ecx
+ xorl %edx,%edi
+ andl %ebx,%edi
+ movl 32(%esi),%ebp
+ xorl %eax,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $14,%ecx
+ addl %edx,%ecx
+
+ leal 1163531501(%ebx,%ebp,1),%ebx
+ xorl %ecx,%edi
+ andl %eax,%edi
+ movl 52(%esi),%ebp
+ xorl %edx,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $20,%ebx
+ addl %ecx,%ebx
+
+ leal 2850285829(%eax,%ebp,1),%eax
+ xorl %ebx,%edi
+ andl %edx,%edi
+ movl 8(%esi),%ebp
+ xorl %ecx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%eax
+ addl %ebx,%eax
+
+ leal 4243563512(%edx,%ebp,1),%edx
+ xorl %eax,%edi
+ andl %ecx,%edi
+ movl 28(%esi),%ebp
+ xorl %ebx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $9,%edx
+ addl %eax,%edx
+
+ leal 1735328473(%ecx,%ebp,1),%ecx
+ xorl %edx,%edi
+ andl %ebx,%edi
+ movl 48(%esi),%ebp
+ xorl %eax,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $14,%ecx
+ addl %edx,%ecx
+
+ leal 2368359562(%ebx,%ebp,1),%ebx
+ xorl %ecx,%edi
+ andl %eax,%edi
+ movl 20(%esi),%ebp
+ xorl %edx,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $20,%ebx
+ addl %ecx,%ebx
+
+
+
+ xorl %edx,%edi
+ xorl %ebx,%edi
+ leal 4294588738(%eax,%ebp,1),%eax
+ addl %edi,%eax
+ roll $4,%eax
+ movl 32(%esi),%ebp
+ movl %ebx,%edi
+
+ leal 2272392833(%edx,%ebp,1),%edx
+ addl %ebx,%eax
+ xorl %ecx,%edi
+ xorl %eax,%edi
+ movl 44(%esi),%ebp
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $11,%edx
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ xorl %edx,%edi
+ leal 1839030562(%ecx,%ebp,1),%ecx
+ addl %edi,%ecx
+ roll $16,%ecx
+ movl 56(%esi),%ebp
+ movl %edx,%edi
+
+ leal 4259657740(%ebx,%ebp,1),%ebx
+ addl %edx,%ecx
+ xorl %eax,%edi
+ xorl %ecx,%edi
+ movl 4(%esi),%ebp
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $23,%ebx
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ xorl %ebx,%edi
+ leal 2763975236(%eax,%ebp,1),%eax
+ addl %edi,%eax
+ roll $4,%eax
+ movl 16(%esi),%ebp
+ movl %ebx,%edi
+
+ leal 1272893353(%edx,%ebp,1),%edx
+ addl %ebx,%eax
+ xorl %ecx,%edi
+ xorl %eax,%edi
+ movl 28(%esi),%ebp
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $11,%edx
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ xorl %edx,%edi
+ leal 4139469664(%ecx,%ebp,1),%ecx
+ addl %edi,%ecx
+ roll $16,%ecx
+ movl 40(%esi),%ebp
+ movl %edx,%edi
+
+ leal 3200236656(%ebx,%ebp,1),%ebx
+ addl %edx,%ecx
+ xorl %eax,%edi
+ xorl %ecx,%edi
+ movl 52(%esi),%ebp
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $23,%ebx
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ xorl %ebx,%edi
+ leal 681279174(%eax,%ebp,1),%eax
+ addl %edi,%eax
+ roll $4,%eax
+ movl (%esi),%ebp
+ movl %ebx,%edi
+
+ leal 3936430074(%edx,%ebp,1),%edx
+ addl %ebx,%eax
+ xorl %ecx,%edi
+ xorl %eax,%edi
+ movl 12(%esi),%ebp
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $11,%edx
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ xorl %edx,%edi
+ leal 3572445317(%ecx,%ebp,1),%ecx
+ addl %edi,%ecx
+ roll $16,%ecx
+ movl 24(%esi),%ebp
+ movl %edx,%edi
+
+ leal 76029189(%ebx,%ebp,1),%ebx
+ addl %edx,%ecx
+ xorl %eax,%edi
+ xorl %ecx,%edi
+ movl 36(%esi),%ebp
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $23,%ebx
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ xorl %ebx,%edi
+ leal 3654602809(%eax,%ebp,1),%eax
+ addl %edi,%eax
+ roll $4,%eax
+ movl 48(%esi),%ebp
+ movl %ebx,%edi
+
+ leal 3873151461(%edx,%ebp,1),%edx
+ addl %ebx,%eax
+ xorl %ecx,%edi
+ xorl %eax,%edi
+ movl 60(%esi),%ebp
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $11,%edx
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ xorl %edx,%edi
+ leal 530742520(%ecx,%ebp,1),%ecx
+ addl %edi,%ecx
+ roll $16,%ecx
+ movl 8(%esi),%ebp
+ movl %edx,%edi
+
+ leal 3299628645(%ebx,%ebp,1),%ebx
+ addl %edx,%ecx
+ xorl %eax,%edi
+ xorl %ecx,%edi
+ movl (%esi),%ebp
+ addl %edi,%ebx
+ movl $-1,%edi
+ roll $23,%ebx
+ addl %ecx,%ebx
+
+
+
+ xorl %edx,%edi
+ orl %ebx,%edi
+ leal 4096336452(%eax,%ebp,1),%eax
+ xorl %ecx,%edi
+ movl 28(%esi),%ebp
+ addl %edi,%eax
+ movl $-1,%edi
+ roll $6,%eax
+ xorl %ecx,%edi
+ addl %ebx,%eax
+
+ orl %eax,%edi
+ leal 1126891415(%edx,%ebp,1),%edx
+ xorl %ebx,%edi
+ movl 56(%esi),%ebp
+ addl %edi,%edx
+ movl $-1,%edi
+ roll $10,%edx
+ xorl %ebx,%edi
+ addl %eax,%edx
+
+ orl %edx,%edi
+ leal 2878612391(%ecx,%ebp,1),%ecx
+ xorl %eax,%edi
+ movl 20(%esi),%ebp
+ addl %edi,%ecx
+ movl $-1,%edi
+ roll $15,%ecx
+ xorl %eax,%edi
+ addl %edx,%ecx
+
+ orl %ecx,%edi
+ leal 4237533241(%ebx,%ebp,1),%ebx
+ xorl %edx,%edi
+ movl 48(%esi),%ebp
+ addl %edi,%ebx
+ movl $-1,%edi
+ roll $21,%ebx
+ xorl %edx,%edi
+ addl %ecx,%ebx
+
+ orl %ebx,%edi
+ leal 1700485571(%eax,%ebp,1),%eax
+ xorl %ecx,%edi
+ movl 12(%esi),%ebp
+ addl %edi,%eax
+ movl $-1,%edi
+ roll $6,%eax
+ xorl %ecx,%edi
+ addl %ebx,%eax
+
+ orl %eax,%edi
+ leal 2399980690(%edx,%ebp,1),%edx
+ xorl %ebx,%edi
+ movl 40(%esi),%ebp
+ addl %edi,%edx
+ movl $-1,%edi
+ roll $10,%edx
+ xorl %ebx,%edi
+ addl %eax,%edx
+
+ orl %edx,%edi
+ leal 4293915773(%ecx,%ebp,1),%ecx
+ xorl %eax,%edi
+ movl 4(%esi),%ebp
+ addl %edi,%ecx
+ movl $-1,%edi
+ roll $15,%ecx
+ xorl %eax,%edi
+ addl %edx,%ecx
+
+ orl %ecx,%edi
+ leal 2240044497(%ebx,%ebp,1),%ebx
+ xorl %edx,%edi
+ movl 32(%esi),%ebp
+ addl %edi,%ebx
+ movl $-1,%edi
+ roll $21,%ebx
+ xorl %edx,%edi
+ addl %ecx,%ebx
+
+ orl %ebx,%edi
+ leal 1873313359(%eax,%ebp,1),%eax
+ xorl %ecx,%edi
+ movl 60(%esi),%ebp
+ addl %edi,%eax
+ movl $-1,%edi
+ roll $6,%eax
+ xorl %ecx,%edi
+ addl %ebx,%eax
+
+ orl %eax,%edi
+ leal 4264355552(%edx,%ebp,1),%edx
+ xorl %ebx,%edi
+ movl 24(%esi),%ebp
+ addl %edi,%edx
+ movl $-1,%edi
+ roll $10,%edx
+ xorl %ebx,%edi
+ addl %eax,%edx
+
+ orl %edx,%edi
+ leal 2734768916(%ecx,%ebp,1),%ecx
+ xorl %eax,%edi
+ movl 52(%esi),%ebp
+ addl %edi,%ecx
+ movl $-1,%edi
+ roll $15,%ecx
+ xorl %eax,%edi
+ addl %edx,%ecx
+
+ orl %ecx,%edi
+ leal 1309151649(%ebx,%ebp,1),%ebx
+ xorl %edx,%edi
+ movl 16(%esi),%ebp
+ addl %edi,%ebx
+ movl $-1,%edi
+ roll $21,%ebx
+ xorl %edx,%edi
+ addl %ecx,%ebx
+
+ orl %ebx,%edi
+ leal 4149444226(%eax,%ebp,1),%eax
+ xorl %ecx,%edi
+ movl 44(%esi),%ebp
+ addl %edi,%eax
+ movl $-1,%edi
+ roll $6,%eax
+ xorl %ecx,%edi
+ addl %ebx,%eax
+
+ orl %eax,%edi
+ leal 3174756917(%edx,%ebp,1),%edx
+ xorl %ebx,%edi
+ movl 8(%esi),%ebp
+ addl %edi,%edx
+ movl $-1,%edi
+ roll $10,%edx
+ xorl %ebx,%edi
+ addl %eax,%edx
+
+ orl %edx,%edi
+ leal 718787259(%ecx,%ebp,1),%ecx
+ xorl %eax,%edi
+ movl 36(%esi),%ebp
+ addl %edi,%ecx
+ movl $-1,%edi
+ roll $15,%ecx
+ xorl %eax,%edi
+ addl %edx,%ecx
+
+ orl %ecx,%edi
+ leal 3951481745(%ebx,%ebp,1),%ebx
+ xorl %edx,%edi
+ movl 24(%esp),%ebp
+ addl %edi,%ebx
+ addl $64,%esi
+ roll $21,%ebx
+ movl (%ebp),%edi
+ addl %ecx,%ebx
+ addl %edi,%eax
+ movl 4(%ebp),%edi
+ addl %edi,%ebx
+ movl 8(%ebp),%edi
+ addl %edi,%ecx
+ movl 12(%ebp),%edi
+ addl %edi,%edx
+ movl %eax,(%ebp)
+ movl %ebx,4(%ebp)
+ movl (%esp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ cmpl %esi,%edi
+ jae .L000start
+ popl %eax
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size md5_block_asm_data_order,.-.L_md5_block_asm_data_order_begin
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/md5-586-win.asm b/gen/bcm/md5-586-win.asm
new file mode 100644
index 0000000..25592b8
--- /dev/null
+++ b/gen/bcm/md5-586-win.asm
@@ -0,0 +1,694 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section code use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section .text code align=64
+%else
+section .text code
+%endif
+global _md5_block_asm_data_order
+align 16
+_md5_block_asm_data_order:
+L$_md5_block_asm_data_order_begin:
+ push esi
+ push edi
+ mov edi,DWORD [12+esp]
+ mov esi,DWORD [16+esp]
+ mov ecx,DWORD [20+esp]
+ push ebp
+ shl ecx,6
+ push ebx
+ add ecx,esi
+ sub ecx,64
+ mov eax,DWORD [edi]
+ push ecx
+ mov ebx,DWORD [4+edi]
+ mov ecx,DWORD [8+edi]
+ mov edx,DWORD [12+edi]
+L$000start:
+ ;
+ ; R0 section
+ mov edi,ecx
+ mov ebp,DWORD [esi]
+ ; R0 0
+ xor edi,edx
+ and edi,ebx
+ lea eax,[3614090360+ebp*1+eax]
+ xor edi,edx
+ add eax,edi
+ mov edi,ebx
+ rol eax,7
+ mov ebp,DWORD [4+esi]
+ add eax,ebx
+ ; R0 1
+ xor edi,ecx
+ and edi,eax
+ lea edx,[3905402710+ebp*1+edx]
+ xor edi,ecx
+ add edx,edi
+ mov edi,eax
+ rol edx,12
+ mov ebp,DWORD [8+esi]
+ add edx,eax
+ ; R0 2
+ xor edi,ebx
+ and edi,edx
+ lea ecx,[606105819+ebp*1+ecx]
+ xor edi,ebx
+ add ecx,edi
+ mov edi,edx
+ rol ecx,17
+ mov ebp,DWORD [12+esi]
+ add ecx,edx
+ ; R0 3
+ xor edi,eax
+ and edi,ecx
+ lea ebx,[3250441966+ebp*1+ebx]
+ xor edi,eax
+ add ebx,edi
+ mov edi,ecx
+ rol ebx,22
+ mov ebp,DWORD [16+esi]
+ add ebx,ecx
+ ; R0 4
+ xor edi,edx
+ and edi,ebx
+ lea eax,[4118548399+ebp*1+eax]
+ xor edi,edx
+ add eax,edi
+ mov edi,ebx
+ rol eax,7
+ mov ebp,DWORD [20+esi]
+ add eax,ebx
+ ; R0 5
+ xor edi,ecx
+ and edi,eax
+ lea edx,[1200080426+ebp*1+edx]
+ xor edi,ecx
+ add edx,edi
+ mov edi,eax
+ rol edx,12
+ mov ebp,DWORD [24+esi]
+ add edx,eax
+ ; R0 6
+ xor edi,ebx
+ and edi,edx
+ lea ecx,[2821735955+ebp*1+ecx]
+ xor edi,ebx
+ add ecx,edi
+ mov edi,edx
+ rol ecx,17
+ mov ebp,DWORD [28+esi]
+ add ecx,edx
+ ; R0 7
+ xor edi,eax
+ and edi,ecx
+ lea ebx,[4249261313+ebp*1+ebx]
+ xor edi,eax
+ add ebx,edi
+ mov edi,ecx
+ rol ebx,22
+ mov ebp,DWORD [32+esi]
+ add ebx,ecx
+ ; R0 8
+ xor edi,edx
+ and edi,ebx
+ lea eax,[1770035416+ebp*1+eax]
+ xor edi,edx
+ add eax,edi
+ mov edi,ebx
+ rol eax,7
+ mov ebp,DWORD [36+esi]
+ add eax,ebx
+ ; R0 9
+ xor edi,ecx
+ and edi,eax
+ lea edx,[2336552879+ebp*1+edx]
+ xor edi,ecx
+ add edx,edi
+ mov edi,eax
+ rol edx,12
+ mov ebp,DWORD [40+esi]
+ add edx,eax
+ ; R0 10
+ xor edi,ebx
+ and edi,edx
+ lea ecx,[4294925233+ebp*1+ecx]
+ xor edi,ebx
+ add ecx,edi
+ mov edi,edx
+ rol ecx,17
+ mov ebp,DWORD [44+esi]
+ add ecx,edx
+ ; R0 11
+ xor edi,eax
+ and edi,ecx
+ lea ebx,[2304563134+ebp*1+ebx]
+ xor edi,eax
+ add ebx,edi
+ mov edi,ecx
+ rol ebx,22
+ mov ebp,DWORD [48+esi]
+ add ebx,ecx
+ ; R0 12
+ xor edi,edx
+ and edi,ebx
+ lea eax,[1804603682+ebp*1+eax]
+ xor edi,edx
+ add eax,edi
+ mov edi,ebx
+ rol eax,7
+ mov ebp,DWORD [52+esi]
+ add eax,ebx
+ ; R0 13
+ xor edi,ecx
+ and edi,eax
+ lea edx,[4254626195+ebp*1+edx]
+ xor edi,ecx
+ add edx,edi
+ mov edi,eax
+ rol edx,12
+ mov ebp,DWORD [56+esi]
+ add edx,eax
+ ; R0 14
+ xor edi,ebx
+ and edi,edx
+ lea ecx,[2792965006+ebp*1+ecx]
+ xor edi,ebx
+ add ecx,edi
+ mov edi,edx
+ rol ecx,17
+ mov ebp,DWORD [60+esi]
+ add ecx,edx
+ ; R0 15
+ xor edi,eax
+ and edi,ecx
+ lea ebx,[1236535329+ebp*1+ebx]
+ xor edi,eax
+ add ebx,edi
+ mov edi,ecx
+ rol ebx,22
+ mov ebp,DWORD [4+esi]
+ add ebx,ecx
+ ;
+ ; R1 section
+ ; R1 16
+ lea eax,[4129170786+ebp*1+eax]
+ xor edi,ebx
+ and edi,edx
+ mov ebp,DWORD [24+esi]
+ xor edi,ecx
+ add eax,edi
+ mov edi,ebx
+ rol eax,5
+ add eax,ebx
+ ; R1 17
+ lea edx,[3225465664+ebp*1+edx]
+ xor edi,eax
+ and edi,ecx
+ mov ebp,DWORD [44+esi]
+ xor edi,ebx
+ add edx,edi
+ mov edi,eax
+ rol edx,9
+ add edx,eax
+ ; R1 18
+ lea ecx,[643717713+ebp*1+ecx]
+ xor edi,edx
+ and edi,ebx
+ mov ebp,DWORD [esi]
+ xor edi,eax
+ add ecx,edi
+ mov edi,edx
+ rol ecx,14
+ add ecx,edx
+ ; R1 19
+ lea ebx,[3921069994+ebp*1+ebx]
+ xor edi,ecx
+ and edi,eax
+ mov ebp,DWORD [20+esi]
+ xor edi,edx
+ add ebx,edi
+ mov edi,ecx
+ rol ebx,20
+ add ebx,ecx
+ ; R1 20
+ lea eax,[3593408605+ebp*1+eax]
+ xor edi,ebx
+ and edi,edx
+ mov ebp,DWORD [40+esi]
+ xor edi,ecx
+ add eax,edi
+ mov edi,ebx
+ rol eax,5
+ add eax,ebx
+ ; R1 21
+ lea edx,[38016083+ebp*1+edx]
+ xor edi,eax
+ and edi,ecx
+ mov ebp,DWORD [60+esi]
+ xor edi,ebx
+ add edx,edi
+ mov edi,eax
+ rol edx,9
+ add edx,eax
+ ; R1 22
+ lea ecx,[3634488961+ebp*1+ecx]
+ xor edi,edx
+ and edi,ebx
+ mov ebp,DWORD [16+esi]
+ xor edi,eax
+ add ecx,edi
+ mov edi,edx
+ rol ecx,14
+ add ecx,edx
+ ; R1 23
+ lea ebx,[3889429448+ebp*1+ebx]
+ xor edi,ecx
+ and edi,eax
+ mov ebp,DWORD [36+esi]
+ xor edi,edx
+ add ebx,edi
+ mov edi,ecx
+ rol ebx,20
+ add ebx,ecx
+ ; R1 24
+ lea eax,[568446438+ebp*1+eax]
+ xor edi,ebx
+ and edi,edx
+ mov ebp,DWORD [56+esi]
+ xor edi,ecx
+ add eax,edi
+ mov edi,ebx
+ rol eax,5
+ add eax,ebx
+ ; R1 25
+ lea edx,[3275163606+ebp*1+edx]
+ xor edi,eax
+ and edi,ecx
+ mov ebp,DWORD [12+esi]
+ xor edi,ebx
+ add edx,edi
+ mov edi,eax
+ rol edx,9
+ add edx,eax
+ ; R1 26
+ lea ecx,[4107603335+ebp*1+ecx]
+ xor edi,edx
+ and edi,ebx
+ mov ebp,DWORD [32+esi]
+ xor edi,eax
+ add ecx,edi
+ mov edi,edx
+ rol ecx,14
+ add ecx,edx
+ ; R1 27
+ lea ebx,[1163531501+ebp*1+ebx]
+ xor edi,ecx
+ and edi,eax
+ mov ebp,DWORD [52+esi]
+ xor edi,edx
+ add ebx,edi
+ mov edi,ecx
+ rol ebx,20
+ add ebx,ecx
+ ; R1 28
+ lea eax,[2850285829+ebp*1+eax]
+ xor edi,ebx
+ and edi,edx
+ mov ebp,DWORD [8+esi]
+ xor edi,ecx
+ add eax,edi
+ mov edi,ebx
+ rol eax,5
+ add eax,ebx
+ ; R1 29
+ lea edx,[4243563512+ebp*1+edx]
+ xor edi,eax
+ and edi,ecx
+ mov ebp,DWORD [28+esi]
+ xor edi,ebx
+ add edx,edi
+ mov edi,eax
+ rol edx,9
+ add edx,eax
+ ; R1 30
+ lea ecx,[1735328473+ebp*1+ecx]
+ xor edi,edx
+ and edi,ebx
+ mov ebp,DWORD [48+esi]
+ xor edi,eax
+ add ecx,edi
+ mov edi,edx
+ rol ecx,14
+ add ecx,edx
+ ; R1 31
+ lea ebx,[2368359562+ebp*1+ebx]
+ xor edi,ecx
+ and edi,eax
+ mov ebp,DWORD [20+esi]
+ xor edi,edx
+ add ebx,edi
+ mov edi,ecx
+ rol ebx,20
+ add ebx,ecx
+ ;
+ ; R2 section
+ ; R2 32
+ xor edi,edx
+ xor edi,ebx
+ lea eax,[4294588738+ebp*1+eax]
+ add eax,edi
+ rol eax,4
+ mov ebp,DWORD [32+esi]
+ mov edi,ebx
+ ; R2 33
+ lea edx,[2272392833+ebp*1+edx]
+ add eax,ebx
+ xor edi,ecx
+ xor edi,eax
+ mov ebp,DWORD [44+esi]
+ add edx,edi
+ mov edi,eax
+ rol edx,11
+ add edx,eax
+ ; R2 34
+ xor edi,ebx
+ xor edi,edx
+ lea ecx,[1839030562+ebp*1+ecx]
+ add ecx,edi
+ rol ecx,16
+ mov ebp,DWORD [56+esi]
+ mov edi,edx
+ ; R2 35
+ lea ebx,[4259657740+ebp*1+ebx]
+ add ecx,edx
+ xor edi,eax
+ xor edi,ecx
+ mov ebp,DWORD [4+esi]
+ add ebx,edi
+ mov edi,ecx
+ rol ebx,23
+ add ebx,ecx
+ ; R2 36
+ xor edi,edx
+ xor edi,ebx
+ lea eax,[2763975236+ebp*1+eax]
+ add eax,edi
+ rol eax,4
+ mov ebp,DWORD [16+esi]
+ mov edi,ebx
+ ; R2 37
+ lea edx,[1272893353+ebp*1+edx]
+ add eax,ebx
+ xor edi,ecx
+ xor edi,eax
+ mov ebp,DWORD [28+esi]
+ add edx,edi
+ mov edi,eax
+ rol edx,11
+ add edx,eax
+ ; R2 38
+ xor edi,ebx
+ xor edi,edx
+ lea ecx,[4139469664+ebp*1+ecx]
+ add ecx,edi
+ rol ecx,16
+ mov ebp,DWORD [40+esi]
+ mov edi,edx
+ ; R2 39
+ lea ebx,[3200236656+ebp*1+ebx]
+ add ecx,edx
+ xor edi,eax
+ xor edi,ecx
+ mov ebp,DWORD [52+esi]
+ add ebx,edi
+ mov edi,ecx
+ rol ebx,23
+ add ebx,ecx
+ ; R2 40
+ xor edi,edx
+ xor edi,ebx
+ lea eax,[681279174+ebp*1+eax]
+ add eax,edi
+ rol eax,4
+ mov ebp,DWORD [esi]
+ mov edi,ebx
+ ; R2 41
+ lea edx,[3936430074+ebp*1+edx]
+ add eax,ebx
+ xor edi,ecx
+ xor edi,eax
+ mov ebp,DWORD [12+esi]
+ add edx,edi
+ mov edi,eax
+ rol edx,11
+ add edx,eax
+ ; R2 42
+ xor edi,ebx
+ xor edi,edx
+ lea ecx,[3572445317+ebp*1+ecx]
+ add ecx,edi
+ rol ecx,16
+ mov ebp,DWORD [24+esi]
+ mov edi,edx
+ ; R2 43
+ lea ebx,[76029189+ebp*1+ebx]
+ add ecx,edx
+ xor edi,eax
+ xor edi,ecx
+ mov ebp,DWORD [36+esi]
+ add ebx,edi
+ mov edi,ecx
+ rol ebx,23
+ add ebx,ecx
+ ; R2 44
+ xor edi,edx
+ xor edi,ebx
+ lea eax,[3654602809+ebp*1+eax]
+ add eax,edi
+ rol eax,4
+ mov ebp,DWORD [48+esi]
+ mov edi,ebx
+ ; R2 45
+ lea edx,[3873151461+ebp*1+edx]
+ add eax,ebx
+ xor edi,ecx
+ xor edi,eax
+ mov ebp,DWORD [60+esi]
+ add edx,edi
+ mov edi,eax
+ rol edx,11
+ add edx,eax
+ ; R2 46
+ xor edi,ebx
+ xor edi,edx
+ lea ecx,[530742520+ebp*1+ecx]
+ add ecx,edi
+ rol ecx,16
+ mov ebp,DWORD [8+esi]
+ mov edi,edx
+ ; R2 47
+ lea ebx,[3299628645+ebp*1+ebx]
+ add ecx,edx
+ xor edi,eax
+ xor edi,ecx
+ mov ebp,DWORD [esi]
+ add ebx,edi
+ mov edi,-1
+ rol ebx,23
+ add ebx,ecx
+ ;
+ ; R3 section
+ ; R3 48
+ xor edi,edx
+ or edi,ebx
+ lea eax,[4096336452+ebp*1+eax]
+ xor edi,ecx
+ mov ebp,DWORD [28+esi]
+ add eax,edi
+ mov edi,-1
+ rol eax,6
+ xor edi,ecx
+ add eax,ebx
+ ; R3 49
+ or edi,eax
+ lea edx,[1126891415+ebp*1+edx]
+ xor edi,ebx
+ mov ebp,DWORD [56+esi]
+ add edx,edi
+ mov edi,-1
+ rol edx,10
+ xor edi,ebx
+ add edx,eax
+ ; R3 50
+ or edi,edx
+ lea ecx,[2878612391+ebp*1+ecx]
+ xor edi,eax
+ mov ebp,DWORD [20+esi]
+ add ecx,edi
+ mov edi,-1
+ rol ecx,15
+ xor edi,eax
+ add ecx,edx
+ ; R3 51
+ or edi,ecx
+ lea ebx,[4237533241+ebp*1+ebx]
+ xor edi,edx
+ mov ebp,DWORD [48+esi]
+ add ebx,edi
+ mov edi,-1
+ rol ebx,21
+ xor edi,edx
+ add ebx,ecx
+ ; R3 52
+ or edi,ebx
+ lea eax,[1700485571+ebp*1+eax]
+ xor edi,ecx
+ mov ebp,DWORD [12+esi]
+ add eax,edi
+ mov edi,-1
+ rol eax,6
+ xor edi,ecx
+ add eax,ebx
+ ; R3 53
+ or edi,eax
+ lea edx,[2399980690+ebp*1+edx]
+ xor edi,ebx
+ mov ebp,DWORD [40+esi]
+ add edx,edi
+ mov edi,-1
+ rol edx,10
+ xor edi,ebx
+ add edx,eax
+ ; R3 54
+ or edi,edx
+ lea ecx,[4293915773+ebp*1+ecx]
+ xor edi,eax
+ mov ebp,DWORD [4+esi]
+ add ecx,edi
+ mov edi,-1
+ rol ecx,15
+ xor edi,eax
+ add ecx,edx
+ ; R3 55
+ or edi,ecx
+ lea ebx,[2240044497+ebp*1+ebx]
+ xor edi,edx
+ mov ebp,DWORD [32+esi]
+ add ebx,edi
+ mov edi,-1
+ rol ebx,21
+ xor edi,edx
+ add ebx,ecx
+ ; R3 56
+ or edi,ebx
+ lea eax,[1873313359+ebp*1+eax]
+ xor edi,ecx
+ mov ebp,DWORD [60+esi]
+ add eax,edi
+ mov edi,-1
+ rol eax,6
+ xor edi,ecx
+ add eax,ebx
+ ; R3 57
+ or edi,eax
+ lea edx,[4264355552+ebp*1+edx]
+ xor edi,ebx
+ mov ebp,DWORD [24+esi]
+ add edx,edi
+ mov edi,-1
+ rol edx,10
+ xor edi,ebx
+ add edx,eax
+ ; R3 58
+ or edi,edx
+ lea ecx,[2734768916+ebp*1+ecx]
+ xor edi,eax
+ mov ebp,DWORD [52+esi]
+ add ecx,edi
+ mov edi,-1
+ rol ecx,15
+ xor edi,eax
+ add ecx,edx
+ ; R3 59
+ or edi,ecx
+ lea ebx,[1309151649+ebp*1+ebx]
+ xor edi,edx
+ mov ebp,DWORD [16+esi]
+ add ebx,edi
+ mov edi,-1
+ rol ebx,21
+ xor edi,edx
+ add ebx,ecx
+ ; R3 60
+ or edi,ebx
+ lea eax,[4149444226+ebp*1+eax]
+ xor edi,ecx
+ mov ebp,DWORD [44+esi]
+ add eax,edi
+ mov edi,-1
+ rol eax,6
+ xor edi,ecx
+ add eax,ebx
+ ; R3 61
+ or edi,eax
+ lea edx,[3174756917+ebp*1+edx]
+ xor edi,ebx
+ mov ebp,DWORD [8+esi]
+ add edx,edi
+ mov edi,-1
+ rol edx,10
+ xor edi,ebx
+ add edx,eax
+ ; R3 62
+ or edi,edx
+ lea ecx,[718787259+ebp*1+ecx]
+ xor edi,eax
+ mov ebp,DWORD [36+esi]
+ add ecx,edi
+ mov edi,-1
+ rol ecx,15
+ xor edi,eax
+ add ecx,edx
+ ; R3 63
+ or edi,ecx
+ lea ebx,[3951481745+ebp*1+ebx]
+ xor edi,edx
+ mov ebp,DWORD [24+esp]
+ add ebx,edi
+ add esi,64
+ rol ebx,21
+ mov edi,DWORD [ebp]
+ add ebx,ecx
+ add eax,edi
+ mov edi,DWORD [4+ebp]
+ add ebx,edi
+ mov edi,DWORD [8+ebp]
+ add ecx,edi
+ mov edi,DWORD [12+ebp]
+ add edx,edi
+ mov DWORD [ebp],eax
+ mov DWORD [4+ebp],ebx
+ mov edi,DWORD [esp]
+ mov DWORD [8+ebp],ecx
+ mov DWORD [12+ebp],edx
+ cmp edi,esi
+ jae NEAR L$000start
+ pop eax
+ pop ebx
+ pop ebp
+ pop edi
+ pop esi
+ ret
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/md5-x86_64-apple.S b/gen/bcm/md5-x86_64-apple.S
new file mode 100644
index 0000000..e4c0241
--- /dev/null
+++ b/gen/bcm/md5-x86_64-apple.S
@@ -0,0 +1,690 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text
+.p2align 4
+
+.globl _md5_block_asm_data_order
+.private_extern _md5_block_asm_data_order
+
+_md5_block_asm_data_order:
+
+_CET_ENDBR
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r14
+
+ pushq %r15
+
+L$prologue:
+
+
+
+
+ movq %rdi,%rbp
+ shlq $6,%rdx
+ leaq (%rsi,%rdx,1),%rdi
+ movl 0(%rbp),%eax
+ movl 4(%rbp),%ebx
+ movl 8(%rbp),%ecx
+ movl 12(%rbp),%edx
+
+
+
+
+
+
+
+ cmpq %rdi,%rsi
+ je L$end
+
+
+L$loop:
+ movl %eax,%r8d
+ movl %ebx,%r9d
+ movl %ecx,%r14d
+ movl %edx,%r15d
+ movl 0(%rsi),%r10d
+ movl %edx,%r11d
+ xorl %ecx,%r11d
+ leal -680876936(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 4(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal -389564586(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 8(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal 606105819(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 12(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal -1044525330(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 16(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ xorl %ecx,%r11d
+ leal -176418897(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 20(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal 1200080426(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 24(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal -1473231341(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 28(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal -45705983(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 32(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ xorl %ecx,%r11d
+ leal 1770035416(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 36(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal -1958414417(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 40(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal -42063(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 44(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal -1990404162(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 48(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ xorl %ecx,%r11d
+ leal 1804603682(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 52(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal -40341101(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 56(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal -1502002290(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 60(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal 1236535329(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 0(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ movl 4(%rsi),%r10d
+ movl %edx,%r11d
+ movl %edx,%r12d
+ notl %r11d
+ leal -165796510(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 24(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal -1069501632(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 44(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal 643717713(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 0(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal -373897302(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 20(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ notl %r11d
+ leal -701558691(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 40(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal 38016083(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 60(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal -660478335(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 16(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal -405537848(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 36(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ notl %r11d
+ leal 568446438(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 56(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal -1019803690(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 12(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal -187363961(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 32(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal 1163531501(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 52(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ notl %r11d
+ leal -1444681467(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 8(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal -51403784(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 28(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal 1735328473(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 48(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal -1926607734(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 0(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ movl 20(%rsi),%r10d
+ movl %ecx,%r11d
+ leal -378558(%rax,%r10,1),%eax
+ movl 32(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal -2022574463(%rdx,%r10,1),%edx
+ movl 44(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal 1839030562(%rcx,%r10,1),%ecx
+ movl 56(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal -35309556(%rbx,%r10,1),%ebx
+ movl 4(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ leal -1530992060(%rax,%r10,1),%eax
+ movl 16(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal 1272893353(%rdx,%r10,1),%edx
+ movl 28(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal -155497632(%rcx,%r10,1),%ecx
+ movl 40(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal -1094730640(%rbx,%r10,1),%ebx
+ movl 52(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ leal 681279174(%rax,%r10,1),%eax
+ movl 0(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal -358537222(%rdx,%r10,1),%edx
+ movl 12(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal -722521979(%rcx,%r10,1),%ecx
+ movl 24(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal 76029189(%rbx,%r10,1),%ebx
+ movl 36(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ leal -640364487(%rax,%r10,1),%eax
+ movl 48(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal -421815835(%rdx,%r10,1),%edx
+ movl 60(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal 530742520(%rcx,%r10,1),%ecx
+ movl 8(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal -995338651(%rbx,%r10,1),%ebx
+ movl 0(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ movl 0(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ xorl %edx,%r11d
+ leal -198630844(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 28(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal 1126891415(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 56(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal -1416354905(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 20(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal -57434055(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 48(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+ leal 1700485571(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 12(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal -1894986606(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 40(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal -1051523(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 4(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal -2054922799(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 32(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+ leal 1873313359(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 60(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal -30611744(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 24(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal -1560198380(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 52(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal 1309151649(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 16(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+ leal -145523070(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 44(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal -1120210379(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 8(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal 718787259(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 36(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal -343485551(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 0(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+
+ addl %r8d,%eax
+ addl %r9d,%ebx
+ addl %r14d,%ecx
+ addl %r15d,%edx
+
+
+ addq $64,%rsi
+ cmpq %rdi,%rsi
+ jb L$loop
+
+
+L$end:
+ movl %eax,0(%rbp)
+ movl %ebx,4(%rbp)
+ movl %ecx,8(%rbp)
+ movl %edx,12(%rbp)
+
+ movq (%rsp),%r15
+
+ movq 8(%rsp),%r14
+
+ movq 16(%rsp),%r12
+
+ movq 24(%rsp),%rbx
+
+ movq 32(%rsp),%rbp
+
+ addq $40,%rsp
+
+L$epilogue:
+ ret
+
+
+#endif
diff --git a/gen/bcm/md5-x86_64-linux.S b/gen/bcm/md5-x86_64-linux.S
new file mode 100644
index 0000000..7b93662
--- /dev/null
+++ b/gen/bcm/md5-x86_64-linux.S
@@ -0,0 +1,695 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text
+.align 16
+
+.globl md5_block_asm_data_order
+.hidden md5_block_asm_data_order
+.type md5_block_asm_data_order,@function
+md5_block_asm_data_order:
+.cfi_startproc
+_CET_ENDBR
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset r12,-32
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset r14,-40
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset r15,-48
+.Lprologue:
+
+
+
+
+ movq %rdi,%rbp
+ shlq $6,%rdx
+ leaq (%rsi,%rdx,1),%rdi
+ movl 0(%rbp),%eax
+ movl 4(%rbp),%ebx
+ movl 8(%rbp),%ecx
+ movl 12(%rbp),%edx
+
+
+
+
+
+
+
+ cmpq %rdi,%rsi
+ je .Lend
+
+
+.Lloop:
+ movl %eax,%r8d
+ movl %ebx,%r9d
+ movl %ecx,%r14d
+ movl %edx,%r15d
+ movl 0(%rsi),%r10d
+ movl %edx,%r11d
+ xorl %ecx,%r11d
+ leal -680876936(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 4(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal -389564586(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 8(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal 606105819(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 12(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal -1044525330(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 16(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ xorl %ecx,%r11d
+ leal -176418897(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 20(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal 1200080426(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 24(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal -1473231341(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 28(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal -45705983(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 32(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ xorl %ecx,%r11d
+ leal 1770035416(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 36(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal -1958414417(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 40(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal -42063(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 44(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal -1990404162(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 48(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ xorl %ecx,%r11d
+ leal 1804603682(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 52(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal -40341101(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 56(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal -1502002290(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 60(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal 1236535329(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 0(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ movl 4(%rsi),%r10d
+ movl %edx,%r11d
+ movl %edx,%r12d
+ notl %r11d
+ leal -165796510(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 24(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal -1069501632(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 44(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal 643717713(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 0(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal -373897302(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 20(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ notl %r11d
+ leal -701558691(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 40(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal 38016083(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 60(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal -660478335(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 16(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal -405537848(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 36(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ notl %r11d
+ leal 568446438(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 56(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal -1019803690(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 12(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal -187363961(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 32(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal 1163531501(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 52(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ notl %r11d
+ leal -1444681467(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 8(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal -51403784(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 28(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal 1735328473(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 48(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal -1926607734(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 0(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ movl 20(%rsi),%r10d
+ movl %ecx,%r11d
+ leal -378558(%rax,%r10,1),%eax
+ movl 32(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal -2022574463(%rdx,%r10,1),%edx
+ movl 44(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal 1839030562(%rcx,%r10,1),%ecx
+ movl 56(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal -35309556(%rbx,%r10,1),%ebx
+ movl 4(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ leal -1530992060(%rax,%r10,1),%eax
+ movl 16(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal 1272893353(%rdx,%r10,1),%edx
+ movl 28(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal -155497632(%rcx,%r10,1),%ecx
+ movl 40(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal -1094730640(%rbx,%r10,1),%ebx
+ movl 52(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ leal 681279174(%rax,%r10,1),%eax
+ movl 0(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal -358537222(%rdx,%r10,1),%edx
+ movl 12(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal -722521979(%rcx,%r10,1),%ecx
+ movl 24(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal 76029189(%rbx,%r10,1),%ebx
+ movl 36(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ leal -640364487(%rax,%r10,1),%eax
+ movl 48(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal -421815835(%rdx,%r10,1),%edx
+ movl 60(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal 530742520(%rcx,%r10,1),%ecx
+ movl 8(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal -995338651(%rbx,%r10,1),%ebx
+ movl 0(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ movl 0(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ xorl %edx,%r11d
+ leal -198630844(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 28(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal 1126891415(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 56(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal -1416354905(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 20(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal -57434055(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 48(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+ leal 1700485571(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 12(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal -1894986606(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 40(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal -1051523(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 4(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal -2054922799(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 32(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+ leal 1873313359(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 60(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal -30611744(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 24(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal -1560198380(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 52(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal 1309151649(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 16(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+ leal -145523070(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 44(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal -1120210379(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 8(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal 718787259(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 36(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal -343485551(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 0(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+
+ addl %r8d,%eax
+ addl %r9d,%ebx
+ addl %r14d,%ecx
+ addl %r15d,%edx
+
+
+ addq $64,%rsi
+ cmpq %rdi,%rsi
+ jb .Lloop
+
+
+.Lend:
+ movl %eax,0(%rbp)
+ movl %ebx,4(%rbp)
+ movl %ecx,8(%rbp)
+ movl %edx,12(%rbp)
+
+ movq (%rsp),%r15
+.cfi_restore r15
+ movq 8(%rsp),%r14
+.cfi_restore r14
+ movq 16(%rsp),%r12
+.cfi_restore r12
+ movq 24(%rsp),%rbx
+.cfi_restore rbx
+ movq 32(%rsp),%rbp
+.cfi_restore rbp
+ addq $40,%rsp
+.cfi_adjust_cfa_offset -40
+.Lepilogue:
+ ret
+.cfi_endproc
+.size md5_block_asm_data_order,.-md5_block_asm_data_order
+#endif
diff --git a/gen/bcm/md5-x86_64-win.asm b/gen/bcm/md5-x86_64-win.asm
new file mode 100644
index 0000000..f6c5b62
--- /dev/null
+++ b/gen/bcm/md5-x86_64-win.asm
@@ -0,0 +1,803 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .text code align=64
+
+ALIGN 16
+
+global md5_block_asm_data_order
+
+md5_block_asm_data_order:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_md5_block_asm_data_order:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ push rbp
+
+ push rbx
+
+ push r12
+
+ push r14
+
+ push r15
+
+$L$prologue:
+
+
+
+
+ mov rbp,rdi
+ shl rdx,6
+ lea rdi,[rdx*1+rsi]
+ mov eax,DWORD[rbp]
+ mov ebx,DWORD[4+rbp]
+ mov ecx,DWORD[8+rbp]
+ mov edx,DWORD[12+rbp]
+
+
+
+
+
+
+
+ cmp rsi,rdi
+ je NEAR $L$end
+
+
+$L$loop:
+ mov r8d,eax
+ mov r9d,ebx
+ mov r14d,ecx
+ mov r15d,edx
+ mov r10d,DWORD[rsi]
+ mov r11d,edx
+ xor r11d,ecx
+ lea eax,[((-680876936))+r10*1+rax]
+ and r11d,ebx
+ xor r11d,edx
+ mov r10d,DWORD[4+rsi]
+ add eax,r11d
+ rol eax,7
+ mov r11d,ecx
+ add eax,ebx
+ xor r11d,ebx
+ lea edx,[((-389564586))+r10*1+rdx]
+ and r11d,eax
+ xor r11d,ecx
+ mov r10d,DWORD[8+rsi]
+ add edx,r11d
+ rol edx,12
+ mov r11d,ebx
+ add edx,eax
+ xor r11d,eax
+ lea ecx,[606105819+r10*1+rcx]
+ and r11d,edx
+ xor r11d,ebx
+ mov r10d,DWORD[12+rsi]
+ add ecx,r11d
+ rol ecx,17
+ mov r11d,eax
+ add ecx,edx
+ xor r11d,edx
+ lea ebx,[((-1044525330))+r10*1+rbx]
+ and r11d,ecx
+ xor r11d,eax
+ mov r10d,DWORD[16+rsi]
+ add ebx,r11d
+ rol ebx,22
+ mov r11d,edx
+ add ebx,ecx
+ xor r11d,ecx
+ lea eax,[((-176418897))+r10*1+rax]
+ and r11d,ebx
+ xor r11d,edx
+ mov r10d,DWORD[20+rsi]
+ add eax,r11d
+ rol eax,7
+ mov r11d,ecx
+ add eax,ebx
+ xor r11d,ebx
+ lea edx,[1200080426+r10*1+rdx]
+ and r11d,eax
+ xor r11d,ecx
+ mov r10d,DWORD[24+rsi]
+ add edx,r11d
+ rol edx,12
+ mov r11d,ebx
+ add edx,eax
+ xor r11d,eax
+ lea ecx,[((-1473231341))+r10*1+rcx]
+ and r11d,edx
+ xor r11d,ebx
+ mov r10d,DWORD[28+rsi]
+ add ecx,r11d
+ rol ecx,17
+ mov r11d,eax
+ add ecx,edx
+ xor r11d,edx
+ lea ebx,[((-45705983))+r10*1+rbx]
+ and r11d,ecx
+ xor r11d,eax
+ mov r10d,DWORD[32+rsi]
+ add ebx,r11d
+ rol ebx,22
+ mov r11d,edx
+ add ebx,ecx
+ xor r11d,ecx
+ lea eax,[1770035416+r10*1+rax]
+ and r11d,ebx
+ xor r11d,edx
+ mov r10d,DWORD[36+rsi]
+ add eax,r11d
+ rol eax,7
+ mov r11d,ecx
+ add eax,ebx
+ xor r11d,ebx
+ lea edx,[((-1958414417))+r10*1+rdx]
+ and r11d,eax
+ xor r11d,ecx
+ mov r10d,DWORD[40+rsi]
+ add edx,r11d
+ rol edx,12
+ mov r11d,ebx
+ add edx,eax
+ xor r11d,eax
+ lea ecx,[((-42063))+r10*1+rcx]
+ and r11d,edx
+ xor r11d,ebx
+ mov r10d,DWORD[44+rsi]
+ add ecx,r11d
+ rol ecx,17
+ mov r11d,eax
+ add ecx,edx
+ xor r11d,edx
+ lea ebx,[((-1990404162))+r10*1+rbx]
+ and r11d,ecx
+ xor r11d,eax
+ mov r10d,DWORD[48+rsi]
+ add ebx,r11d
+ rol ebx,22
+ mov r11d,edx
+ add ebx,ecx
+ xor r11d,ecx
+ lea eax,[1804603682+r10*1+rax]
+ and r11d,ebx
+ xor r11d,edx
+ mov r10d,DWORD[52+rsi]
+ add eax,r11d
+ rol eax,7
+ mov r11d,ecx
+ add eax,ebx
+ xor r11d,ebx
+ lea edx,[((-40341101))+r10*1+rdx]
+ and r11d,eax
+ xor r11d,ecx
+ mov r10d,DWORD[56+rsi]
+ add edx,r11d
+ rol edx,12
+ mov r11d,ebx
+ add edx,eax
+ xor r11d,eax
+ lea ecx,[((-1502002290))+r10*1+rcx]
+ and r11d,edx
+ xor r11d,ebx
+ mov r10d,DWORD[60+rsi]
+ add ecx,r11d
+ rol ecx,17
+ mov r11d,eax
+ add ecx,edx
+ xor r11d,edx
+ lea ebx,[1236535329+r10*1+rbx]
+ and r11d,ecx
+ xor r11d,eax
+ mov r10d,DWORD[rsi]
+ add ebx,r11d
+ rol ebx,22
+ mov r11d,edx
+ add ebx,ecx
+ mov r10d,DWORD[4+rsi]
+ mov r11d,edx
+ mov r12d,edx
+ not r11d
+ lea eax,[((-165796510))+r10*1+rax]
+ and r12d,ebx
+ and r11d,ecx
+ mov r10d,DWORD[24+rsi]
+ or r12d,r11d
+ mov r11d,ecx
+ add eax,r12d
+ mov r12d,ecx
+ rol eax,5
+ add eax,ebx
+ not r11d
+ lea edx,[((-1069501632))+r10*1+rdx]
+ and r12d,eax
+ and r11d,ebx
+ mov r10d,DWORD[44+rsi]
+ or r12d,r11d
+ mov r11d,ebx
+ add edx,r12d
+ mov r12d,ebx
+ rol edx,9
+ add edx,eax
+ not r11d
+ lea ecx,[643717713+r10*1+rcx]
+ and r12d,edx
+ and r11d,eax
+ mov r10d,DWORD[rsi]
+ or r12d,r11d
+ mov r11d,eax
+ add ecx,r12d
+ mov r12d,eax
+ rol ecx,14
+ add ecx,edx
+ not r11d
+ lea ebx,[((-373897302))+r10*1+rbx]
+ and r12d,ecx
+ and r11d,edx
+ mov r10d,DWORD[20+rsi]
+ or r12d,r11d
+ mov r11d,edx
+ add ebx,r12d
+ mov r12d,edx
+ rol ebx,20
+ add ebx,ecx
+ not r11d
+ lea eax,[((-701558691))+r10*1+rax]
+ and r12d,ebx
+ and r11d,ecx
+ mov r10d,DWORD[40+rsi]
+ or r12d,r11d
+ mov r11d,ecx
+ add eax,r12d
+ mov r12d,ecx
+ rol eax,5
+ add eax,ebx
+ not r11d
+ lea edx,[38016083+r10*1+rdx]
+ and r12d,eax
+ and r11d,ebx
+ mov r10d,DWORD[60+rsi]
+ or r12d,r11d
+ mov r11d,ebx
+ add edx,r12d
+ mov r12d,ebx
+ rol edx,9
+ add edx,eax
+ not r11d
+ lea ecx,[((-660478335))+r10*1+rcx]
+ and r12d,edx
+ and r11d,eax
+ mov r10d,DWORD[16+rsi]
+ or r12d,r11d
+ mov r11d,eax
+ add ecx,r12d
+ mov r12d,eax
+ rol ecx,14
+ add ecx,edx
+ not r11d
+ lea ebx,[((-405537848))+r10*1+rbx]
+ and r12d,ecx
+ and r11d,edx
+ mov r10d,DWORD[36+rsi]
+ or r12d,r11d
+ mov r11d,edx
+ add ebx,r12d
+ mov r12d,edx
+ rol ebx,20
+ add ebx,ecx
+ not r11d
+ lea eax,[568446438+r10*1+rax]
+ and r12d,ebx
+ and r11d,ecx
+ mov r10d,DWORD[56+rsi]
+ or r12d,r11d
+ mov r11d,ecx
+ add eax,r12d
+ mov r12d,ecx
+ rol eax,5
+ add eax,ebx
+ not r11d
+ lea edx,[((-1019803690))+r10*1+rdx]
+ and r12d,eax
+ and r11d,ebx
+ mov r10d,DWORD[12+rsi]
+ or r12d,r11d
+ mov r11d,ebx
+ add edx,r12d
+ mov r12d,ebx
+ rol edx,9
+ add edx,eax
+ not r11d
+ lea ecx,[((-187363961))+r10*1+rcx]
+ and r12d,edx
+ and r11d,eax
+ mov r10d,DWORD[32+rsi]
+ or r12d,r11d
+ mov r11d,eax
+ add ecx,r12d
+ mov r12d,eax
+ rol ecx,14
+ add ecx,edx
+ not r11d
+ lea ebx,[1163531501+r10*1+rbx]
+ and r12d,ecx
+ and r11d,edx
+ mov r10d,DWORD[52+rsi]
+ or r12d,r11d
+ mov r11d,edx
+ add ebx,r12d
+ mov r12d,edx
+ rol ebx,20
+ add ebx,ecx
+ not r11d
+ lea eax,[((-1444681467))+r10*1+rax]
+ and r12d,ebx
+ and r11d,ecx
+ mov r10d,DWORD[8+rsi]
+ or r12d,r11d
+ mov r11d,ecx
+ add eax,r12d
+ mov r12d,ecx
+ rol eax,5
+ add eax,ebx
+ not r11d
+ lea edx,[((-51403784))+r10*1+rdx]
+ and r12d,eax
+ and r11d,ebx
+ mov r10d,DWORD[28+rsi]
+ or r12d,r11d
+ mov r11d,ebx
+ add edx,r12d
+ mov r12d,ebx
+ rol edx,9
+ add edx,eax
+ not r11d
+ lea ecx,[1735328473+r10*1+rcx]
+ and r12d,edx
+ and r11d,eax
+ mov r10d,DWORD[48+rsi]
+ or r12d,r11d
+ mov r11d,eax
+ add ecx,r12d
+ mov r12d,eax
+ rol ecx,14
+ add ecx,edx
+ not r11d
+ lea ebx,[((-1926607734))+r10*1+rbx]
+ and r12d,ecx
+ and r11d,edx
+ mov r10d,DWORD[rsi]
+ or r12d,r11d
+ mov r11d,edx
+ add ebx,r12d
+ mov r12d,edx
+ rol ebx,20
+ add ebx,ecx
+ mov r10d,DWORD[20+rsi]
+ mov r11d,ecx
+ lea eax,[((-378558))+r10*1+rax]
+ mov r10d,DWORD[32+rsi]
+ xor r11d,edx
+ xor r11d,ebx
+ add eax,r11d
+ rol eax,4
+ mov r11d,ebx
+ add eax,ebx
+ lea edx,[((-2022574463))+r10*1+rdx]
+ mov r10d,DWORD[44+rsi]
+ xor r11d,ecx
+ xor r11d,eax
+ add edx,r11d
+ rol edx,11
+ mov r11d,eax
+ add edx,eax
+ lea ecx,[1839030562+r10*1+rcx]
+ mov r10d,DWORD[56+rsi]
+ xor r11d,ebx
+ xor r11d,edx
+ add ecx,r11d
+ rol ecx,16
+ mov r11d,edx
+ add ecx,edx
+ lea ebx,[((-35309556))+r10*1+rbx]
+ mov r10d,DWORD[4+rsi]
+ xor r11d,eax
+ xor r11d,ecx
+ add ebx,r11d
+ rol ebx,23
+ mov r11d,ecx
+ add ebx,ecx
+ lea eax,[((-1530992060))+r10*1+rax]
+ mov r10d,DWORD[16+rsi]
+ xor r11d,edx
+ xor r11d,ebx
+ add eax,r11d
+ rol eax,4
+ mov r11d,ebx
+ add eax,ebx
+ lea edx,[1272893353+r10*1+rdx]
+ mov r10d,DWORD[28+rsi]
+ xor r11d,ecx
+ xor r11d,eax
+ add edx,r11d
+ rol edx,11
+ mov r11d,eax
+ add edx,eax
+ lea ecx,[((-155497632))+r10*1+rcx]
+ mov r10d,DWORD[40+rsi]
+ xor r11d,ebx
+ xor r11d,edx
+ add ecx,r11d
+ rol ecx,16
+ mov r11d,edx
+ add ecx,edx
+ lea ebx,[((-1094730640))+r10*1+rbx]
+ mov r10d,DWORD[52+rsi]
+ xor r11d,eax
+ xor r11d,ecx
+ add ebx,r11d
+ rol ebx,23
+ mov r11d,ecx
+ add ebx,ecx
+ lea eax,[681279174+r10*1+rax]
+ mov r10d,DWORD[rsi]
+ xor r11d,edx
+ xor r11d,ebx
+ add eax,r11d
+ rol eax,4
+ mov r11d,ebx
+ add eax,ebx
+ lea edx,[((-358537222))+r10*1+rdx]
+ mov r10d,DWORD[12+rsi]
+ xor r11d,ecx
+ xor r11d,eax
+ add edx,r11d
+ rol edx,11
+ mov r11d,eax
+ add edx,eax
+ lea ecx,[((-722521979))+r10*1+rcx]
+ mov r10d,DWORD[24+rsi]
+ xor r11d,ebx
+ xor r11d,edx
+ add ecx,r11d
+ rol ecx,16
+ mov r11d,edx
+ add ecx,edx
+ lea ebx,[76029189+r10*1+rbx]
+ mov r10d,DWORD[36+rsi]
+ xor r11d,eax
+ xor r11d,ecx
+ add ebx,r11d
+ rol ebx,23
+ mov r11d,ecx
+ add ebx,ecx
+ lea eax,[((-640364487))+r10*1+rax]
+ mov r10d,DWORD[48+rsi]
+ xor r11d,edx
+ xor r11d,ebx
+ add eax,r11d
+ rol eax,4
+ mov r11d,ebx
+ add eax,ebx
+ lea edx,[((-421815835))+r10*1+rdx]
+ mov r10d,DWORD[60+rsi]
+ xor r11d,ecx
+ xor r11d,eax
+ add edx,r11d
+ rol edx,11
+ mov r11d,eax
+ add edx,eax
+ lea ecx,[530742520+r10*1+rcx]
+ mov r10d,DWORD[8+rsi]
+ xor r11d,ebx
+ xor r11d,edx
+ add ecx,r11d
+ rol ecx,16
+ mov r11d,edx
+ add ecx,edx
+ lea ebx,[((-995338651))+r10*1+rbx]
+ mov r10d,DWORD[rsi]
+ xor r11d,eax
+ xor r11d,ecx
+ add ebx,r11d
+ rol ebx,23
+ mov r11d,ecx
+ add ebx,ecx
+ mov r10d,DWORD[rsi]
+ mov r11d,0xffffffff
+ xor r11d,edx
+ lea eax,[((-198630844))+r10*1+rax]
+ or r11d,ebx
+ xor r11d,ecx
+ add eax,r11d
+ mov r10d,DWORD[28+rsi]
+ mov r11d,0xffffffff
+ rol eax,6
+ xor r11d,ecx
+ add eax,ebx
+ lea edx,[1126891415+r10*1+rdx]
+ or r11d,eax
+ xor r11d,ebx
+ add edx,r11d
+ mov r10d,DWORD[56+rsi]
+ mov r11d,0xffffffff
+ rol edx,10
+ xor r11d,ebx
+ add edx,eax
+ lea ecx,[((-1416354905))+r10*1+rcx]
+ or r11d,edx
+ xor r11d,eax
+ add ecx,r11d
+ mov r10d,DWORD[20+rsi]
+ mov r11d,0xffffffff
+ rol ecx,15
+ xor r11d,eax
+ add ecx,edx
+ lea ebx,[((-57434055))+r10*1+rbx]
+ or r11d,ecx
+ xor r11d,edx
+ add ebx,r11d
+ mov r10d,DWORD[48+rsi]
+ mov r11d,0xffffffff
+ rol ebx,21
+ xor r11d,edx
+ add ebx,ecx
+ lea eax,[1700485571+r10*1+rax]
+ or r11d,ebx
+ xor r11d,ecx
+ add eax,r11d
+ mov r10d,DWORD[12+rsi]
+ mov r11d,0xffffffff
+ rol eax,6
+ xor r11d,ecx
+ add eax,ebx
+ lea edx,[((-1894986606))+r10*1+rdx]
+ or r11d,eax
+ xor r11d,ebx
+ add edx,r11d
+ mov r10d,DWORD[40+rsi]
+ mov r11d,0xffffffff
+ rol edx,10
+ xor r11d,ebx
+ add edx,eax
+ lea ecx,[((-1051523))+r10*1+rcx]
+ or r11d,edx
+ xor r11d,eax
+ add ecx,r11d
+ mov r10d,DWORD[4+rsi]
+ mov r11d,0xffffffff
+ rol ecx,15
+ xor r11d,eax
+ add ecx,edx
+ lea ebx,[((-2054922799))+r10*1+rbx]
+ or r11d,ecx
+ xor r11d,edx
+ add ebx,r11d
+ mov r10d,DWORD[32+rsi]
+ mov r11d,0xffffffff
+ rol ebx,21
+ xor r11d,edx
+ add ebx,ecx
+ lea eax,[1873313359+r10*1+rax]
+ or r11d,ebx
+ xor r11d,ecx
+ add eax,r11d
+ mov r10d,DWORD[60+rsi]
+ mov r11d,0xffffffff
+ rol eax,6
+ xor r11d,ecx
+ add eax,ebx
+ lea edx,[((-30611744))+r10*1+rdx]
+ or r11d,eax
+ xor r11d,ebx
+ add edx,r11d
+ mov r10d,DWORD[24+rsi]
+ mov r11d,0xffffffff
+ rol edx,10
+ xor r11d,ebx
+ add edx,eax
+ lea ecx,[((-1560198380))+r10*1+rcx]
+ or r11d,edx
+ xor r11d,eax
+ add ecx,r11d
+ mov r10d,DWORD[52+rsi]
+ mov r11d,0xffffffff
+ rol ecx,15
+ xor r11d,eax
+ add ecx,edx
+ lea ebx,[1309151649+r10*1+rbx]
+ or r11d,ecx
+ xor r11d,edx
+ add ebx,r11d
+ mov r10d,DWORD[16+rsi]
+ mov r11d,0xffffffff
+ rol ebx,21
+ xor r11d,edx
+ add ebx,ecx
+ lea eax,[((-145523070))+r10*1+rax]
+ or r11d,ebx
+ xor r11d,ecx
+ add eax,r11d
+ mov r10d,DWORD[44+rsi]
+ mov r11d,0xffffffff
+ rol eax,6
+ xor r11d,ecx
+ add eax,ebx
+ lea edx,[((-1120210379))+r10*1+rdx]
+ or r11d,eax
+ xor r11d,ebx
+ add edx,r11d
+ mov r10d,DWORD[8+rsi]
+ mov r11d,0xffffffff
+ rol edx,10
+ xor r11d,ebx
+ add edx,eax
+ lea ecx,[718787259+r10*1+rcx]
+ or r11d,edx
+ xor r11d,eax
+ add ecx,r11d
+ mov r10d,DWORD[36+rsi]
+ mov r11d,0xffffffff
+ rol ecx,15
+ xor r11d,eax
+ add ecx,edx
+ lea ebx,[((-343485551))+r10*1+rbx]
+ or r11d,ecx
+ xor r11d,edx
+ add ebx,r11d
+ mov r10d,DWORD[rsi]
+ mov r11d,0xffffffff
+ rol ebx,21
+ xor r11d,edx
+ add ebx,ecx
+
+ add eax,r8d
+ add ebx,r9d
+ add ecx,r14d
+ add edx,r15d
+
+
+ add rsi,64
+ cmp rsi,rdi
+ jb NEAR $L$loop
+
+
+$L$end:
+ mov DWORD[rbp],eax
+ mov DWORD[4+rbp],ebx
+ mov DWORD[8+rbp],ecx
+ mov DWORD[12+rbp],edx
+
+ mov r15,QWORD[rsp]
+
+ mov r14,QWORD[8+rsp]
+
+ mov r12,QWORD[16+rsp]
+
+ mov rbx,QWORD[24+rsp]
+
+ mov rbp,QWORD[32+rsp]
+
+ add rsp,40
+
+$L$epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_md5_block_asm_data_order:
+EXTERN __imp_RtlVirtualUnwind
+
+ALIGN 16
+se_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ lea r10,[$L$prologue]
+ cmp rbx,r10
+ jb NEAR $L$in_prologue
+
+ mov rax,QWORD[152+r8]
+
+ lea r10,[$L$epilogue]
+ cmp rbx,r10
+ jae NEAR $L$in_prologue
+
+ lea rax,[40+rax]
+
+ mov rbp,QWORD[((-8))+rax]
+ mov rbx,QWORD[((-16))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov r14,QWORD[((-32))+rax]
+ mov r15,QWORD[((-40))+rax]
+ mov QWORD[144+r8],rbx
+ mov QWORD[160+r8],rbp
+ mov QWORD[216+r8],r12
+ mov QWORD[232+r8],r14
+ mov QWORD[240+r8],r15
+
+$L$in_prologue:
+ mov rdi,QWORD[8+rax]
+ mov rsi,QWORD[16+rax]
+ mov QWORD[152+r8],rax
+ mov QWORD[168+r8],rsi
+ mov QWORD[176+r8],rdi
+
+ mov rdi,QWORD[40+r9]
+ mov rsi,r8
+ mov ecx,154
+ DD 0xa548f3fc
+
+ mov rsi,r9
+ xor rcx,rcx
+ mov rdx,QWORD[8+rsi]
+ mov r8,QWORD[rsi]
+ mov r9,QWORD[16+rsi]
+ mov r10,QWORD[40+rsi]
+ lea r11,[56+rsi]
+ lea r12,[24+rsi]
+ mov QWORD[32+rsp],r10
+ mov QWORD[40+rsp],r11
+ mov QWORD[48+rsp],r12
+ mov QWORD[56+rsp],rcx
+ call QWORD[__imp_RtlVirtualUnwind]
+
+ mov eax,1
+ add rsp,64
+ popfq
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ ret
+
+
+section .pdata rdata align=4
+ALIGN 4
+ DD $L$SEH_begin_md5_block_asm_data_order wrt ..imagebase
+ DD $L$SEH_end_md5_block_asm_data_order wrt ..imagebase
+ DD $L$SEH_info_md5_block_asm_data_order wrt ..imagebase
+
+section .xdata rdata align=8
+ALIGN 8
+$L$SEH_info_md5_block_asm_data_order:
+ DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/p256-armv8-asm-apple.S b/gen/bcm/p256-armv8-asm-apple.S
new file mode 100644
index 0000000..c8469e6
--- /dev/null
+++ b/gen/bcm/p256-armv8-asm-apple.S
@@ -0,0 +1,1726 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include "openssl/arm_arch.h"
+
+.section __TEXT,__const
+.align 5
+Lpoly:
+.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
+LRR: // 2^512 mod P precomputed for NIST P256 polynomial
+.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
+Lone_mont:
+.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
+Lone:
+.quad 1,0,0,0
+Lord:
+.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
+LordK:
+.quad 0xccd1c8aaee00bc4f
+.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.text
+
+// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
+// const BN_ULONG x2[4]);
+.globl _ecp_nistz256_mul_mont
+.private_extern _ecp_nistz256_mul_mont
+
+.align 4
+_ecp_nistz256_mul_mont:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-32]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+
+ ldr x3,[x2] // bp[0]
+ ldp x4,x5,[x1]
+ ldp x6,x7,[x1,#16]
+ adrp x13,Lpoly@PAGE
+ add x13,x13,Lpoly@PAGEOFF
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+
+ bl __ecp_nistz256_mul_mont
+
+ ldp x19,x20,[sp,#16]
+ ldp x29,x30,[sp],#32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl _ecp_nistz256_sqr_mont
+.private_extern _ecp_nistz256_sqr_mont
+
+.align 4
+_ecp_nistz256_sqr_mont:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-32]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+
+ ldp x4,x5,[x1]
+ ldp x6,x7,[x1,#16]
+ adrp x13,Lpoly@PAGE
+ add x13,x13,Lpoly@PAGEOFF
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+
+ bl __ecp_nistz256_sqr_mont
+
+ ldp x19,x20,[sp,#16]
+ ldp x29,x30,[sp],#32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl _ecp_nistz256_div_by_2
+.private_extern _ecp_nistz256_div_by_2
+
+.align 4
+_ecp_nistz256_div_by_2:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ldp x14,x15,[x1]
+ ldp x16,x17,[x1,#16]
+ adrp x13,Lpoly@PAGE
+ add x13,x13,Lpoly@PAGEOFF
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+
+ bl __ecp_nistz256_div_by_2
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl _ecp_nistz256_mul_by_2
+.private_extern _ecp_nistz256_mul_by_2
+
+.align 4
+_ecp_nistz256_mul_by_2:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ldp x14,x15,[x1]
+ ldp x16,x17,[x1,#16]
+ adrp x13,Lpoly@PAGE
+ add x13,x13,Lpoly@PAGEOFF
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+ mov x8,x14
+ mov x9,x15
+ mov x10,x16
+ mov x11,x17
+
+ bl __ecp_nistz256_add_to // ret = a+a // 2*a
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl _ecp_nistz256_mul_by_3
+.private_extern _ecp_nistz256_mul_by_3
+
+.align 4
+_ecp_nistz256_mul_by_3:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ldp x14,x15,[x1]
+ ldp x16,x17,[x1,#16]
+ adrp x13,Lpoly@PAGE
+ add x13,x13,Lpoly@PAGEOFF
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+ mov x8,x14
+ mov x9,x15
+ mov x10,x16
+ mov x11,x17
+ mov x4,x14
+ mov x5,x15
+ mov x6,x16
+ mov x7,x17
+
+ bl __ecp_nistz256_add_to // ret = a+a // 2*a
+
+ mov x8,x4
+ mov x9,x5
+ mov x10,x6
+ mov x11,x7
+
+ bl __ecp_nistz256_add_to // ret += a // 2*a+a=3*a
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
+// const BN_ULONG x2[4]);
+.globl _ecp_nistz256_sub
+.private_extern _ecp_nistz256_sub
+
+.align 4
+_ecp_nistz256_sub:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ldp x14,x15,[x1]
+ ldp x16,x17,[x1,#16]
+ adrp x13,Lpoly@PAGE
+ add x13,x13,Lpoly@PAGEOFF
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+
+ bl __ecp_nistz256_sub_from
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl _ecp_nistz256_neg
+.private_extern _ecp_nistz256_neg
+
+.align 4
+_ecp_nistz256_neg:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ mov x2,x1
+ mov x14,xzr // a = 0
+ mov x15,xzr
+ mov x16,xzr
+ mov x17,xzr
+ adrp x13,Lpoly@PAGE
+ add x13,x13,Lpoly@PAGEOFF
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+
+ bl __ecp_nistz256_sub_from
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
+// to x4-x7 and b[0] - to x3
+
+.align 4
+__ecp_nistz256_mul_mont:
+ mul x14,x4,x3 // a[0]*b[0]
+ umulh x8,x4,x3
+
+ mul x15,x5,x3 // a[1]*b[0]
+ umulh x9,x5,x3
+
+ mul x16,x6,x3 // a[2]*b[0]
+ umulh x10,x6,x3
+
+ mul x17,x7,x3 // a[3]*b[0]
+ umulh x11,x7,x3
+ ldr x3,[x2,#8] // b[1]
+
+ adds x15,x15,x8 // accumulate high parts of multiplication
+ lsl x8,x14,#32
+ adcs x16,x16,x9
+ lsr x9,x14,#32
+ adcs x17,x17,x10
+ adc x19,xzr,x11
+ mov x20,xzr
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ mul x8,x4,x3 // lo(a[0]*b[i])
+ adcs x15,x16,x9
+ mul x9,x5,x3 // lo(a[1]*b[i])
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ mul x10,x6,x3 // lo(a[2]*b[i])
+ adcs x17,x19,x11
+ mul x11,x7,x3 // lo(a[3]*b[i])
+ adc x19,x20,xzr
+
+ adds x14,x14,x8 // accumulate low parts of multiplication
+ umulh x8,x4,x3 // hi(a[0]*b[i])
+ adcs x15,x15,x9
+ umulh x9,x5,x3 // hi(a[1]*b[i])
+ adcs x16,x16,x10
+ umulh x10,x6,x3 // hi(a[2]*b[i])
+ adcs x17,x17,x11
+ umulh x11,x7,x3 // hi(a[3]*b[i])
+ adc x19,x19,xzr
+ ldr x3,[x2,#8*(1+1)] // b[1+1]
+ adds x15,x15,x8 // accumulate high parts of multiplication
+ lsl x8,x14,#32
+ adcs x16,x16,x9
+ lsr x9,x14,#32
+ adcs x17,x17,x10
+ adcs x19,x19,x11
+ adc x20,xzr,xzr
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ mul x8,x4,x3 // lo(a[0]*b[i])
+ adcs x15,x16,x9
+ mul x9,x5,x3 // lo(a[1]*b[i])
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ mul x10,x6,x3 // lo(a[2]*b[i])
+ adcs x17,x19,x11
+ mul x11,x7,x3 // lo(a[3]*b[i])
+ adc x19,x20,xzr
+
+ adds x14,x14,x8 // accumulate low parts of multiplication
+ umulh x8,x4,x3 // hi(a[0]*b[i])
+ adcs x15,x15,x9
+ umulh x9,x5,x3 // hi(a[1]*b[i])
+ adcs x16,x16,x10
+ umulh x10,x6,x3 // hi(a[2]*b[i])
+ adcs x17,x17,x11
+ umulh x11,x7,x3 // hi(a[3]*b[i])
+ adc x19,x19,xzr
+ ldr x3,[x2,#8*(2+1)] // b[2+1]
+ adds x15,x15,x8 // accumulate high parts of multiplication
+ lsl x8,x14,#32
+ adcs x16,x16,x9
+ lsr x9,x14,#32
+ adcs x17,x17,x10
+ adcs x19,x19,x11
+ adc x20,xzr,xzr
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ mul x8,x4,x3 // lo(a[0]*b[i])
+ adcs x15,x16,x9
+ mul x9,x5,x3 // lo(a[1]*b[i])
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ mul x10,x6,x3 // lo(a[2]*b[i])
+ adcs x17,x19,x11
+ mul x11,x7,x3 // lo(a[3]*b[i])
+ adc x19,x20,xzr
+
+ adds x14,x14,x8 // accumulate low parts of multiplication
+ umulh x8,x4,x3 // hi(a[0]*b[i])
+ adcs x15,x15,x9
+ umulh x9,x5,x3 // hi(a[1]*b[i])
+ adcs x16,x16,x10
+ umulh x10,x6,x3 // hi(a[2]*b[i])
+ adcs x17,x17,x11
+ umulh x11,x7,x3 // hi(a[3]*b[i])
+ adc x19,x19,xzr
+ adds x15,x15,x8 // accumulate high parts of multiplication
+ lsl x8,x14,#32
+ adcs x16,x16,x9
+ lsr x9,x14,#32
+ adcs x17,x17,x10
+ adcs x19,x19,x11
+ adc x20,xzr,xzr
+ // last reduction
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ adcs x15,x16,x9
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ adcs x17,x19,x11
+ adc x19,x20,xzr
+
+ adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus
+ sbcs x9,x15,x12
+ sbcs x10,x16,xzr
+ sbcs x11,x17,x13
+ sbcs xzr,x19,xzr // did it borrow?
+
+ csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
+ csel x15,x15,x9,lo
+ csel x16,x16,x10,lo
+ stp x14,x15,[x0]
+ csel x17,x17,x11,lo
+ stp x16,x17,[x0,#16]
+
+ ret
+
+
+// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
+// to x4-x7
+
+.align 4
+__ecp_nistz256_sqr_mont:
+ // | | | | | |a1*a0| |
+ // | | | | |a2*a0| | |
+ // | |a3*a2|a3*a0| | | |
+ // | | | |a2*a1| | | |
+ // | | |a3*a1| | | | |
+ // *| | | | | | | | 2|
+ // +|a3*a3|a2*a2|a1*a1|a0*a0|
+ // |--+--+--+--+--+--+--+--|
+ // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow
+ //
+ // "can't overflow" below mark carrying into high part of
+ // multiplication result, which can't overflow, because it
+ // can never be all ones.
+
+ mul x15,x5,x4 // a[1]*a[0]
+ umulh x9,x5,x4
+ mul x16,x6,x4 // a[2]*a[0]
+ umulh x10,x6,x4
+ mul x17,x7,x4 // a[3]*a[0]
+ umulh x19,x7,x4
+
+ adds x16,x16,x9 // accumulate high parts of multiplication
+ mul x8,x6,x5 // a[2]*a[1]
+ umulh x9,x6,x5
+ adcs x17,x17,x10
+ mul x10,x7,x5 // a[3]*a[1]
+ umulh x11,x7,x5
+ adc x19,x19,xzr // can't overflow
+
+ mul x20,x7,x6 // a[3]*a[2]
+ umulh x1,x7,x6
+
+ adds x9,x9,x10 // accumulate high parts of multiplication
+ mul x14,x4,x4 // a[0]*a[0]
+ adc x10,x11,xzr // can't overflow
+
+ adds x17,x17,x8 // accumulate low parts of multiplication
+ umulh x4,x4,x4
+ adcs x19,x19,x9
+ mul x9,x5,x5 // a[1]*a[1]
+ adcs x20,x20,x10
+ umulh x5,x5,x5
+ adc x1,x1,xzr // can't overflow
+
+ adds x15,x15,x15 // acc[1-6]*=2
+ mul x10,x6,x6 // a[2]*a[2]
+ adcs x16,x16,x16
+ umulh x6,x6,x6
+ adcs x17,x17,x17
+ mul x11,x7,x7 // a[3]*a[3]
+ adcs x19,x19,x19
+ umulh x7,x7,x7
+ adcs x20,x20,x20
+ adcs x1,x1,x1
+ adc x2,xzr,xzr
+
+ adds x15,x15,x4 // +a[i]*a[i]
+ adcs x16,x16,x9
+ adcs x17,x17,x5
+ adcs x19,x19,x10
+ adcs x20,x20,x6
+ lsl x8,x14,#32
+ adcs x1,x1,x11
+ lsr x9,x14,#32
+ adc x2,x2,x7
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ adcs x15,x16,x9
+ lsl x8,x14,#32
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ lsr x9,x14,#32
+ adc x17,x11,xzr // can't overflow
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ adcs x15,x16,x9
+ lsl x8,x14,#32
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ lsr x9,x14,#32
+ adc x17,x11,xzr // can't overflow
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ adcs x15,x16,x9
+ lsl x8,x14,#32
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ lsr x9,x14,#32
+ adc x17,x11,xzr // can't overflow
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ adcs x15,x16,x9
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ adc x17,x11,xzr // can't overflow
+
+ adds x14,x14,x19 // accumulate upper half
+ adcs x15,x15,x20
+ adcs x16,x16,x1
+ adcs x17,x17,x2
+ adc x19,xzr,xzr
+
+ adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus
+ sbcs x9,x15,x12
+ sbcs x10,x16,xzr
+ sbcs x11,x17,x13
+ sbcs xzr,x19,xzr // did it borrow?
+
+ csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
+ csel x15,x15,x9,lo
+ csel x16,x16,x10,lo
+ stp x14,x15,[x0]
+ csel x17,x17,x11,lo
+ stp x16,x17,[x0,#16]
+
+ ret
+
+
+// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to
+// x4-x7 and x8-x11. This is done because it's used in multiple
+// contexts, e.g. in multiplication by 2 and 3...
+
+.align 4
+__ecp_nistz256_add_to:
+ adds x14,x14,x8 // ret = a+b
+ adcs x15,x15,x9
+ adcs x16,x16,x10
+ adcs x17,x17,x11
+ adc x1,xzr,xzr // zap x1
+
+ adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus
+ sbcs x9,x15,x12
+ sbcs x10,x16,xzr
+ sbcs x11,x17,x13
+ sbcs xzr,x1,xzr // did subtraction borrow?
+
+ csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
+ csel x15,x15,x9,lo
+ csel x16,x16,x10,lo
+ stp x14,x15,[x0]
+ csel x17,x17,x11,lo
+ stp x16,x17,[x0,#16]
+
+ ret
+
+
+
+.align 4
+__ecp_nistz256_sub_from:
+ ldp x8,x9,[x2]
+ ldp x10,x11,[x2,#16]
+ subs x14,x14,x8 // ret = a-b
+ sbcs x15,x15,x9
+ sbcs x16,x16,x10
+ sbcs x17,x17,x11
+ sbc x1,xzr,xzr // zap x1
+
+ subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus
+ adcs x9,x15,x12
+ adcs x10,x16,xzr
+ adc x11,x17,x13
+ cmp x1,xzr // did subtraction borrow?
+
+ csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret
+ csel x15,x15,x9,eq
+ csel x16,x16,x10,eq
+ stp x14,x15,[x0]
+ csel x17,x17,x11,eq
+ stp x16,x17,[x0,#16]
+
+ ret
+
+
+
+.align 4
+__ecp_nistz256_sub_morf:
+ ldp x8,x9,[x2]
+ ldp x10,x11,[x2,#16]
+ subs x14,x8,x14 // ret = b-a
+ sbcs x15,x9,x15
+ sbcs x16,x10,x16
+ sbcs x17,x11,x17
+ sbc x1,xzr,xzr // zap x1
+
+ subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus
+ adcs x9,x15,x12
+ adcs x10,x16,xzr
+ adc x11,x17,x13
+ cmp x1,xzr // did subtraction borrow?
+
+ csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret
+ csel x15,x15,x9,eq
+ csel x16,x16,x10,eq
+ stp x14,x15,[x0]
+ csel x17,x17,x11,eq
+ stp x16,x17,[x0,#16]
+
+ ret
+
+
+
+.align 4
+__ecp_nistz256_div_by_2:
+ subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus
+ adcs x9,x15,x12
+ adcs x10,x16,xzr
+ adcs x11,x17,x13
+ adc x1,xzr,xzr // zap x1
+ tst x14,#1 // is a even?
+
+ csel x14,x14,x8,eq // ret = even ? a : a+modulus
+ csel x15,x15,x9,eq
+ csel x16,x16,x10,eq
+ csel x17,x17,x11,eq
+ csel x1,xzr,x1,eq
+
+ lsr x14,x14,#1 // ret >>= 1
+ orr x14,x14,x15,lsl#63
+ lsr x15,x15,#1
+ orr x15,x15,x16,lsl#63
+ lsr x16,x16,#1
+ orr x16,x16,x17,lsl#63
+ lsr x17,x17,#1
+ stp x14,x15,[x0]
+ orr x17,x17,x1,lsl#63
+ stp x16,x17,[x0,#16]
+
+ ret
+
+.globl _ecp_nistz256_point_double
+.private_extern _ecp_nistz256_point_double
+
+.align 5
+_ecp_nistz256_point_double:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ sub sp,sp,#32*4
+
+Ldouble_shortcut:
+ ldp x14,x15,[x1,#32]
+ mov x21,x0
+ ldp x16,x17,[x1,#48]
+ mov x22,x1
+ adrp x13,Lpoly@PAGE
+ add x13,x13,Lpoly@PAGEOFF
+ ldr x12,[x13,#8]
+ mov x8,x14
+ ldr x13,[x13,#24]
+ mov x9,x15
+ ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont
+ mov x10,x16
+ mov x11,x17
+ ldp x6,x7,[x22,#64+16]
+ add x0,sp,#0
+ bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y);
+
+ add x0,sp,#64
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z);
+
+ ldp x8,x9,[x22]
+ ldp x10,x11,[x22,#16]
+ mov x4,x14 // put Zsqr aside for p256_sub
+ mov x5,x15
+ mov x6,x16
+ mov x7,x17
+ add x0,sp,#32
+ bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x);
+
+ add x2,x22,#0
+ mov x14,x4 // restore Zsqr
+ mov x15,x5
+ ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont
+ mov x16,x6
+ mov x17,x7
+ ldp x6,x7,[sp,#0+16]
+ add x0,sp,#64
+ bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr);
+
+ add x0,sp,#0
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S);
+
+ ldr x3,[x22,#32]
+ ldp x4,x5,[x22,#64]
+ ldp x6,x7,[x22,#64+16]
+ add x2,x22,#32
+ add x0,sp,#96
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y);
+
+ mov x8,x14
+ mov x9,x15
+ ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont
+ mov x10,x16
+ mov x11,x17
+ ldp x6,x7,[sp,#0+16]
+ add x0,x21,#64
+ bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0);
+
+ add x0,sp,#96
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S);
+
+ ldr x3,[sp,#64] // forward load for p256_mul_mont
+ ldp x4,x5,[sp,#32]
+ ldp x6,x7,[sp,#32+16]
+ add x0,x21,#32
+ bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0);
+
+ add x2,sp,#64
+ add x0,sp,#32
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr);
+
+ mov x8,x14 // duplicate M
+ mov x9,x15
+ mov x10,x16
+ mov x11,x17
+ mov x4,x14 // put M aside
+ mov x5,x15
+ mov x6,x16
+ mov x7,x17
+ add x0,sp,#32
+ bl __ecp_nistz256_add_to
+ mov x8,x4 // restore M
+ mov x9,x5
+ ldr x3,[x22] // forward load for p256_mul_mont
+ mov x10,x6
+ ldp x4,x5,[sp,#0]
+ mov x11,x7
+ ldp x6,x7,[sp,#0+16]
+ bl __ecp_nistz256_add_to // p256_mul_by_3(M, M);
+
+ add x2,x22,#0
+ add x0,sp,#0
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x);
+
+ mov x8,x14
+ mov x9,x15
+ ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont
+ mov x10,x16
+ mov x11,x17
+ ldp x6,x7,[sp,#32+16]
+ add x0,sp,#96
+ bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S);
+
+ add x0,x21,#0
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M);
+
+ add x2,sp,#96
+ bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0);
+
+ add x2,sp,#0
+ add x0,sp,#0
+ bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x);
+
+ ldr x3,[sp,#32]
+ mov x4,x14 // copy S
+ mov x5,x15
+ mov x6,x16
+ mov x7,x17
+ add x2,sp,#32
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M);
+
+ add x2,x21,#32
+ add x0,x21,#32
+ bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y);
+
+ add sp,x29,#0 // destroy frame
+ ldp x19,x20,[x29,#16]
+ ldp x21,x22,[x29,#32]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.globl _ecp_nistz256_point_add
+.private_extern _ecp_nistz256_point_add
+
+.align 5
+_ecp_nistz256_point_add:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#32*12
+
+ ldp x4,x5,[x2,#64] // in2_z
+ ldp x6,x7,[x2,#64+16]
+ mov x21,x0
+ mov x22,x1
+ mov x23,x2
+ adrp x13,Lpoly@PAGE
+ add x13,x13,Lpoly@PAGEOFF
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+ orr x8,x4,x5
+ orr x10,x6,x7
+ orr x25,x8,x10
+ cmp x25,#0
+ csetm x25,ne // ~in2infty
+ add x0,sp,#192
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z);
+
+ ldp x4,x5,[x22,#64] // in1_z
+ ldp x6,x7,[x22,#64+16]
+ orr x8,x4,x5
+ orr x10,x6,x7
+ orr x24,x8,x10
+ cmp x24,#0
+ csetm x24,ne // ~in1infty
+ add x0,sp,#128
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);
+
+ ldr x3,[x23,#64]
+ ldp x4,x5,[sp,#192]
+ ldp x6,x7,[sp,#192+16]
+ add x2,x23,#64
+ add x0,sp,#320
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z);
+
+ ldr x3,[x22,#64]
+ ldp x4,x5,[sp,#128]
+ ldp x6,x7,[sp,#128+16]
+ add x2,x22,#64
+ add x0,sp,#352
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);
+
+ ldr x3,[x22,#32]
+ ldp x4,x5,[sp,#320]
+ ldp x6,x7,[sp,#320+16]
+ add x2,x22,#32
+ add x0,sp,#320
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y);
+
+ ldr x3,[x23,#32]
+ ldp x4,x5,[sp,#352]
+ ldp x6,x7,[sp,#352+16]
+ add x2,x23,#32
+ add x0,sp,#352
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);
+
+ add x2,sp,#320
+ ldr x3,[sp,#192] // forward load for p256_mul_mont
+ ldp x4,x5,[x22]
+ ldp x6,x7,[x22,#16]
+ add x0,sp,#160
+ bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1);
+
+ orr x14,x14,x15 // see if result is zero
+ orr x16,x16,x17
+ orr x26,x14,x16 // ~is_equal(S1,S2)
+
+ add x2,sp,#192
+ add x0,sp,#256
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr);
+
+ ldr x3,[sp,#128]
+ ldp x4,x5,[x23]
+ ldp x6,x7,[x23,#16]
+ add x2,sp,#128
+ add x0,sp,#288
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr);
+
+ add x2,sp,#256
+ ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont
+ ldp x6,x7,[sp,#160+16]
+ add x0,sp,#96
+ bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1);
+
+ orr x14,x14,x15 // see if result is zero
+ orr x16,x16,x17
+ orr x14,x14,x16 // ~is_equal(U1,U2)
+
+ mvn x27,x24 // -1/0 -> 0/-1
+ mvn x28,x25 // -1/0 -> 0/-1
+ orr x14,x14,x27
+ orr x14,x14,x28
+ orr x14,x14,x26
+ cbnz x14,Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
+
+Ladd_double:
+ mov x1,x22
+ mov x0,x21
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ add sp,sp,#256 // #256 is from #32*(12-4). difference in stack frames
+ b Ldouble_shortcut
+
+.align 4
+Ladd_proceed:
+ add x0,sp,#192
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);
+
+ ldr x3,[x22,#64]
+ ldp x4,x5,[sp,#96]
+ ldp x6,x7,[sp,#96+16]
+ add x2,x22,#64
+ add x0,sp,#64
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);
+
+ ldp x4,x5,[sp,#96]
+ ldp x6,x7,[sp,#96+16]
+ add x0,sp,#128
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);
+
+ ldr x3,[x23,#64]
+ ldp x4,x5,[sp,#64]
+ ldp x6,x7,[sp,#64+16]
+ add x2,x23,#64
+ add x0,sp,#64
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z);
+
+ ldr x3,[sp,#96]
+ ldp x4,x5,[sp,#128]
+ ldp x6,x7,[sp,#128+16]
+ add x2,sp,#96
+ add x0,sp,#224
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);
+
+ ldr x3,[sp,#128]
+ ldp x4,x5,[sp,#256]
+ ldp x6,x7,[sp,#256+16]
+ add x2,sp,#128
+ add x0,sp,#288
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr);
+
+ mov x8,x14
+ mov x9,x15
+ mov x10,x16
+ mov x11,x17
+ add x0,sp,#128
+ bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2);
+
+ add x2,sp,#192
+ add x0,sp,#0
+ bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);
+
+ add x2,sp,#224
+ bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);
+
+ add x2,sp,#288
+ ldr x3,[sp,#224] // forward load for p256_mul_mont
+ ldp x4,x5,[sp,#320]
+ ldp x6,x7,[sp,#320+16]
+ add x0,sp,#32
+ bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);
+
+ add x2,sp,#224
+ add x0,sp,#352
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub);
+
+ ldr x3,[sp,#160]
+ ldp x4,x5,[sp,#32]
+ ldp x6,x7,[sp,#32+16]
+ add x2,sp,#160
+ add x0,sp,#32
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);
+
+ add x2,sp,#352
+ bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);
+
+ ldp x4,x5,[sp,#0] // res
+ ldp x6,x7,[sp,#0+16]
+ ldp x8,x9,[x23] // in2
+ ldp x10,x11,[x23,#16]
+ ldp x14,x15,[x22,#0] // in1
+ cmp x24,#0 // ~, remember?
+ ldp x16,x17,[x22,#0+16]
+ csel x8,x4,x8,ne
+ csel x9,x5,x9,ne
+ ldp x4,x5,[sp,#0+0+32] // res
+ csel x10,x6,x10,ne
+ csel x11,x7,x11,ne
+ cmp x25,#0 // ~, remember?
+ ldp x6,x7,[sp,#0+0+48]
+ csel x14,x8,x14,ne
+ csel x15,x9,x15,ne
+ ldp x8,x9,[x23,#0+32] // in2
+ csel x16,x10,x16,ne
+ csel x17,x11,x17,ne
+ ldp x10,x11,[x23,#0+48]
+ stp x14,x15,[x21,#0]
+ stp x16,x17,[x21,#0+16]
+ ldp x14,x15,[x22,#32] // in1
+ cmp x24,#0 // ~, remember?
+ ldp x16,x17,[x22,#32+16]
+ csel x8,x4,x8,ne
+ csel x9,x5,x9,ne
+ ldp x4,x5,[sp,#0+32+32] // res
+ csel x10,x6,x10,ne
+ csel x11,x7,x11,ne
+ cmp x25,#0 // ~, remember?
+ ldp x6,x7,[sp,#0+32+48]
+ csel x14,x8,x14,ne
+ csel x15,x9,x15,ne
+ ldp x8,x9,[x23,#32+32] // in2
+ csel x16,x10,x16,ne
+ csel x17,x11,x17,ne
+ ldp x10,x11,[x23,#32+48]
+ stp x14,x15,[x21,#32]
+ stp x16,x17,[x21,#32+16]
+ ldp x14,x15,[x22,#64] // in1
+ cmp x24,#0 // ~, remember?
+ ldp x16,x17,[x22,#64+16]
+ csel x8,x4,x8,ne
+ csel x9,x5,x9,ne
+ csel x10,x6,x10,ne
+ csel x11,x7,x11,ne
+ cmp x25,#0 // ~, remember?
+ csel x14,x8,x14,ne
+ csel x15,x9,x15,ne
+ csel x16,x10,x16,ne
+ csel x17,x11,x17,ne
+ stp x14,x15,[x21,#64]
+ stp x16,x17,[x21,#64+16]
+
+Ladd_done:
+ add sp,x29,#0 // destroy frame
+ ldp x19,x20,[x29,#16]
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.globl _ecp_nistz256_point_add_affine
+.private_extern _ecp_nistz256_point_add_affine
+
+.align 5
+_ecp_nistz256_point_add_affine:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ sub sp,sp,#32*10
+
+ mov x21,x0
+ mov x22,x1
+ mov x23,x2
+ adrp x13,Lpoly@PAGE
+ add x13,x13,Lpoly@PAGEOFF
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+
+ ldp x4,x5,[x1,#64] // in1_z
+ ldp x6,x7,[x1,#64+16]
+ orr x8,x4,x5
+ orr x10,x6,x7
+ orr x24,x8,x10
+ cmp x24,#0
+ csetm x24,ne // ~in1infty
+
+ ldp x14,x15,[x2] // in2_x
+ ldp x16,x17,[x2,#16]
+ ldp x8,x9,[x2,#32] // in2_y
+ ldp x10,x11,[x2,#48]
+ orr x14,x14,x15
+ orr x16,x16,x17
+ orr x8,x8,x9
+ orr x10,x10,x11
+ orr x14,x14,x16
+ orr x8,x8,x10
+ orr x25,x14,x8
+ cmp x25,#0
+ csetm x25,ne // ~in2infty
+
+ add x0,sp,#128
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);
+
+ mov x4,x14
+ mov x5,x15
+ mov x6,x16
+ mov x7,x17
+ ldr x3,[x23]
+ add x2,x23,#0
+ add x0,sp,#96
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x);
+
+ add x2,x22,#0
+ ldr x3,[x22,#64] // forward load for p256_mul_mont
+ ldp x4,x5,[sp,#128]
+ ldp x6,x7,[sp,#128+16]
+ add x0,sp,#160
+ bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x);
+
+ add x2,x22,#64
+ add x0,sp,#128
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);
+
+ ldr x3,[x22,#64]
+ ldp x4,x5,[sp,#160]
+ ldp x6,x7,[sp,#160+16]
+ add x2,x22,#64
+ add x0,sp,#64
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);
+
+ ldr x3,[x23,#32]
+ ldp x4,x5,[sp,#128]
+ ldp x6,x7,[sp,#128+16]
+ add x2,x23,#32
+ add x0,sp,#128
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);
+
+ add x2,x22,#32
+ ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont
+ ldp x6,x7,[sp,#160+16]
+ add x0,sp,#192
+ bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y);
+
+ add x0,sp,#224
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);
+
+ ldp x4,x5,[sp,#192]
+ ldp x6,x7,[sp,#192+16]
+ add x0,sp,#288
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);
+
+ ldr x3,[sp,#160]
+ ldp x4,x5,[sp,#224]
+ ldp x6,x7,[sp,#224+16]
+ add x2,sp,#160
+ add x0,sp,#256
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);
+
+ ldr x3,[x22]
+ ldp x4,x5,[sp,#224]
+ ldp x6,x7,[sp,#224+16]
+ add x2,x22,#0
+ add x0,sp,#96
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr);
+
+ mov x8,x14
+ mov x9,x15
+ mov x10,x16
+ mov x11,x17
+ add x0,sp,#224
+ bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2);
+
+ add x2,sp,#288
+ add x0,sp,#0
+ bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);
+
+ add x2,sp,#256
+ bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);
+
+ add x2,sp,#96
+ ldr x3,[x22,#32] // forward load for p256_mul_mont
+ ldp x4,x5,[sp,#256]
+ ldp x6,x7,[sp,#256+16]
+ add x0,sp,#32
+ bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);
+
+ add x2,x22,#32
+ add x0,sp,#128
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub);
+
+ ldr x3,[sp,#192]
+ ldp x4,x5,[sp,#32]
+ ldp x6,x7,[sp,#32+16]
+ add x2,sp,#192
+ add x0,sp,#32
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);
+
+ add x2,sp,#128
+ bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);
+
+ ldp x4,x5,[sp,#0] // res
+ ldp x6,x7,[sp,#0+16]
+ ldp x8,x9,[x23] // in2
+ ldp x10,x11,[x23,#16]
+ ldp x14,x15,[x22,#0] // in1
+ cmp x24,#0 // ~, remember?
+ ldp x16,x17,[x22,#0+16]
+ csel x8,x4,x8,ne
+ csel x9,x5,x9,ne
+ ldp x4,x5,[sp,#0+0+32] // res
+ csel x10,x6,x10,ne
+ csel x11,x7,x11,ne
+ cmp x25,#0 // ~, remember?
+ ldp x6,x7,[sp,#0+0+48]
+ csel x14,x8,x14,ne
+ csel x15,x9,x15,ne
+ ldp x8,x9,[x23,#0+32] // in2
+ csel x16,x10,x16,ne
+ csel x17,x11,x17,ne
+ ldp x10,x11,[x23,#0+48]
+ stp x14,x15,[x21,#0]
+ stp x16,x17,[x21,#0+16]
+ adrp x23,Lone_mont@PAGE-64
+ add x23,x23,Lone_mont@PAGEOFF-64
+ ldp x14,x15,[x22,#32] // in1
+ cmp x24,#0 // ~, remember?
+ ldp x16,x17,[x22,#32+16]
+ csel x8,x4,x8,ne
+ csel x9,x5,x9,ne
+ ldp x4,x5,[sp,#0+32+32] // res
+ csel x10,x6,x10,ne
+ csel x11,x7,x11,ne
+ cmp x25,#0 // ~, remember?
+ ldp x6,x7,[sp,#0+32+48]
+ csel x14,x8,x14,ne
+ csel x15,x9,x15,ne
+ ldp x8,x9,[x23,#32+32] // in2
+ csel x16,x10,x16,ne
+ csel x17,x11,x17,ne
+ ldp x10,x11,[x23,#32+48]
+ stp x14,x15,[x21,#32]
+ stp x16,x17,[x21,#32+16]
+ ldp x14,x15,[x22,#64] // in1
+ cmp x24,#0 // ~, remember?
+ ldp x16,x17,[x22,#64+16]
+ csel x8,x4,x8,ne
+ csel x9,x5,x9,ne
+ csel x10,x6,x10,ne
+ csel x11,x7,x11,ne
+ cmp x25,#0 // ~, remember?
+ csel x14,x8,x14,ne
+ csel x15,x9,x15,ne
+ csel x16,x10,x16,ne
+ csel x17,x11,x17,ne
+ stp x14,x15,[x21,#64]
+ stp x16,x17,[x21,#64+16]
+
+ add sp,x29,#0 // destroy frame
+ ldp x19,x20,[x29,#16]
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x29,x30,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
+// uint64_t b[4]);
+.globl _ecp_nistz256_ord_mul_mont
+.private_extern _ecp_nistz256_ord_mul_mont
+
+.align 4
+_ecp_nistz256_ord_mul_mont:
+ AARCH64_VALID_CALL_TARGET
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+
+ adrp x23,Lord@PAGE
+ add x23,x23,Lord@PAGEOFF
+ ldr x3,[x2] // bp[0]
+ ldp x4,x5,[x1]
+ ldp x6,x7,[x1,#16]
+
+ ldp x12,x13,[x23,#0]
+ ldp x21,x22,[x23,#16]
+ ldr x23,[x23,#32]
+
+ mul x14,x4,x3 // a[0]*b[0]
+ umulh x8,x4,x3
+
+ mul x15,x5,x3 // a[1]*b[0]
+ umulh x9,x5,x3
+
+ mul x16,x6,x3 // a[2]*b[0]
+ umulh x10,x6,x3
+
+ mul x17,x7,x3 // a[3]*b[0]
+ umulh x19,x7,x3
+
+ mul x24,x14,x23
+
+ adds x15,x15,x8 // accumulate high parts of multiplication
+ adcs x16,x16,x9
+ adcs x17,x17,x10
+ adc x19,x19,xzr
+ mov x20,xzr
+ ldr x3,[x2,#8*1] // b[i]
+
+ lsl x8,x24,#32
+ subs x16,x16,x24
+ lsr x9,x24,#32
+ sbcs x17,x17,x8
+ sbcs x19,x19,x9
+ sbc x20,x20,xzr
+
+ subs xzr,x14,#1
+ umulh x9,x12,x24
+ mul x10,x13,x24
+ umulh x11,x13,x24
+
+ adcs x10,x10,x9
+ mul x8,x4,x3
+ adc x11,x11,xzr
+ mul x9,x5,x3
+
+ adds x14,x15,x10
+ mul x10,x6,x3
+ adcs x15,x16,x11
+ mul x11,x7,x3
+ adcs x16,x17,x24
+ adcs x17,x19,x24
+ adc x19,x20,xzr
+
+ adds x14,x14,x8 // accumulate low parts
+ umulh x8,x4,x3
+ adcs x15,x15,x9
+ umulh x9,x5,x3
+ adcs x16,x16,x10
+ umulh x10,x6,x3
+ adcs x17,x17,x11
+ umulh x11,x7,x3
+ adc x19,x19,xzr
+ mul x24,x14,x23
+ adds x15,x15,x8 // accumulate high parts
+ adcs x16,x16,x9
+ adcs x17,x17,x10
+ adcs x19,x19,x11
+ adc x20,xzr,xzr
+ ldr x3,[x2,#8*2] // b[i]
+
+ lsl x8,x24,#32
+ subs x16,x16,x24
+ lsr x9,x24,#32
+ sbcs x17,x17,x8
+ sbcs x19,x19,x9
+ sbc x20,x20,xzr
+
+ subs xzr,x14,#1
+ umulh x9,x12,x24
+ mul x10,x13,x24
+ umulh x11,x13,x24
+
+ adcs x10,x10,x9
+ mul x8,x4,x3
+ adc x11,x11,xzr
+ mul x9,x5,x3
+
+ adds x14,x15,x10
+ mul x10,x6,x3
+ adcs x15,x16,x11
+ mul x11,x7,x3
+ adcs x16,x17,x24
+ adcs x17,x19,x24
+ adc x19,x20,xzr
+
+ adds x14,x14,x8 // accumulate low parts
+ umulh x8,x4,x3
+ adcs x15,x15,x9
+ umulh x9,x5,x3
+ adcs x16,x16,x10
+ umulh x10,x6,x3
+ adcs x17,x17,x11
+ umulh x11,x7,x3
+ adc x19,x19,xzr
+ mul x24,x14,x23
+ adds x15,x15,x8 // accumulate high parts
+ adcs x16,x16,x9
+ adcs x17,x17,x10
+ adcs x19,x19,x11
+ adc x20,xzr,xzr
+ ldr x3,[x2,#8*3] // b[i]
+
+ lsl x8,x24,#32
+ subs x16,x16,x24
+ lsr x9,x24,#32
+ sbcs x17,x17,x8
+ sbcs x19,x19,x9
+ sbc x20,x20,xzr
+
+ subs xzr,x14,#1
+ umulh x9,x12,x24
+ mul x10,x13,x24
+ umulh x11,x13,x24
+
+ adcs x10,x10,x9
+ mul x8,x4,x3
+ adc x11,x11,xzr
+ mul x9,x5,x3
+
+ adds x14,x15,x10
+ mul x10,x6,x3
+ adcs x15,x16,x11
+ mul x11,x7,x3
+ adcs x16,x17,x24
+ adcs x17,x19,x24
+ adc x19,x20,xzr
+
+ adds x14,x14,x8 // accumulate low parts
+ umulh x8,x4,x3
+ adcs x15,x15,x9
+ umulh x9,x5,x3
+ adcs x16,x16,x10
+ umulh x10,x6,x3
+ adcs x17,x17,x11
+ umulh x11,x7,x3
+ adc x19,x19,xzr
+ mul x24,x14,x23
+ adds x15,x15,x8 // accumulate high parts
+ adcs x16,x16,x9
+ adcs x17,x17,x10
+ adcs x19,x19,x11
+ adc x20,xzr,xzr
+ lsl x8,x24,#32 // last reduction
+ subs x16,x16,x24
+ lsr x9,x24,#32
+ sbcs x17,x17,x8
+ sbcs x19,x19,x9
+ sbc x20,x20,xzr
+
+ subs xzr,x14,#1
+ umulh x9,x12,x24
+ mul x10,x13,x24
+ umulh x11,x13,x24
+
+ adcs x10,x10,x9
+ adc x11,x11,xzr
+
+ adds x14,x15,x10
+ adcs x15,x16,x11
+ adcs x16,x17,x24
+ adcs x17,x19,x24
+ adc x19,x20,xzr
+
+ subs x8,x14,x12 // ret -= modulus
+ sbcs x9,x15,x13
+ sbcs x10,x16,x21
+ sbcs x11,x17,x22
+ sbcs xzr,x19,xzr
+
+ csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
+ csel x15,x15,x9,lo
+ csel x16,x16,x10,lo
+ stp x14,x15,[x0]
+ csel x17,x17,x11,lo
+ stp x16,x17,[x0,#16]
+
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldr x29,[sp],#64
+ ret
+
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
+// uint64_t rep);
+.globl _ecp_nistz256_ord_sqr_mont
+.private_extern _ecp_nistz256_ord_sqr_mont
+
+.align 4
+_ecp_nistz256_ord_sqr_mont:
+ AARCH64_VALID_CALL_TARGET
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+
+ adrp x23,Lord@PAGE
+ add x23,x23,Lord@PAGEOFF
+ ldp x4,x5,[x1]
+ ldp x6,x7,[x1,#16]
+
+ ldp x12,x13,[x23,#0]
+ ldp x21,x22,[x23,#16]
+ ldr x23,[x23,#32]
+ b Loop_ord_sqr
+
+.align 4
+Loop_ord_sqr:
+ sub x2,x2,#1
+ ////////////////////////////////////////////////////////////////
+ // | | | | | |a1*a0| |
+ // | | | | |a2*a0| | |
+ // | |a3*a2|a3*a0| | | |
+ // | | | |a2*a1| | | |
+ // | | |a3*a1| | | | |
+ // *| | | | | | | | 2|
+ // +|a3*a3|a2*a2|a1*a1|a0*a0|
+ // |--+--+--+--+--+--+--+--|
+	// |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is the x-th 64-bit result word.
+	//
+	// "can't overflow" below marks carries into the high part of a
+	// multiplication result, which can't overflow because the high part
+	// can never be all ones.
+
+ mul x15,x5,x4 // a[1]*a[0]
+ umulh x9,x5,x4
+ mul x16,x6,x4 // a[2]*a[0]
+ umulh x10,x6,x4
+ mul x17,x7,x4 // a[3]*a[0]
+ umulh x19,x7,x4
+
+ adds x16,x16,x9 // accumulate high parts of multiplication
+ mul x8,x6,x5 // a[2]*a[1]
+ umulh x9,x6,x5
+ adcs x17,x17,x10
+ mul x10,x7,x5 // a[3]*a[1]
+ umulh x11,x7,x5
+ adc x19,x19,xzr // can't overflow
+
+ mul x20,x7,x6 // a[3]*a[2]
+ umulh x1,x7,x6
+
+ adds x9,x9,x10 // accumulate high parts of multiplication
+ mul x14,x4,x4 // a[0]*a[0]
+ adc x10,x11,xzr // can't overflow
+
+ adds x17,x17,x8 // accumulate low parts of multiplication
+ umulh x4,x4,x4
+ adcs x19,x19,x9
+ mul x9,x5,x5 // a[1]*a[1]
+ adcs x20,x20,x10
+ umulh x5,x5,x5
+ adc x1,x1,xzr // can't overflow
+
+ adds x15,x15,x15 // acc[1-6]*=2
+ mul x10,x6,x6 // a[2]*a[2]
+ adcs x16,x16,x16
+ umulh x6,x6,x6
+ adcs x17,x17,x17
+ mul x11,x7,x7 // a[3]*a[3]
+ adcs x19,x19,x19
+ umulh x7,x7,x7
+ adcs x20,x20,x20
+ adcs x1,x1,x1
+ adc x3,xzr,xzr
+
+ adds x15,x15,x4 // +a[i]*a[i]
+ mul x24,x14,x23
+ adcs x16,x16,x9
+ adcs x17,x17,x5
+ adcs x19,x19,x10
+ adcs x20,x20,x6
+ adcs x1,x1,x11
+ adc x3,x3,x7
+ subs xzr,x14,#1
+ umulh x9,x12,x24
+ mul x10,x13,x24
+ umulh x11,x13,x24
+
+ adcs x10,x10,x9
+ adc x11,x11,xzr
+
+ adds x14,x15,x10
+ adcs x15,x16,x11
+ adcs x16,x17,x24
+ adc x17,xzr,x24 // can't overflow
+ mul x11,x14,x23
+ lsl x8,x24,#32
+ subs x15,x15,x24
+ lsr x9,x24,#32
+ sbcs x16,x16,x8
+ sbc x17,x17,x9 // can't borrow
+ subs xzr,x14,#1
+ umulh x9,x12,x11
+ mul x10,x13,x11
+ umulh x24,x13,x11
+
+ adcs x10,x10,x9
+ adc x24,x24,xzr
+
+ adds x14,x15,x10
+ adcs x15,x16,x24
+ adcs x16,x17,x11
+ adc x17,xzr,x11 // can't overflow
+ mul x24,x14,x23
+ lsl x8,x11,#32
+ subs x15,x15,x11
+ lsr x9,x11,#32
+ sbcs x16,x16,x8
+ sbc x17,x17,x9 // can't borrow
+ subs xzr,x14,#1
+ umulh x9,x12,x24
+ mul x10,x13,x24
+ umulh x11,x13,x24
+
+ adcs x10,x10,x9
+ adc x11,x11,xzr
+
+ adds x14,x15,x10
+ adcs x15,x16,x11
+ adcs x16,x17,x24
+ adc x17,xzr,x24 // can't overflow
+ mul x11,x14,x23
+ lsl x8,x24,#32
+ subs x15,x15,x24
+ lsr x9,x24,#32
+ sbcs x16,x16,x8
+ sbc x17,x17,x9 // can't borrow
+ subs xzr,x14,#1
+ umulh x9,x12,x11
+ mul x10,x13,x11
+ umulh x24,x13,x11
+
+ adcs x10,x10,x9
+ adc x24,x24,xzr
+
+ adds x14,x15,x10
+ adcs x15,x16,x24
+ adcs x16,x17,x11
+ adc x17,xzr,x11 // can't overflow
+ lsl x8,x11,#32
+ subs x15,x15,x11
+ lsr x9,x11,#32
+ sbcs x16,x16,x8
+ sbc x17,x17,x9 // can't borrow
+ adds x14,x14,x19 // accumulate upper half
+ adcs x15,x15,x20
+ adcs x16,x16,x1
+ adcs x17,x17,x3
+ adc x19,xzr,xzr
+
+ subs x8,x14,x12 // ret -= modulus
+ sbcs x9,x15,x13
+ sbcs x10,x16,x21
+ sbcs x11,x17,x22
+ sbcs xzr,x19,xzr
+
+ csel x4,x14,x8,lo // ret = borrow ? ret : ret-modulus
+ csel x5,x15,x9,lo
+ csel x6,x16,x10,lo
+ csel x7,x17,x11,lo
+
+ cbnz x2,Loop_ord_sqr
+
+ stp x4,x5,[x0]
+ stp x6,x7,[x0,#16]
+
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldr x29,[sp],#64
+ ret
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
+.globl _ecp_nistz256_select_w5
+.private_extern _ecp_nistz256_select_w5
+
+.align 4
+_ecp_nistz256_select_w5:
+ AARCH64_VALID_CALL_TARGET
+
+ // x10 := x0
+ // w9 := 0; loop counter and incremented internal index
+ mov x10, x0
+ mov w9, #0
+
+ // [v16-v21] := 0
+ movi v16.16b, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
+ movi v19.16b, #0
+ movi v20.16b, #0
+ movi v21.16b, #0
+
+Lselect_w5_loop:
+ // Loop 16 times.
+
+ // Increment index (loop counter); tested at the end of the loop
+ add w9, w9, #1
+
+ // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1
+ // and advance x1 to point to the next entry
+ ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
+
+ // x11 := (w9 == w2)? All 1s : All 0s
+ cmp w9, w2
+ csetm x11, eq
+
+ // continue loading ...
+ ld1 {v26.2d, v27.2d}, [x1],#32
+
+ // duplicate mask_64 into Mask (all 0s or all 1s)
+ dup v3.2d, x11
+
+ // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
+ // i.e., values in output registers will remain the same if w9 != w2
+ bit v16.16b, v22.16b, v3.16b
+ bit v17.16b, v23.16b, v3.16b
+
+ bit v18.16b, v24.16b, v3.16b
+ bit v19.16b, v25.16b, v3.16b
+
+ bit v20.16b, v26.16b, v3.16b
+ bit v21.16b, v27.16b, v3.16b
+
+	// If bit #4 is 0 (i.e. idx_ctr < 16) loop back
+ tbz w9, #4, Lselect_w5_loop
+
+ // Write [v16-v21] to memory at the output pointer
+ st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64
+ st1 {v20.2d, v21.2d}, [x10]
+
+ ret
+
+
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
+.globl _ecp_nistz256_select_w7
+.private_extern _ecp_nistz256_select_w7
+
+.align 4
+_ecp_nistz256_select_w7:
+ AARCH64_VALID_CALL_TARGET
+
+ // w9 := 0; loop counter and incremented internal index
+ mov w9, #0
+
+ // [v16-v21] := 0
+ movi v16.16b, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
+ movi v19.16b, #0
+
+Lselect_w7_loop:
+ // Loop 64 times.
+
+ // Increment index (loop counter); tested at the end of the loop
+ add w9, w9, #1
+
+ // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1
+ // and advance x1 to point to the next entry
+ ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
+
+ // x11 := (w9 == w2)? All 1s : All 0s
+ cmp w9, w2
+ csetm x11, eq
+
+ // duplicate mask_64 into Mask (all 0s or all 1s)
+ dup v3.2d, x11
+
+ // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
+ // i.e., values in output registers will remain the same if w9 != w2
+ bit v16.16b, v22.16b, v3.16b
+ bit v17.16b, v23.16b, v3.16b
+
+ bit v18.16b, v24.16b, v3.16b
+ bit v19.16b, v25.16b, v3.16b
+
+	// If bit #6 is 0 (i.e. idx_ctr < 64) loop back
+ tbz w9, #6, Lselect_w7_loop
+
+ // Write [v16-v19] to memory at the output pointer
+ st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0]
+
+ ret
+
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
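
Throughout these routines the final modular reduction is branch-free: the code
computes ret - modulus with subs/sbcs and then uses csel on the borrow flag to
keep either the reduced or the original value ("ret = borrow ? ret :
ret-modulus"). The C sketch below shows the same idea for orientation; the
helper name, limb layout, and use of unsigned __int128 for borrow tracking are
illustrative assumptions, not part of the generated code.

    #include <stdint.h>

    // Conditionally subtract a 4-limb modulus, branch-free: compute a - mod,
    // then select the original value if the subtraction borrowed (a < mod).
    static void cond_sub_mod(uint64_t r[4], const uint64_t a[4],
                             const uint64_t mod[4]) {
      uint64_t t[4], borrow = 0;
      for (int i = 0; i < 4; i++) {
        unsigned __int128 d = (unsigned __int128)a[i] - mod[i] - borrow;
        t[i] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;  // 1 if this limb borrowed
      }
      uint64_t mask = 0 - borrow;  // all ones when a < mod, like csel ...,lo
      for (int i = 0; i < 4; i++) {
        r[i] = (t[i] & ~mask) | (a[i] & mask);
      }
    }

The same flag-then-select pattern, using csetm and the NEON bit instruction
instead of csel, drives the constant-time table lookups in
ecp_nistz256_select_w5 and ecp_nistz256_select_w7 above.
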
diff --git a/gen/bcm/p256-armv8-asm-linux.S b/gen/bcm/p256-armv8-asm-linux.S
new file mode 100644
index 0000000..28d9ac9
--- /dev/null
+++ b/gen/bcm/p256-armv8-asm-linux.S
@@ -0,0 +1,1726 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include "openssl/arm_arch.h"
+
+.section .rodata
+.align 5
+.Lpoly:
+.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
+.LRR: // 2^512 mod P precomputed for NIST P256 polynomial
+.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
+.Lone_mont:
+.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
+.Lone:
+.quad 1,0,0,0
+.Lord:
+.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
+.LordK:
+.quad 0xccd1c8aaee00bc4f
+.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.text
+
+// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
+// const BN_ULONG x2[4]);
+.globl ecp_nistz256_mul_mont
+.hidden ecp_nistz256_mul_mont
+.type ecp_nistz256_mul_mont,%function
+.align 4
+ecp_nistz256_mul_mont:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-32]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+
+ ldr x3,[x2] // bp[0]
+ ldp x4,x5,[x1]
+ ldp x6,x7,[x1,#16]
+ adrp x13,.Lpoly
+ add x13,x13,:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+
+ bl __ecp_nistz256_mul_mont
+
+ ldp x19,x20,[sp,#16]
+ ldp x29,x30,[sp],#32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
+
+// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl ecp_nistz256_sqr_mont
+.hidden ecp_nistz256_sqr_mont
+.type ecp_nistz256_sqr_mont,%function
+.align 4
+ecp_nistz256_sqr_mont:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-32]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+
+ ldp x4,x5,[x1]
+ ldp x6,x7,[x1,#16]
+ adrp x13,.Lpoly
+ add x13,x13,:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+
+ bl __ecp_nistz256_sqr_mont
+
+ ldp x19,x20,[sp,#16]
+ ldp x29,x30,[sp],#32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
+
+// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl ecp_nistz256_div_by_2
+.hidden ecp_nistz256_div_by_2
+.type ecp_nistz256_div_by_2,%function
+.align 4
+ecp_nistz256_div_by_2:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ldp x14,x15,[x1]
+ ldp x16,x17,[x1,#16]
+ adrp x13,.Lpoly
+ add x13,x13,:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+
+ bl __ecp_nistz256_div_by_2
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
+
+// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl ecp_nistz256_mul_by_2
+.hidden ecp_nistz256_mul_by_2
+.type ecp_nistz256_mul_by_2,%function
+.align 4
+ecp_nistz256_mul_by_2:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ldp x14,x15,[x1]
+ ldp x16,x17,[x1,#16]
+ adrp x13,.Lpoly
+ add x13,x13,:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+ mov x8,x14
+ mov x9,x15
+ mov x10,x16
+ mov x11,x17
+
+ bl __ecp_nistz256_add_to // ret = a+a // 2*a
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
+
+// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl ecp_nistz256_mul_by_3
+.hidden ecp_nistz256_mul_by_3
+.type ecp_nistz256_mul_by_3,%function
+.align 4
+ecp_nistz256_mul_by_3:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ldp x14,x15,[x1]
+ ldp x16,x17,[x1,#16]
+ adrp x13,.Lpoly
+ add x13,x13,:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+ mov x8,x14
+ mov x9,x15
+ mov x10,x16
+ mov x11,x17
+ mov x4,x14
+ mov x5,x15
+ mov x6,x16
+ mov x7,x17
+
+ bl __ecp_nistz256_add_to // ret = a+a // 2*a
+
+ mov x8,x4
+ mov x9,x5
+ mov x10,x6
+ mov x11,x7
+
+ bl __ecp_nistz256_add_to // ret += a // 2*a+a=3*a
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
+
+// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
+// const BN_ULONG x2[4]);
+.globl ecp_nistz256_sub
+.hidden ecp_nistz256_sub
+.type ecp_nistz256_sub,%function
+.align 4
+ecp_nistz256_sub:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ldp x14,x15,[x1]
+ ldp x16,x17,[x1,#16]
+ adrp x13,.Lpoly
+ add x13,x13,:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+
+ bl __ecp_nistz256_sub_from
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_sub,.-ecp_nistz256_sub
+
+// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl ecp_nistz256_neg
+.hidden ecp_nistz256_neg
+.type ecp_nistz256_neg,%function
+.align 4
+ecp_nistz256_neg:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ mov x2,x1
+ mov x14,xzr // a = 0
+ mov x15,xzr
+ mov x16,xzr
+ mov x17,xzr
+ adrp x13,.Lpoly
+ add x13,x13,:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+
+ bl __ecp_nistz256_sub_from
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_neg,.-ecp_nistz256_neg
+
+// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
+// to x4-x7 and b[0] - to x3
+.type __ecp_nistz256_mul_mont,%function
+.align 4
+__ecp_nistz256_mul_mont:
+ mul x14,x4,x3 // a[0]*b[0]
+ umulh x8,x4,x3
+
+ mul x15,x5,x3 // a[1]*b[0]
+ umulh x9,x5,x3
+
+ mul x16,x6,x3 // a[2]*b[0]
+ umulh x10,x6,x3
+
+ mul x17,x7,x3 // a[3]*b[0]
+ umulh x11,x7,x3
+ ldr x3,[x2,#8] // b[1]
+
+ adds x15,x15,x8 // accumulate high parts of multiplication
+ lsl x8,x14,#32
+ adcs x16,x16,x9
+ lsr x9,x14,#32
+ adcs x17,x17,x10
+ adc x19,xzr,x11
+ mov x20,xzr
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ mul x8,x4,x3 // lo(a[0]*b[i])
+ adcs x15,x16,x9
+ mul x9,x5,x3 // lo(a[1]*b[i])
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ mul x10,x6,x3 // lo(a[2]*b[i])
+ adcs x17,x19,x11
+ mul x11,x7,x3 // lo(a[3]*b[i])
+ adc x19,x20,xzr
+
+ adds x14,x14,x8 // accumulate low parts of multiplication
+ umulh x8,x4,x3 // hi(a[0]*b[i])
+ adcs x15,x15,x9
+ umulh x9,x5,x3 // hi(a[1]*b[i])
+ adcs x16,x16,x10
+ umulh x10,x6,x3 // hi(a[2]*b[i])
+ adcs x17,x17,x11
+ umulh x11,x7,x3 // hi(a[3]*b[i])
+ adc x19,x19,xzr
+ ldr x3,[x2,#8*(1+1)] // b[1+1]
+ adds x15,x15,x8 // accumulate high parts of multiplication
+ lsl x8,x14,#32
+ adcs x16,x16,x9
+ lsr x9,x14,#32
+ adcs x17,x17,x10
+ adcs x19,x19,x11
+ adc x20,xzr,xzr
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ mul x8,x4,x3 // lo(a[0]*b[i])
+ adcs x15,x16,x9
+ mul x9,x5,x3 // lo(a[1]*b[i])
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ mul x10,x6,x3 // lo(a[2]*b[i])
+ adcs x17,x19,x11
+ mul x11,x7,x3 // lo(a[3]*b[i])
+ adc x19,x20,xzr
+
+ adds x14,x14,x8 // accumulate low parts of multiplication
+ umulh x8,x4,x3 // hi(a[0]*b[i])
+ adcs x15,x15,x9
+ umulh x9,x5,x3 // hi(a[1]*b[i])
+ adcs x16,x16,x10
+ umulh x10,x6,x3 // hi(a[2]*b[i])
+ adcs x17,x17,x11
+ umulh x11,x7,x3 // hi(a[3]*b[i])
+ adc x19,x19,xzr
+ ldr x3,[x2,#8*(2+1)] // b[2+1]
+ adds x15,x15,x8 // accumulate high parts of multiplication
+ lsl x8,x14,#32
+ adcs x16,x16,x9
+ lsr x9,x14,#32
+ adcs x17,x17,x10
+ adcs x19,x19,x11
+ adc x20,xzr,xzr
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ mul x8,x4,x3 // lo(a[0]*b[i])
+ adcs x15,x16,x9
+ mul x9,x5,x3 // lo(a[1]*b[i])
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ mul x10,x6,x3 // lo(a[2]*b[i])
+ adcs x17,x19,x11
+ mul x11,x7,x3 // lo(a[3]*b[i])
+ adc x19,x20,xzr
+
+ adds x14,x14,x8 // accumulate low parts of multiplication
+ umulh x8,x4,x3 // hi(a[0]*b[i])
+ adcs x15,x15,x9
+ umulh x9,x5,x3 // hi(a[1]*b[i])
+ adcs x16,x16,x10
+ umulh x10,x6,x3 // hi(a[2]*b[i])
+ adcs x17,x17,x11
+ umulh x11,x7,x3 // hi(a[3]*b[i])
+ adc x19,x19,xzr
+ adds x15,x15,x8 // accumulate high parts of multiplication
+ lsl x8,x14,#32
+ adcs x16,x16,x9
+ lsr x9,x14,#32
+ adcs x17,x17,x10
+ adcs x19,x19,x11
+ adc x20,xzr,xzr
+ // last reduction
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ adcs x15,x16,x9
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ adcs x17,x19,x11
+ adc x19,x20,xzr
+
+ adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus
+ sbcs x9,x15,x12
+ sbcs x10,x16,xzr
+ sbcs x11,x17,x13
+ sbcs xzr,x19,xzr // did it borrow?
+
+ csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
+ csel x15,x15,x9,lo
+ csel x16,x16,x10,lo
+ stp x14,x15,[x0]
+ csel x17,x17,x11,lo
+ stp x16,x17,[x0,#16]
+
+ ret
+.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
+
+// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
+// to x4-x7
+.type __ecp_nistz256_sqr_mont,%function
+.align 4
+__ecp_nistz256_sqr_mont:
+ // | | | | | |a1*a0| |
+ // | | | | |a2*a0| | |
+ // | |a3*a2|a3*a0| | | |
+ // | | | |a2*a1| | | |
+ // | | |a3*a1| | | | |
+ // *| | | | | | | | 2|
+ // +|a3*a3|a2*a2|a1*a1|a0*a0|
+ // |--+--+--+--+--+--+--+--|
+	// |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is the x-th 64-bit result word.
+	//
+	// "can't overflow" below marks carries into the high part of a
+	// multiplication result, which can't overflow because the high part
+	// can never be all ones.
+
+ mul x15,x5,x4 // a[1]*a[0]
+ umulh x9,x5,x4
+ mul x16,x6,x4 // a[2]*a[0]
+ umulh x10,x6,x4
+ mul x17,x7,x4 // a[3]*a[0]
+ umulh x19,x7,x4
+
+ adds x16,x16,x9 // accumulate high parts of multiplication
+ mul x8,x6,x5 // a[2]*a[1]
+ umulh x9,x6,x5
+ adcs x17,x17,x10
+ mul x10,x7,x5 // a[3]*a[1]
+ umulh x11,x7,x5
+ adc x19,x19,xzr // can't overflow
+
+ mul x20,x7,x6 // a[3]*a[2]
+ umulh x1,x7,x6
+
+ adds x9,x9,x10 // accumulate high parts of multiplication
+ mul x14,x4,x4 // a[0]*a[0]
+ adc x10,x11,xzr // can't overflow
+
+ adds x17,x17,x8 // accumulate low parts of multiplication
+ umulh x4,x4,x4
+ adcs x19,x19,x9
+ mul x9,x5,x5 // a[1]*a[1]
+ adcs x20,x20,x10
+ umulh x5,x5,x5
+ adc x1,x1,xzr // can't overflow
+
+ adds x15,x15,x15 // acc[1-6]*=2
+ mul x10,x6,x6 // a[2]*a[2]
+ adcs x16,x16,x16
+ umulh x6,x6,x6
+ adcs x17,x17,x17
+ mul x11,x7,x7 // a[3]*a[3]
+ adcs x19,x19,x19
+ umulh x7,x7,x7
+ adcs x20,x20,x20
+ adcs x1,x1,x1
+ adc x2,xzr,xzr
+
+ adds x15,x15,x4 // +a[i]*a[i]
+ adcs x16,x16,x9
+ adcs x17,x17,x5
+ adcs x19,x19,x10
+ adcs x20,x20,x6
+ lsl x8,x14,#32
+ adcs x1,x1,x11
+ lsr x9,x14,#32
+ adc x2,x2,x7
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ adcs x15,x16,x9
+ lsl x8,x14,#32
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ lsr x9,x14,#32
+ adc x17,x11,xzr // can't overflow
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ adcs x15,x16,x9
+ lsl x8,x14,#32
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ lsr x9,x14,#32
+ adc x17,x11,xzr // can't overflow
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ adcs x15,x16,x9
+ lsl x8,x14,#32
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ lsr x9,x14,#32
+ adc x17,x11,xzr // can't overflow
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ adcs x15,x16,x9
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ adc x17,x11,xzr // can't overflow
+
+ adds x14,x14,x19 // accumulate upper half
+ adcs x15,x15,x20
+ adcs x16,x16,x1
+ adcs x17,x17,x2
+ adc x19,xzr,xzr
+
+ adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus
+ sbcs x9,x15,x12
+ sbcs x10,x16,xzr
+ sbcs x11,x17,x13
+ sbcs xzr,x19,xzr // did it borrow?
+
+ csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
+ csel x15,x15,x9,lo
+ csel x16,x16,x10,lo
+ stp x14,x15,[x0]
+ csel x17,x17,x11,lo
+ stp x16,x17,[x0,#16]
+
+ ret
+.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
+
+// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to
+// x4-x7 and x8-x11. This is done because it's used in multiple
+// contexts, e.g. in multiplication by 2 and 3...
+.type __ecp_nistz256_add_to,%function
+.align 4
+__ecp_nistz256_add_to:
+ adds x14,x14,x8 // ret = a+b
+ adcs x15,x15,x9
+ adcs x16,x16,x10
+ adcs x17,x17,x11
+ adc x1,xzr,xzr // zap x1
+
+ adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus
+ sbcs x9,x15,x12
+ sbcs x10,x16,xzr
+ sbcs x11,x17,x13
+ sbcs xzr,x1,xzr // did subtraction borrow?
+
+ csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
+ csel x15,x15,x9,lo
+ csel x16,x16,x10,lo
+ stp x14,x15,[x0]
+ csel x17,x17,x11,lo
+ stp x16,x17,[x0,#16]
+
+ ret
+.size __ecp_nistz256_add_to,.-__ecp_nistz256_add_to
+
+.type __ecp_nistz256_sub_from,%function
+.align 4
+__ecp_nistz256_sub_from:
+ ldp x8,x9,[x2]
+ ldp x10,x11,[x2,#16]
+ subs x14,x14,x8 // ret = a-b
+ sbcs x15,x15,x9
+ sbcs x16,x16,x10
+ sbcs x17,x17,x11
+ sbc x1,xzr,xzr // zap x1
+
+ subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus
+ adcs x9,x15,x12
+ adcs x10,x16,xzr
+ adc x11,x17,x13
+ cmp x1,xzr // did subtraction borrow?
+
+ csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret
+ csel x15,x15,x9,eq
+ csel x16,x16,x10,eq
+ stp x14,x15,[x0]
+ csel x17,x17,x11,eq
+ stp x16,x17,[x0,#16]
+
+ ret
+.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
+
+.type __ecp_nistz256_sub_morf,%function
+.align 4
+__ecp_nistz256_sub_morf:
+ ldp x8,x9,[x2]
+ ldp x10,x11,[x2,#16]
+ subs x14,x8,x14 // ret = b-a
+ sbcs x15,x9,x15
+ sbcs x16,x10,x16
+ sbcs x17,x11,x17
+ sbc x1,xzr,xzr // zap x1
+
+ subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus
+ adcs x9,x15,x12
+ adcs x10,x16,xzr
+ adc x11,x17,x13
+ cmp x1,xzr // did subtraction borrow?
+
+ csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret
+ csel x15,x15,x9,eq
+ csel x16,x16,x10,eq
+ stp x14,x15,[x0]
+ csel x17,x17,x11,eq
+ stp x16,x17,[x0,#16]
+
+ ret
+.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
+
+.type __ecp_nistz256_div_by_2,%function
+.align 4
+__ecp_nistz256_div_by_2:
+ subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus
+ adcs x9,x15,x12
+ adcs x10,x16,xzr
+ adcs x11,x17,x13
+ adc x1,xzr,xzr // zap x1
+ tst x14,#1 // is a even?
+
+ csel x14,x14,x8,eq // ret = even ? a : a+modulus
+ csel x15,x15,x9,eq
+ csel x16,x16,x10,eq
+ csel x17,x17,x11,eq
+ csel x1,xzr,x1,eq
+
+ lsr x14,x14,#1 // ret >>= 1
+ orr x14,x14,x15,lsl#63
+ lsr x15,x15,#1
+ orr x15,x15,x16,lsl#63
+ lsr x16,x16,#1
+ orr x16,x16,x17,lsl#63
+ lsr x17,x17,#1
+ stp x14,x15,[x0]
+ orr x17,x17,x1,lsl#63
+ stp x16,x17,[x0,#16]
+
+ ret
+.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
+.globl ecp_nistz256_point_double
+.hidden ecp_nistz256_point_double
+.type ecp_nistz256_point_double,%function
+.align 5
+ecp_nistz256_point_double:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ sub sp,sp,#32*4
+
+.Ldouble_shortcut:
+ ldp x14,x15,[x1,#32]
+ mov x21,x0
+ ldp x16,x17,[x1,#48]
+ mov x22,x1
+ adrp x13,.Lpoly
+ add x13,x13,:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ mov x8,x14
+ ldr x13,[x13,#24]
+ mov x9,x15
+ ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont
+ mov x10,x16
+ mov x11,x17
+ ldp x6,x7,[x22,#64+16]
+ add x0,sp,#0
+ bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y);
+
+ add x0,sp,#64
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z);
+
+ ldp x8,x9,[x22]
+ ldp x10,x11,[x22,#16]
+ mov x4,x14 // put Zsqr aside for p256_sub
+ mov x5,x15
+ mov x6,x16
+ mov x7,x17
+ add x0,sp,#32
+ bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x);
+
+ add x2,x22,#0
+ mov x14,x4 // restore Zsqr
+ mov x15,x5
+ ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont
+ mov x16,x6
+ mov x17,x7
+ ldp x6,x7,[sp,#0+16]
+ add x0,sp,#64
+ bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr);
+
+ add x0,sp,#0
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S);
+
+ ldr x3,[x22,#32]
+ ldp x4,x5,[x22,#64]
+ ldp x6,x7,[x22,#64+16]
+ add x2,x22,#32
+ add x0,sp,#96
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y);
+
+ mov x8,x14
+ mov x9,x15
+ ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont
+ mov x10,x16
+ mov x11,x17
+ ldp x6,x7,[sp,#0+16]
+ add x0,x21,#64
+ bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0);
+
+ add x0,sp,#96
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S);
+
+ ldr x3,[sp,#64] // forward load for p256_mul_mont
+ ldp x4,x5,[sp,#32]
+ ldp x6,x7,[sp,#32+16]
+ add x0,x21,#32
+ bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0);
+
+ add x2,sp,#64
+ add x0,sp,#32
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr);
+
+ mov x8,x14 // duplicate M
+ mov x9,x15
+ mov x10,x16
+ mov x11,x17
+ mov x4,x14 // put M aside
+ mov x5,x15
+ mov x6,x16
+ mov x7,x17
+ add x0,sp,#32
+ bl __ecp_nistz256_add_to
+ mov x8,x4 // restore M
+ mov x9,x5
+ ldr x3,[x22] // forward load for p256_mul_mont
+ mov x10,x6
+ ldp x4,x5,[sp,#0]
+ mov x11,x7
+ ldp x6,x7,[sp,#0+16]
+ bl __ecp_nistz256_add_to // p256_mul_by_3(M, M);
+
+ add x2,x22,#0
+ add x0,sp,#0
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x);
+
+ mov x8,x14
+ mov x9,x15
+ ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont
+ mov x10,x16
+ mov x11,x17
+ ldp x6,x7,[sp,#32+16]
+ add x0,sp,#96
+ bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S);
+
+ add x0,x21,#0
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M);
+
+ add x2,sp,#96
+ bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0);
+
+ add x2,sp,#0
+ add x0,sp,#0
+ bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x);
+
+ ldr x3,[sp,#32]
+ mov x4,x14 // copy S
+ mov x5,x15
+ mov x6,x16
+ mov x7,x17
+ add x2,sp,#32
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M);
+
+ add x2,x21,#32
+ add x0,x21,#32
+ bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y);
+
+ add sp,x29,#0 // destroy frame
+ ldp x19,x20,[x29,#16]
+ ldp x21,x22,[x29,#32]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_point_double,.-ecp_nistz256_point_double
+.globl ecp_nistz256_point_add
+.hidden ecp_nistz256_point_add
+.type ecp_nistz256_point_add,%function
+.align 5
+ecp_nistz256_point_add:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#32*12
+
+ ldp x4,x5,[x2,#64] // in2_z
+ ldp x6,x7,[x2,#64+16]
+ mov x21,x0
+ mov x22,x1
+ mov x23,x2
+ adrp x13,.Lpoly
+ add x13,x13,:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+ orr x8,x4,x5
+ orr x10,x6,x7
+ orr x25,x8,x10
+ cmp x25,#0
+ csetm x25,ne // ~in2infty
+ add x0,sp,#192
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z);
+
+ ldp x4,x5,[x22,#64] // in1_z
+ ldp x6,x7,[x22,#64+16]
+ orr x8,x4,x5
+ orr x10,x6,x7
+ orr x24,x8,x10
+ cmp x24,#0
+ csetm x24,ne // ~in1infty
+ add x0,sp,#128
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);
+
+ ldr x3,[x23,#64]
+ ldp x4,x5,[sp,#192]
+ ldp x6,x7,[sp,#192+16]
+ add x2,x23,#64
+ add x0,sp,#320
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z);
+
+ ldr x3,[x22,#64]
+ ldp x4,x5,[sp,#128]
+ ldp x6,x7,[sp,#128+16]
+ add x2,x22,#64
+ add x0,sp,#352
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);
+
+ ldr x3,[x22,#32]
+ ldp x4,x5,[sp,#320]
+ ldp x6,x7,[sp,#320+16]
+ add x2,x22,#32
+ add x0,sp,#320
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y);
+
+ ldr x3,[x23,#32]
+ ldp x4,x5,[sp,#352]
+ ldp x6,x7,[sp,#352+16]
+ add x2,x23,#32
+ add x0,sp,#352
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);
+
+ add x2,sp,#320
+ ldr x3,[sp,#192] // forward load for p256_mul_mont
+ ldp x4,x5,[x22]
+ ldp x6,x7,[x22,#16]
+ add x0,sp,#160
+ bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1);
+
+ orr x14,x14,x15 // see if result is zero
+ orr x16,x16,x17
+ orr x26,x14,x16 // ~is_equal(S1,S2)
+
+ add x2,sp,#192
+ add x0,sp,#256
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr);
+
+ ldr x3,[sp,#128]
+ ldp x4,x5,[x23]
+ ldp x6,x7,[x23,#16]
+ add x2,sp,#128
+ add x0,sp,#288
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr);
+
+ add x2,sp,#256
+ ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont
+ ldp x6,x7,[sp,#160+16]
+ add x0,sp,#96
+ bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1);
+
+ orr x14,x14,x15 // see if result is zero
+ orr x16,x16,x17
+ orr x14,x14,x16 // ~is_equal(U1,U2)
+
+ mvn x27,x24 // -1/0 -> 0/-1
+ mvn x28,x25 // -1/0 -> 0/-1
+ orr x14,x14,x27
+ orr x14,x14,x28
+ orr x14,x14,x26
+ cbnz x14,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
+
+.Ladd_double:
+ mov x1,x22
+ mov x0,x21
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ add sp,sp,#256 // #256 is from #32*(12-4). difference in stack frames
+ b .Ldouble_shortcut
+
+.align 4
+.Ladd_proceed:
+ add x0,sp,#192
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);
+
+ ldr x3,[x22,#64]
+ ldp x4,x5,[sp,#96]
+ ldp x6,x7,[sp,#96+16]
+ add x2,x22,#64
+ add x0,sp,#64
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);
+
+ ldp x4,x5,[sp,#96]
+ ldp x6,x7,[sp,#96+16]
+ add x0,sp,#128
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);
+
+ ldr x3,[x23,#64]
+ ldp x4,x5,[sp,#64]
+ ldp x6,x7,[sp,#64+16]
+ add x2,x23,#64
+ add x0,sp,#64
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z);
+
+ ldr x3,[sp,#96]
+ ldp x4,x5,[sp,#128]
+ ldp x6,x7,[sp,#128+16]
+ add x2,sp,#96
+ add x0,sp,#224
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);
+
+ ldr x3,[sp,#128]
+ ldp x4,x5,[sp,#256]
+ ldp x6,x7,[sp,#256+16]
+ add x2,sp,#128
+ add x0,sp,#288
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr);
+
+ mov x8,x14
+ mov x9,x15
+ mov x10,x16
+ mov x11,x17
+ add x0,sp,#128
+ bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2);
+
+ add x2,sp,#192
+ add x0,sp,#0
+ bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);
+
+ add x2,sp,#224
+ bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);
+
+ add x2,sp,#288
+ ldr x3,[sp,#224] // forward load for p256_mul_mont
+ ldp x4,x5,[sp,#320]
+ ldp x6,x7,[sp,#320+16]
+ add x0,sp,#32
+ bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);
+
+ add x2,sp,#224
+ add x0,sp,#352
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub);
+
+ ldr x3,[sp,#160]
+ ldp x4,x5,[sp,#32]
+ ldp x6,x7,[sp,#32+16]
+ add x2,sp,#160
+ add x0,sp,#32
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);
+
+ add x2,sp,#352
+ bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);
+
+ ldp x4,x5,[sp,#0] // res
+ ldp x6,x7,[sp,#0+16]
+ ldp x8,x9,[x23] // in2
+ ldp x10,x11,[x23,#16]
+ ldp x14,x15,[x22,#0] // in1
+ cmp x24,#0 // ~, remember?
+ ldp x16,x17,[x22,#0+16]
+ csel x8,x4,x8,ne
+ csel x9,x5,x9,ne
+ ldp x4,x5,[sp,#0+0+32] // res
+ csel x10,x6,x10,ne
+ csel x11,x7,x11,ne
+ cmp x25,#0 // ~, remember?
+ ldp x6,x7,[sp,#0+0+48]
+ csel x14,x8,x14,ne
+ csel x15,x9,x15,ne
+ ldp x8,x9,[x23,#0+32] // in2
+ csel x16,x10,x16,ne
+ csel x17,x11,x17,ne
+ ldp x10,x11,[x23,#0+48]
+ stp x14,x15,[x21,#0]
+ stp x16,x17,[x21,#0+16]
+ ldp x14,x15,[x22,#32] // in1
+ cmp x24,#0 // ~, remember?
+ ldp x16,x17,[x22,#32+16]
+ csel x8,x4,x8,ne
+ csel x9,x5,x9,ne
+ ldp x4,x5,[sp,#0+32+32] // res
+ csel x10,x6,x10,ne
+ csel x11,x7,x11,ne
+ cmp x25,#0 // ~, remember?
+ ldp x6,x7,[sp,#0+32+48]
+ csel x14,x8,x14,ne
+ csel x15,x9,x15,ne
+ ldp x8,x9,[x23,#32+32] // in2
+ csel x16,x10,x16,ne
+ csel x17,x11,x17,ne
+ ldp x10,x11,[x23,#32+48]
+ stp x14,x15,[x21,#32]
+ stp x16,x17,[x21,#32+16]
+ ldp x14,x15,[x22,#64] // in1
+ cmp x24,#0 // ~, remember?
+ ldp x16,x17,[x22,#64+16]
+ csel x8,x4,x8,ne
+ csel x9,x5,x9,ne
+ csel x10,x6,x10,ne
+ csel x11,x7,x11,ne
+ cmp x25,#0 // ~, remember?
+ csel x14,x8,x14,ne
+ csel x15,x9,x15,ne
+ csel x16,x10,x16,ne
+ csel x17,x11,x17,ne
+ stp x14,x15,[x21,#64]
+ stp x16,x17,[x21,#64+16]
+
+.Ladd_done:
+ add sp,x29,#0 // destroy frame
+ ldp x19,x20,[x29,#16]
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_point_add,.-ecp_nistz256_point_add
+.globl ecp_nistz256_point_add_affine
+.hidden ecp_nistz256_point_add_affine
+.type ecp_nistz256_point_add_affine,%function
+.align 5
+ecp_nistz256_point_add_affine:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ sub sp,sp,#32*10
+
+ mov x21,x0
+ mov x22,x1
+ mov x23,x2
+ adrp x13,.Lpoly
+ add x13,x13,:lo12:.Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+
+ ldp x4,x5,[x1,#64] // in1_z
+ ldp x6,x7,[x1,#64+16]
+ orr x8,x4,x5
+ orr x10,x6,x7
+ orr x24,x8,x10
+ cmp x24,#0
+ csetm x24,ne // ~in1infty
+
+ ldp x14,x15,[x2] // in2_x
+ ldp x16,x17,[x2,#16]
+ ldp x8,x9,[x2,#32] // in2_y
+ ldp x10,x11,[x2,#48]
+ orr x14,x14,x15
+ orr x16,x16,x17
+ orr x8,x8,x9
+ orr x10,x10,x11
+ orr x14,x14,x16
+ orr x8,x8,x10
+ orr x25,x14,x8
+ cmp x25,#0
+ csetm x25,ne // ~in2infty
+
+ add x0,sp,#128
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);
+
+ mov x4,x14
+ mov x5,x15
+ mov x6,x16
+ mov x7,x17
+ ldr x3,[x23]
+ add x2,x23,#0
+ add x0,sp,#96
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x);
+
+ add x2,x22,#0
+ ldr x3,[x22,#64] // forward load for p256_mul_mont
+ ldp x4,x5,[sp,#128]
+ ldp x6,x7,[sp,#128+16]
+ add x0,sp,#160
+ bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x);
+
+ add x2,x22,#64
+ add x0,sp,#128
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);
+
+ ldr x3,[x22,#64]
+ ldp x4,x5,[sp,#160]
+ ldp x6,x7,[sp,#160+16]
+ add x2,x22,#64
+ add x0,sp,#64
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);
+
+ ldr x3,[x23,#32]
+ ldp x4,x5,[sp,#128]
+ ldp x6,x7,[sp,#128+16]
+ add x2,x23,#32
+ add x0,sp,#128
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);
+
+ add x2,x22,#32
+ ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont
+ ldp x6,x7,[sp,#160+16]
+ add x0,sp,#192
+ bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y);
+
+ add x0,sp,#224
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);
+
+ ldp x4,x5,[sp,#192]
+ ldp x6,x7,[sp,#192+16]
+ add x0,sp,#288
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);
+
+ ldr x3,[sp,#160]
+ ldp x4,x5,[sp,#224]
+ ldp x6,x7,[sp,#224+16]
+ add x2,sp,#160
+ add x0,sp,#256
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);
+
+ ldr x3,[x22]
+ ldp x4,x5,[sp,#224]
+ ldp x6,x7,[sp,#224+16]
+ add x2,x22,#0
+ add x0,sp,#96
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr);
+
+ mov x8,x14
+ mov x9,x15
+ mov x10,x16
+ mov x11,x17
+ add x0,sp,#224
+ bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2);
+
+ add x2,sp,#288
+ add x0,sp,#0
+ bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);
+
+ add x2,sp,#256
+ bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);
+
+ add x2,sp,#96
+ ldr x3,[x22,#32] // forward load for p256_mul_mont
+ ldp x4,x5,[sp,#256]
+ ldp x6,x7,[sp,#256+16]
+ add x0,sp,#32
+ bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);
+
+ add x2,x22,#32
+ add x0,sp,#128
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub);
+
+ ldr x3,[sp,#192]
+ ldp x4,x5,[sp,#32]
+ ldp x6,x7,[sp,#32+16]
+ add x2,sp,#192
+ add x0,sp,#32
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);
+
+ add x2,sp,#128
+ bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);
+
+ ldp x4,x5,[sp,#0] // res
+ ldp x6,x7,[sp,#0+16]
+ ldp x8,x9,[x23] // in2
+ ldp x10,x11,[x23,#16]
+ ldp x14,x15,[x22,#0] // in1
+ cmp x24,#0 // ~, remember?
+ ldp x16,x17,[x22,#0+16]
+ csel x8,x4,x8,ne
+ csel x9,x5,x9,ne
+ ldp x4,x5,[sp,#0+0+32] // res
+ csel x10,x6,x10,ne
+ csel x11,x7,x11,ne
+ cmp x25,#0 // ~, remember?
+ ldp x6,x7,[sp,#0+0+48]
+ csel x14,x8,x14,ne
+ csel x15,x9,x15,ne
+ ldp x8,x9,[x23,#0+32] // in2
+ csel x16,x10,x16,ne
+ csel x17,x11,x17,ne
+ ldp x10,x11,[x23,#0+48]
+ stp x14,x15,[x21,#0]
+ stp x16,x17,[x21,#0+16]
+ adrp x23,.Lone_mont-64
+ add x23,x23,:lo12:.Lone_mont-64
+ ldp x14,x15,[x22,#32] // in1
+ cmp x24,#0 // ~, remember?
+ ldp x16,x17,[x22,#32+16]
+ csel x8,x4,x8,ne
+ csel x9,x5,x9,ne
+ ldp x4,x5,[sp,#0+32+32] // res
+ csel x10,x6,x10,ne
+ csel x11,x7,x11,ne
+ cmp x25,#0 // ~, remember?
+ ldp x6,x7,[sp,#0+32+48]
+ csel x14,x8,x14,ne
+ csel x15,x9,x15,ne
+ ldp x8,x9,[x23,#32+32] // in2
+ csel x16,x10,x16,ne
+ csel x17,x11,x17,ne
+ ldp x10,x11,[x23,#32+48]
+ stp x14,x15,[x21,#32]
+ stp x16,x17,[x21,#32+16]
+ ldp x14,x15,[x22,#64] // in1
+ cmp x24,#0 // ~, remember?
+ ldp x16,x17,[x22,#64+16]
+ csel x8,x4,x8,ne
+ csel x9,x5,x9,ne
+ csel x10,x6,x10,ne
+ csel x11,x7,x11,ne
+ cmp x25,#0 // ~, remember?
+ csel x14,x8,x14,ne
+ csel x15,x9,x15,ne
+ csel x16,x10,x16,ne
+ csel x17,x11,x17,ne
+ stp x14,x15,[x21,#64]
+ stp x16,x17,[x21,#64+16]
+
+ add sp,x29,#0 // destroy frame
+ ldp x19,x20,[x29,#16]
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x29,x30,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
+// uint64_t b[4]);
+.globl ecp_nistz256_ord_mul_mont
+.hidden ecp_nistz256_ord_mul_mont
+.type ecp_nistz256_ord_mul_mont,%function
+.align 4
+ecp_nistz256_ord_mul_mont:
+ AARCH64_VALID_CALL_TARGET
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+
+ adrp x23,.Lord
+ add x23,x23,:lo12:.Lord
+ ldr x3,[x2] // bp[0]
+ ldp x4,x5,[x1]
+ ldp x6,x7,[x1,#16]
+
+ ldp x12,x13,[x23,#0]
+ ldp x21,x22,[x23,#16]
+ ldr x23,[x23,#32]
+
+ mul x14,x4,x3 // a[0]*b[0]
+ umulh x8,x4,x3
+
+ mul x15,x5,x3 // a[1]*b[0]
+ umulh x9,x5,x3
+
+ mul x16,x6,x3 // a[2]*b[0]
+ umulh x10,x6,x3
+
+ mul x17,x7,x3 // a[3]*b[0]
+ umulh x19,x7,x3
+
+ mul x24,x14,x23
+
+ adds x15,x15,x8 // accumulate high parts of multiplication
+ adcs x16,x16,x9
+ adcs x17,x17,x10
+ adc x19,x19,xzr
+ mov x20,xzr
+ ldr x3,[x2,#8*1] // b[i]
+
+ lsl x8,x24,#32
+ subs x16,x16,x24
+ lsr x9,x24,#32
+ sbcs x17,x17,x8
+ sbcs x19,x19,x9
+ sbc x20,x20,xzr
+
+ subs xzr,x14,#1
+ umulh x9,x12,x24
+ mul x10,x13,x24
+ umulh x11,x13,x24
+
+ adcs x10,x10,x9
+ mul x8,x4,x3
+ adc x11,x11,xzr
+ mul x9,x5,x3
+
+ adds x14,x15,x10
+ mul x10,x6,x3
+ adcs x15,x16,x11
+ mul x11,x7,x3
+ adcs x16,x17,x24
+ adcs x17,x19,x24
+ adc x19,x20,xzr
+
+ adds x14,x14,x8 // accumulate low parts
+ umulh x8,x4,x3
+ adcs x15,x15,x9
+ umulh x9,x5,x3
+ adcs x16,x16,x10
+ umulh x10,x6,x3
+ adcs x17,x17,x11
+ umulh x11,x7,x3
+ adc x19,x19,xzr
+ mul x24,x14,x23
+ adds x15,x15,x8 // accumulate high parts
+ adcs x16,x16,x9
+ adcs x17,x17,x10
+ adcs x19,x19,x11
+ adc x20,xzr,xzr
+ ldr x3,[x2,#8*2] // b[i]
+
+ lsl x8,x24,#32
+ subs x16,x16,x24
+ lsr x9,x24,#32
+ sbcs x17,x17,x8
+ sbcs x19,x19,x9
+ sbc x20,x20,xzr
+
+ subs xzr,x14,#1
+ umulh x9,x12,x24
+ mul x10,x13,x24
+ umulh x11,x13,x24
+
+ adcs x10,x10,x9
+ mul x8,x4,x3
+ adc x11,x11,xzr
+ mul x9,x5,x3
+
+ adds x14,x15,x10
+ mul x10,x6,x3
+ adcs x15,x16,x11
+ mul x11,x7,x3
+ adcs x16,x17,x24
+ adcs x17,x19,x24
+ adc x19,x20,xzr
+
+ adds x14,x14,x8 // accumulate low parts
+ umulh x8,x4,x3
+ adcs x15,x15,x9
+ umulh x9,x5,x3
+ adcs x16,x16,x10
+ umulh x10,x6,x3
+ adcs x17,x17,x11
+ umulh x11,x7,x3
+ adc x19,x19,xzr
+ mul x24,x14,x23
+ adds x15,x15,x8 // accumulate high parts
+ adcs x16,x16,x9
+ adcs x17,x17,x10
+ adcs x19,x19,x11
+ adc x20,xzr,xzr
+ ldr x3,[x2,#8*3] // b[i]
+
+ lsl x8,x24,#32
+ subs x16,x16,x24
+ lsr x9,x24,#32
+ sbcs x17,x17,x8
+ sbcs x19,x19,x9
+ sbc x20,x20,xzr
+
+ subs xzr,x14,#1
+ umulh x9,x12,x24
+ mul x10,x13,x24
+ umulh x11,x13,x24
+
+ adcs x10,x10,x9
+ mul x8,x4,x3
+ adc x11,x11,xzr
+ mul x9,x5,x3
+
+ adds x14,x15,x10
+ mul x10,x6,x3
+ adcs x15,x16,x11
+ mul x11,x7,x3
+ adcs x16,x17,x24
+ adcs x17,x19,x24
+ adc x19,x20,xzr
+
+ adds x14,x14,x8 // accumulate low parts
+ umulh x8,x4,x3
+ adcs x15,x15,x9
+ umulh x9,x5,x3
+ adcs x16,x16,x10
+ umulh x10,x6,x3
+ adcs x17,x17,x11
+ umulh x11,x7,x3
+ adc x19,x19,xzr
+ mul x24,x14,x23
+ adds x15,x15,x8 // accumulate high parts
+ adcs x16,x16,x9
+ adcs x17,x17,x10
+ adcs x19,x19,x11
+ adc x20,xzr,xzr
+ lsl x8,x24,#32 // last reduction
+ subs x16,x16,x24
+ lsr x9,x24,#32
+ sbcs x17,x17,x8
+ sbcs x19,x19,x9
+ sbc x20,x20,xzr
+
+ subs xzr,x14,#1
+ umulh x9,x12,x24
+ mul x10,x13,x24
+ umulh x11,x13,x24
+
+ adcs x10,x10,x9
+ adc x11,x11,xzr
+
+ adds x14,x15,x10
+ adcs x15,x16,x11
+ adcs x16,x17,x24
+ adcs x17,x19,x24
+ adc x19,x20,xzr
+
+ subs x8,x14,x12 // ret -= modulus
+ sbcs x9,x15,x13
+ sbcs x10,x16,x21
+ sbcs x11,x17,x22
+ sbcs xzr,x19,xzr
+
+ csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
+ csel x15,x15,x9,lo
+ csel x16,x16,x10,lo
+ stp x14,x15,[x0]
+ csel x17,x17,x11,lo
+ stp x16,x17,[x0,#16]
+
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldr x29,[sp],#64
+ ret
+.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
+// uint64_t rep);
+.globl ecp_nistz256_ord_sqr_mont
+.hidden ecp_nistz256_ord_sqr_mont
+.type ecp_nistz256_ord_sqr_mont,%function
+.align 4
+ecp_nistz256_ord_sqr_mont:
+ AARCH64_VALID_CALL_TARGET
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+
+ adrp x23,.Lord
+ add x23,x23,:lo12:.Lord
+ ldp x4,x5,[x1]
+ ldp x6,x7,[x1,#16]
+
+ ldp x12,x13,[x23,#0]
+ ldp x21,x22,[x23,#16]
+ ldr x23,[x23,#32]
+ b .Loop_ord_sqr
+
+.align 4
+.Loop_ord_sqr:
+ sub x2,x2,#1
+ ////////////////////////////////////////////////////////////////
+ // | | | | | |a1*a0| |
+ // | | | | |a2*a0| | |
+ // | |a3*a2|a3*a0| | | |
+ // | | | |a2*a1| | | |
+ // | | |a3*a1| | | | |
+ // *| | | | | | | | 2|
+ // +|a3*a3|a2*a2|a1*a1|a0*a0|
+ // |--+--+--+--+--+--+--+--|
+	// |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is the x-th 64-bit result word.
+	//
+	// "can't overflow" below marks carries into the high part of a
+	// multiplication result, which can't overflow because the high part
+	// can never be all ones.
+
+ mul x15,x5,x4 // a[1]*a[0]
+ umulh x9,x5,x4
+ mul x16,x6,x4 // a[2]*a[0]
+ umulh x10,x6,x4
+ mul x17,x7,x4 // a[3]*a[0]
+ umulh x19,x7,x4
+
+ adds x16,x16,x9 // accumulate high parts of multiplication
+ mul x8,x6,x5 // a[2]*a[1]
+ umulh x9,x6,x5
+ adcs x17,x17,x10
+ mul x10,x7,x5 // a[3]*a[1]
+ umulh x11,x7,x5
+ adc x19,x19,xzr // can't overflow
+
+ mul x20,x7,x6 // a[3]*a[2]
+ umulh x1,x7,x6
+
+ adds x9,x9,x10 // accumulate high parts of multiplication
+ mul x14,x4,x4 // a[0]*a[0]
+ adc x10,x11,xzr // can't overflow
+
+ adds x17,x17,x8 // accumulate low parts of multiplication
+ umulh x4,x4,x4
+ adcs x19,x19,x9
+ mul x9,x5,x5 // a[1]*a[1]
+ adcs x20,x20,x10
+ umulh x5,x5,x5
+ adc x1,x1,xzr // can't overflow
+
+ adds x15,x15,x15 // acc[1-6]*=2
+ mul x10,x6,x6 // a[2]*a[2]
+ adcs x16,x16,x16
+ umulh x6,x6,x6
+ adcs x17,x17,x17
+ mul x11,x7,x7 // a[3]*a[3]
+ adcs x19,x19,x19
+ umulh x7,x7,x7
+ adcs x20,x20,x20
+ adcs x1,x1,x1
+ adc x3,xzr,xzr
+
+ adds x15,x15,x4 // +a[i]*a[i]
+ mul x24,x14,x23
+ adcs x16,x16,x9
+ adcs x17,x17,x5
+ adcs x19,x19,x10
+ adcs x20,x20,x6
+ adcs x1,x1,x11
+ adc x3,x3,x7
+ subs xzr,x14,#1
+ umulh x9,x12,x24
+ mul x10,x13,x24
+ umulh x11,x13,x24
+
+ adcs x10,x10,x9
+ adc x11,x11,xzr
+
+ adds x14,x15,x10
+ adcs x15,x16,x11
+ adcs x16,x17,x24
+ adc x17,xzr,x24 // can't overflow
+ mul x11,x14,x23
+ lsl x8,x24,#32
+ subs x15,x15,x24
+ lsr x9,x24,#32
+ sbcs x16,x16,x8
+ sbc x17,x17,x9 // can't borrow
+ subs xzr,x14,#1
+ umulh x9,x12,x11
+ mul x10,x13,x11
+ umulh x24,x13,x11
+
+ adcs x10,x10,x9
+ adc x24,x24,xzr
+
+ adds x14,x15,x10
+ adcs x15,x16,x24
+ adcs x16,x17,x11
+ adc x17,xzr,x11 // can't overflow
+ mul x24,x14,x23
+ lsl x8,x11,#32
+ subs x15,x15,x11
+ lsr x9,x11,#32
+ sbcs x16,x16,x8
+ sbc x17,x17,x9 // can't borrow
+ subs xzr,x14,#1
+ umulh x9,x12,x24
+ mul x10,x13,x24
+ umulh x11,x13,x24
+
+ adcs x10,x10,x9
+ adc x11,x11,xzr
+
+ adds x14,x15,x10
+ adcs x15,x16,x11
+ adcs x16,x17,x24
+ adc x17,xzr,x24 // can't overflow
+ mul x11,x14,x23
+ lsl x8,x24,#32
+ subs x15,x15,x24
+ lsr x9,x24,#32
+ sbcs x16,x16,x8
+ sbc x17,x17,x9 // can't borrow
+ subs xzr,x14,#1
+ umulh x9,x12,x11
+ mul x10,x13,x11
+ umulh x24,x13,x11
+
+ adcs x10,x10,x9
+ adc x24,x24,xzr
+
+ adds x14,x15,x10
+ adcs x15,x16,x24
+ adcs x16,x17,x11
+ adc x17,xzr,x11 // can't overflow
+ lsl x8,x11,#32
+ subs x15,x15,x11
+ lsr x9,x11,#32
+ sbcs x16,x16,x8
+ sbc x17,x17,x9 // can't borrow
+ adds x14,x14,x19 // accumulate upper half
+ adcs x15,x15,x20
+ adcs x16,x16,x1
+ adcs x17,x17,x3
+ adc x19,xzr,xzr
+
+ subs x8,x14,x12 // ret -= modulus
+ sbcs x9,x15,x13
+ sbcs x10,x16,x21
+ sbcs x11,x17,x22
+ sbcs xzr,x19,xzr
+
+ csel x4,x14,x8,lo // ret = borrow ? ret : ret-modulus
+ csel x5,x15,x9,lo
+ csel x6,x16,x10,lo
+ csel x7,x17,x11,lo
+
+ cbnz x2,.Loop_ord_sqr
+
+ stp x4,x5,[x0]
+ stp x6,x7,[x0,#16]
+
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldr x29,[sp],#64
+ ret
+.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_select_w5
+.hidden ecp_nistz256_select_w5
+.type ecp_nistz256_select_w5,%function
+.align 4
+ecp_nistz256_select_w5:
+ AARCH64_VALID_CALL_TARGET
+
+ // x10 := x0
+ // w9 := 0; loop counter and incremented internal index
+ mov x10, x0
+ mov w9, #0
+
+ // [v16-v21] := 0
+ movi v16.16b, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
+ movi v19.16b, #0
+ movi v20.16b, #0
+ movi v21.16b, #0
+
+.Lselect_w5_loop:
+ // Loop 16 times.
+
+ // Increment index (loop counter); tested at the end of the loop
+ add w9, w9, #1
+
+ // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1
+ // and advance x1 to point to the next entry
+ ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
+
+ // x11 := (w9 == w2)? All 1s : All 0s
+ cmp w9, w2
+ csetm x11, eq
+
+ // continue loading ...
+ ld1 {v26.2d, v27.2d}, [x1],#32
+
+ // duplicate mask_64 into Mask (all 0s or all 1s)
+ dup v3.2d, x11
+
+ // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
+ // i.e., values in output registers will remain the same if w9 != w2
+ bit v16.16b, v22.16b, v3.16b
+ bit v17.16b, v23.16b, v3.16b
+
+ bit v18.16b, v24.16b, v3.16b
+ bit v19.16b, v25.16b, v3.16b
+
+ bit v20.16b, v26.16b, v3.16b
+ bit v21.16b, v27.16b, v3.16b
+
+	// If bit #4 is 0 (i.e. idx_ctr < 16) loop back
+ tbz w9, #4, .Lselect_w5_loop
+
+ // Write [v16-v21] to memory at the output pointer
+ st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64
+ st1 {v20.2d, v21.2d}, [x10]
+
+ ret
+.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
+
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_select_w7
+.hidden ecp_nistz256_select_w7
+.type ecp_nistz256_select_w7,%function
+.align 4
+ecp_nistz256_select_w7:
+ AARCH64_VALID_CALL_TARGET
+
+ // w9 := 0; loop counter and incremented internal index
+ mov w9, #0
+
+ // [v16-v21] := 0
+ movi v16.16b, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
+ movi v19.16b, #0
+
+.Lselect_w7_loop:
+ // Loop 64 times.
+
+ // Increment index (loop counter); tested at the end of the loop
+ add w9, w9, #1
+
+ // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1
+ // and advance x1 to point to the next entry
+ ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
+
+ // x11 := (w9 == w2)? All 1s : All 0s
+ cmp w9, w2
+ csetm x11, eq
+
+ // duplicate mask_64 into Mask (all 0s or all 1s)
+ dup v3.2d, x11
+
+ // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
+ // i.e., values in output registers will remain the same if w9 != w2
+ bit v16.16b, v22.16b, v3.16b
+ bit v17.16b, v23.16b, v3.16b
+
+ bit v18.16b, v24.16b, v3.16b
+ bit v19.16b, v25.16b, v3.16b
+
+	// If bit #6 is 0 (i.e. idx_ctr < 64) loop back
+ tbz w9, #6, .Lselect_w7_loop
+
+ // Write [v16-v19] to memory at the output pointer
+ st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0]
+
+ ret
+.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/p256-armv8-asm-win.S b/gen/bcm/p256-armv8-asm-win.S
new file mode 100644
index 0000000..a55d20d
--- /dev/null
+++ b/gen/bcm/p256-armv8-asm-win.S
@@ -0,0 +1,1766 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include "openssl/arm_arch.h"
+
+.section .rodata
+.align 5
+Lpoly:
+.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
+LRR: // 2^512 mod P precomputed for NIST P256 polynomial
+.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
+Lone_mont:
+.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
+Lone:
+.quad 1,0,0,0
+Lord:
+.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
+LordK:
+.quad 0xccd1c8aaee00bc4f
+.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.text
+
+// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
+// const BN_ULONG x2[4]);
+.globl ecp_nistz256_mul_mont
+
+.def ecp_nistz256_mul_mont
+ .type 32
+.endef
+.align 4
+ecp_nistz256_mul_mont:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-32]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+
+ ldr x3,[x2] // bp[0]
+ ldp x4,x5,[x1]
+ ldp x6,x7,[x1,#16]
+ adrp x13,Lpoly
+ add x13,x13,:lo12:Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+
+ bl __ecp_nistz256_mul_mont
+
+ ldp x19,x20,[sp,#16]
+ ldp x29,x30,[sp],#32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl ecp_nistz256_sqr_mont
+
+.def ecp_nistz256_sqr_mont
+ .type 32
+.endef
+.align 4
+ecp_nistz256_sqr_mont:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-32]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+
+ ldp x4,x5,[x1]
+ ldp x6,x7,[x1,#16]
+ adrp x13,Lpoly
+ add x13,x13,:lo12:Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+
+ bl __ecp_nistz256_sqr_mont
+
+ ldp x19,x20,[sp,#16]
+ ldp x29,x30,[sp],#32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl ecp_nistz256_div_by_2
+
+.def ecp_nistz256_div_by_2
+ .type 32
+.endef
+.align 4
+ecp_nistz256_div_by_2:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ldp x14,x15,[x1]
+ ldp x16,x17,[x1,#16]
+ adrp x13,Lpoly
+ add x13,x13,:lo12:Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+
+ bl __ecp_nistz256_div_by_2
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl ecp_nistz256_mul_by_2
+
+.def ecp_nistz256_mul_by_2
+ .type 32
+.endef
+.align 4
+ecp_nistz256_mul_by_2:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ldp x14,x15,[x1]
+ ldp x16,x17,[x1,#16]
+ adrp x13,Lpoly
+ add x13,x13,:lo12:Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+ mov x8,x14
+ mov x9,x15
+ mov x10,x16
+ mov x11,x17
+
+ bl __ecp_nistz256_add_to // ret = a+a // 2*a
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl ecp_nistz256_mul_by_3
+
+.def ecp_nistz256_mul_by_3
+ .type 32
+.endef
+.align 4
+ecp_nistz256_mul_by_3:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ldp x14,x15,[x1]
+ ldp x16,x17,[x1,#16]
+ adrp x13,Lpoly
+ add x13,x13,:lo12:Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+ mov x8,x14
+ mov x9,x15
+ mov x10,x16
+ mov x11,x17
+ mov x4,x14
+ mov x5,x15
+ mov x6,x16
+ mov x7,x17
+
+ bl __ecp_nistz256_add_to // ret = a+a // 2*a
+
+ mov x8,x4
+ mov x9,x5
+ mov x10,x6
+ mov x11,x7
+
+ bl __ecp_nistz256_add_to // ret += a // 2*a+a=3*a
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
+// const BN_ULONG x2[4]);
+.globl ecp_nistz256_sub
+
+.def ecp_nistz256_sub
+ .type 32
+.endef
+.align 4
+ecp_nistz256_sub:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ldp x14,x15,[x1]
+ ldp x16,x17,[x1,#16]
+ adrp x13,Lpoly
+ add x13,x13,:lo12:Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+
+ bl __ecp_nistz256_sub_from
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
+.globl ecp_nistz256_neg
+
+.def ecp_nistz256_neg
+ .type 32
+.endef
+.align 4
+ecp_nistz256_neg:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ mov x2,x1
+ mov x14,xzr // a = 0
+ mov x15,xzr
+ mov x16,xzr
+ mov x17,xzr
+ adrp x13,Lpoly
+ add x13,x13,:lo12:Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+
+ bl __ecp_nistz256_sub_from
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
+// to x4-x7 and b[0] - to x3
+.def __ecp_nistz256_mul_mont
+ .type 32
+.endef
+.align 4
+__ecp_nistz256_mul_mont:
+ mul x14,x4,x3 // a[0]*b[0]
+ umulh x8,x4,x3
+
+ mul x15,x5,x3 // a[1]*b[0]
+ umulh x9,x5,x3
+
+ mul x16,x6,x3 // a[2]*b[0]
+ umulh x10,x6,x3
+
+ mul x17,x7,x3 // a[3]*b[0]
+ umulh x11,x7,x3
+ ldr x3,[x2,#8] // b[1]
+
+ adds x15,x15,x8 // accumulate high parts of multiplication
+ lsl x8,x14,#32
+ adcs x16,x16,x9
+ lsr x9,x14,#32
+ adcs x17,x17,x10
+ adc x19,xzr,x11
+ mov x20,xzr
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ mul x8,x4,x3 // lo(a[0]*b[i])
+ adcs x15,x16,x9
+ mul x9,x5,x3 // lo(a[1]*b[i])
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ mul x10,x6,x3 // lo(a[2]*b[i])
+ adcs x17,x19,x11
+ mul x11,x7,x3 // lo(a[3]*b[i])
+ adc x19,x20,xzr
+
+ adds x14,x14,x8 // accumulate low parts of multiplication
+ umulh x8,x4,x3 // hi(a[0]*b[i])
+ adcs x15,x15,x9
+ umulh x9,x5,x3 // hi(a[1]*b[i])
+ adcs x16,x16,x10
+ umulh x10,x6,x3 // hi(a[2]*b[i])
+ adcs x17,x17,x11
+ umulh x11,x7,x3 // hi(a[3]*b[i])
+ adc x19,x19,xzr
+ ldr x3,[x2,#8*(1+1)] // b[1+1]
+ adds x15,x15,x8 // accumulate high parts of multiplication
+ lsl x8,x14,#32
+ adcs x16,x16,x9
+ lsr x9,x14,#32
+ adcs x17,x17,x10
+ adcs x19,x19,x11
+ adc x20,xzr,xzr
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ mul x8,x4,x3 // lo(a[0]*b[i])
+ adcs x15,x16,x9
+ mul x9,x5,x3 // lo(a[1]*b[i])
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ mul x10,x6,x3 // lo(a[2]*b[i])
+ adcs x17,x19,x11
+ mul x11,x7,x3 // lo(a[3]*b[i])
+ adc x19,x20,xzr
+
+ adds x14,x14,x8 // accumulate low parts of multiplication
+ umulh x8,x4,x3 // hi(a[0]*b[i])
+ adcs x15,x15,x9
+ umulh x9,x5,x3 // hi(a[1]*b[i])
+ adcs x16,x16,x10
+ umulh x10,x6,x3 // hi(a[2]*b[i])
+ adcs x17,x17,x11
+ umulh x11,x7,x3 // hi(a[3]*b[i])
+ adc x19,x19,xzr
+ ldr x3,[x2,#8*(2+1)] // b[2+1]
+ adds x15,x15,x8 // accumulate high parts of multiplication
+ lsl x8,x14,#32
+ adcs x16,x16,x9
+ lsr x9,x14,#32
+ adcs x17,x17,x10
+ adcs x19,x19,x11
+ adc x20,xzr,xzr
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ mul x8,x4,x3 // lo(a[0]*b[i])
+ adcs x15,x16,x9
+ mul x9,x5,x3 // lo(a[1]*b[i])
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ mul x10,x6,x3 // lo(a[2]*b[i])
+ adcs x17,x19,x11
+ mul x11,x7,x3 // lo(a[3]*b[i])
+ adc x19,x20,xzr
+
+ adds x14,x14,x8 // accumulate low parts of multiplication
+ umulh x8,x4,x3 // hi(a[0]*b[i])
+ adcs x15,x15,x9
+ umulh x9,x5,x3 // hi(a[1]*b[i])
+ adcs x16,x16,x10
+ umulh x10,x6,x3 // hi(a[2]*b[i])
+ adcs x17,x17,x11
+ umulh x11,x7,x3 // hi(a[3]*b[i])
+ adc x19,x19,xzr
+ adds x15,x15,x8 // accumulate high parts of multiplication
+ lsl x8,x14,#32
+ adcs x16,x16,x9
+ lsr x9,x14,#32
+ adcs x17,x17,x10
+ adcs x19,x19,x11
+ adc x20,xzr,xzr
+ // last reduction
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ adcs x15,x16,x9
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ adcs x17,x19,x11
+ adc x19,x20,xzr
+
+ adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus
+ sbcs x9,x15,x12
+ sbcs x10,x16,xzr
+ sbcs x11,x17,x13
+ sbcs xzr,x19,xzr // did it borrow?
+
+ csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
+ csel x15,x15,x9,lo
+ csel x16,x16,x10,lo
+ stp x14,x15,[x0]
+ csel x17,x17,x11,lo
+ stp x16,x17,[x0,#16]
+
+ ret
+
+
+// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
+// to x4-x7
+.def __ecp_nistz256_sqr_mont
+ .type 32
+.endef
+.align 4
+__ecp_nistz256_sqr_mont:
+ // | | | | | |a1*a0| |
+ // | | | | |a2*a0| | |
+ // | |a3*a2|a3*a0| | | |
+ // | | | |a2*a1| | | |
+ // | | |a3*a1| | | | |
+ // *| | | | | | | | 2|
+ // +|a3*a3|a2*a2|a1*a1|a0*a0|
+ // |--+--+--+--+--+--+--+--|
+	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is the x-th 64-bit accumulator word
+ //
+ // "can't overflow" below mark carrying into high part of
+ // multiplication result, which can't overflow, because it
+ // can never be all ones.
+
+ mul x15,x5,x4 // a[1]*a[0]
+ umulh x9,x5,x4
+ mul x16,x6,x4 // a[2]*a[0]
+ umulh x10,x6,x4
+ mul x17,x7,x4 // a[3]*a[0]
+ umulh x19,x7,x4
+
+ adds x16,x16,x9 // accumulate high parts of multiplication
+ mul x8,x6,x5 // a[2]*a[1]
+ umulh x9,x6,x5
+ adcs x17,x17,x10
+ mul x10,x7,x5 // a[3]*a[1]
+ umulh x11,x7,x5
+ adc x19,x19,xzr // can't overflow
+
+ mul x20,x7,x6 // a[3]*a[2]
+ umulh x1,x7,x6
+
+ adds x9,x9,x10 // accumulate high parts of multiplication
+ mul x14,x4,x4 // a[0]*a[0]
+ adc x10,x11,xzr // can't overflow
+
+ adds x17,x17,x8 // accumulate low parts of multiplication
+ umulh x4,x4,x4
+ adcs x19,x19,x9
+ mul x9,x5,x5 // a[1]*a[1]
+ adcs x20,x20,x10
+ umulh x5,x5,x5
+ adc x1,x1,xzr // can't overflow
+
+ adds x15,x15,x15 // acc[1-6]*=2
+ mul x10,x6,x6 // a[2]*a[2]
+ adcs x16,x16,x16
+ umulh x6,x6,x6
+ adcs x17,x17,x17
+ mul x11,x7,x7 // a[3]*a[3]
+ adcs x19,x19,x19
+ umulh x7,x7,x7
+ adcs x20,x20,x20
+ adcs x1,x1,x1
+ adc x2,xzr,xzr
+
+ adds x15,x15,x4 // +a[i]*a[i]
+ adcs x16,x16,x9
+ adcs x17,x17,x5
+ adcs x19,x19,x10
+ adcs x20,x20,x6
+ lsl x8,x14,#32
+ adcs x1,x1,x11
+ lsr x9,x14,#32
+ adc x2,x2,x7
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ adcs x15,x16,x9
+ lsl x8,x14,#32
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ lsr x9,x14,#32
+ adc x17,x11,xzr // can't overflow
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ adcs x15,x16,x9
+ lsl x8,x14,#32
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ lsr x9,x14,#32
+ adc x17,x11,xzr // can't overflow
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ adcs x15,x16,x9
+ lsl x8,x14,#32
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ lsr x9,x14,#32
+ adc x17,x11,xzr // can't overflow
+ subs x10,x14,x8 // "*0xffff0001"
+ sbc x11,x14,x9
+ adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
+ adcs x15,x16,x9
+ adcs x16,x17,x10 // +=acc[0]*0xffff0001
+ adc x17,x11,xzr // can't overflow
+
+ adds x14,x14,x19 // accumulate upper half
+ adcs x15,x15,x20
+ adcs x16,x16,x1
+ adcs x17,x17,x2
+ adc x19,xzr,xzr
+
+ adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus
+ sbcs x9,x15,x12
+ sbcs x10,x16,xzr
+ sbcs x11,x17,x13
+ sbcs xzr,x19,xzr // did it borrow?
+
+ csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
+ csel x15,x15,x9,lo
+ csel x16,x16,x10,lo
+ stp x14,x15,[x0]
+ csel x17,x17,x11,lo
+ stp x16,x17,[x0,#16]
+
+ ret
+
+
+// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to
+// x4-x7 and x8-x11. This is done because it's used in multiple
+// contexts, e.g. in multiplication by 2 and 3...
+.def __ecp_nistz256_add_to
+ .type 32
+.endef
+.align 4
+__ecp_nistz256_add_to:
+ adds x14,x14,x8 // ret = a+b
+ adcs x15,x15,x9
+ adcs x16,x16,x10
+ adcs x17,x17,x11
+ adc x1,xzr,xzr // zap x1
+
+ adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus
+ sbcs x9,x15,x12
+ sbcs x10,x16,xzr
+ sbcs x11,x17,x13
+ sbcs xzr,x1,xzr // did subtraction borrow?
+
+ csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
+ csel x15,x15,x9,lo
+ csel x16,x16,x10,lo
+ stp x14,x15,[x0]
+ csel x17,x17,x11,lo
+ stp x16,x17,[x0,#16]
+
+ ret
+
+
+.def __ecp_nistz256_sub_from
+ .type 32
+.endef
+.align 4
+__ecp_nistz256_sub_from:
+ ldp x8,x9,[x2]
+ ldp x10,x11,[x2,#16]
+ subs x14,x14,x8 // ret = a-b
+ sbcs x15,x15,x9
+ sbcs x16,x16,x10
+ sbcs x17,x17,x11
+ sbc x1,xzr,xzr // zap x1
+
+ subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus
+ adcs x9,x15,x12
+ adcs x10,x16,xzr
+ adc x11,x17,x13
+ cmp x1,xzr // did subtraction borrow?
+
+ csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret
+ csel x15,x15,x9,eq
+ csel x16,x16,x10,eq
+ stp x14,x15,[x0]
+ csel x17,x17,x11,eq
+ stp x16,x17,[x0,#16]
+
+ ret
+
+
+.def __ecp_nistz256_sub_morf
+ .type 32
+.endef
+.align 4
+__ecp_nistz256_sub_morf:
+ ldp x8,x9,[x2]
+ ldp x10,x11,[x2,#16]
+ subs x14,x8,x14 // ret = b-a
+ sbcs x15,x9,x15
+ sbcs x16,x10,x16
+ sbcs x17,x11,x17
+ sbc x1,xzr,xzr // zap x1
+
+ subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus
+ adcs x9,x15,x12
+ adcs x10,x16,xzr
+ adc x11,x17,x13
+ cmp x1,xzr // did subtraction borrow?
+
+ csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret
+ csel x15,x15,x9,eq
+ csel x16,x16,x10,eq
+ stp x14,x15,[x0]
+ csel x17,x17,x11,eq
+ stp x16,x17,[x0,#16]
+
+ ret
+
+
+.def __ecp_nistz256_div_by_2
+ .type 32
+.endef
+.align 4
+__ecp_nistz256_div_by_2:
+ subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus
+ adcs x9,x15,x12
+ adcs x10,x16,xzr
+ adcs x11,x17,x13
+ adc x1,xzr,xzr // zap x1
+ tst x14,#1 // is a even?
+
+ csel x14,x14,x8,eq // ret = even ? a : a+modulus
+ csel x15,x15,x9,eq
+ csel x16,x16,x10,eq
+ csel x17,x17,x11,eq
+ csel x1,xzr,x1,eq
+
+ lsr x14,x14,#1 // ret >>= 1
+ orr x14,x14,x15,lsl#63
+ lsr x15,x15,#1
+ orr x15,x15,x16,lsl#63
+ lsr x16,x16,#1
+ orr x16,x16,x17,lsl#63
+ lsr x17,x17,#1
+ stp x14,x15,[x0]
+ orr x17,x17,x1,lsl#63
+ stp x16,x17,[x0,#16]
+
+ ret
+
+.globl ecp_nistz256_point_double
+
+.def ecp_nistz256_point_double
+ .type 32
+.endef
+.align 5
+ecp_nistz256_point_double:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ sub sp,sp,#32*4
+
+Ldouble_shortcut:
+ ldp x14,x15,[x1,#32]
+ mov x21,x0
+ ldp x16,x17,[x1,#48]
+ mov x22,x1
+ adrp x13,Lpoly
+ add x13,x13,:lo12:Lpoly
+ ldr x12,[x13,#8]
+ mov x8,x14
+ ldr x13,[x13,#24]
+ mov x9,x15
+ ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont
+ mov x10,x16
+ mov x11,x17
+ ldp x6,x7,[x22,#64+16]
+ add x0,sp,#0
+ bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y);
+
+ add x0,sp,#64
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z);
+
+ ldp x8,x9,[x22]
+ ldp x10,x11,[x22,#16]
+ mov x4,x14 // put Zsqr aside for p256_sub
+ mov x5,x15
+ mov x6,x16
+ mov x7,x17
+ add x0,sp,#32
+ bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x);
+
+ add x2,x22,#0
+ mov x14,x4 // restore Zsqr
+ mov x15,x5
+ ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont
+ mov x16,x6
+ mov x17,x7
+ ldp x6,x7,[sp,#0+16]
+ add x0,sp,#64
+ bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr);
+
+ add x0,sp,#0
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S);
+
+ ldr x3,[x22,#32]
+ ldp x4,x5,[x22,#64]
+ ldp x6,x7,[x22,#64+16]
+ add x2,x22,#32
+ add x0,sp,#96
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y);
+
+ mov x8,x14
+ mov x9,x15
+ ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont
+ mov x10,x16
+ mov x11,x17
+ ldp x6,x7,[sp,#0+16]
+ add x0,x21,#64
+ bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0);
+
+ add x0,sp,#96
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S);
+
+ ldr x3,[sp,#64] // forward load for p256_mul_mont
+ ldp x4,x5,[sp,#32]
+ ldp x6,x7,[sp,#32+16]
+ add x0,x21,#32
+ bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0);
+
+ add x2,sp,#64
+ add x0,sp,#32
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr);
+
+ mov x8,x14 // duplicate M
+ mov x9,x15
+ mov x10,x16
+ mov x11,x17
+ mov x4,x14 // put M aside
+ mov x5,x15
+ mov x6,x16
+ mov x7,x17
+ add x0,sp,#32
+ bl __ecp_nistz256_add_to
+ mov x8,x4 // restore M
+ mov x9,x5
+ ldr x3,[x22] // forward load for p256_mul_mont
+ mov x10,x6
+ ldp x4,x5,[sp,#0]
+ mov x11,x7
+ ldp x6,x7,[sp,#0+16]
+ bl __ecp_nistz256_add_to // p256_mul_by_3(M, M);
+
+ add x2,x22,#0
+ add x0,sp,#0
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x);
+
+ mov x8,x14
+ mov x9,x15
+ ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont
+ mov x10,x16
+ mov x11,x17
+ ldp x6,x7,[sp,#32+16]
+ add x0,sp,#96
+ bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S);
+
+ add x0,x21,#0
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M);
+
+ add x2,sp,#96
+ bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0);
+
+ add x2,sp,#0
+ add x0,sp,#0
+ bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x);
+
+ ldr x3,[sp,#32]
+ mov x4,x14 // copy S
+ mov x5,x15
+ mov x6,x16
+ mov x7,x17
+ add x2,sp,#32
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M);
+
+ add x2,x21,#32
+ add x0,x21,#32
+ bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y);
+
+ add sp,x29,#0 // destroy frame
+ ldp x19,x20,[x29,#16]
+ ldp x21,x22,[x29,#32]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.globl ecp_nistz256_point_add
+
+.def ecp_nistz256_point_add
+ .type 32
+.endef
+.align 5
+ecp_nistz256_point_add:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#32*12
+
+ ldp x4,x5,[x2,#64] // in2_z
+ ldp x6,x7,[x2,#64+16]
+ mov x21,x0
+ mov x22,x1
+ mov x23,x2
+ adrp x13,Lpoly
+ add x13,x13,:lo12:Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+ orr x8,x4,x5
+ orr x10,x6,x7
+ orr x25,x8,x10
+ cmp x25,#0
+ csetm x25,ne // ~in2infty
+ add x0,sp,#192
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z);
+
+ ldp x4,x5,[x22,#64] // in1_z
+ ldp x6,x7,[x22,#64+16]
+ orr x8,x4,x5
+ orr x10,x6,x7
+ orr x24,x8,x10
+ cmp x24,#0
+ csetm x24,ne // ~in1infty
+ add x0,sp,#128
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);
+
+ ldr x3,[x23,#64]
+ ldp x4,x5,[sp,#192]
+ ldp x6,x7,[sp,#192+16]
+ add x2,x23,#64
+ add x0,sp,#320
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z);
+
+ ldr x3,[x22,#64]
+ ldp x4,x5,[sp,#128]
+ ldp x6,x7,[sp,#128+16]
+ add x2,x22,#64
+ add x0,sp,#352
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);
+
+ ldr x3,[x22,#32]
+ ldp x4,x5,[sp,#320]
+ ldp x6,x7,[sp,#320+16]
+ add x2,x22,#32
+ add x0,sp,#320
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y);
+
+ ldr x3,[x23,#32]
+ ldp x4,x5,[sp,#352]
+ ldp x6,x7,[sp,#352+16]
+ add x2,x23,#32
+ add x0,sp,#352
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);
+
+ add x2,sp,#320
+ ldr x3,[sp,#192] // forward load for p256_mul_mont
+ ldp x4,x5,[x22]
+ ldp x6,x7,[x22,#16]
+ add x0,sp,#160
+ bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1);
+
+ orr x14,x14,x15 // see if result is zero
+ orr x16,x16,x17
+ orr x26,x14,x16 // ~is_equal(S1,S2)
+
+ add x2,sp,#192
+ add x0,sp,#256
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr);
+
+ ldr x3,[sp,#128]
+ ldp x4,x5,[x23]
+ ldp x6,x7,[x23,#16]
+ add x2,sp,#128
+ add x0,sp,#288
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr);
+
+ add x2,sp,#256
+ ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont
+ ldp x6,x7,[sp,#160+16]
+ add x0,sp,#96
+ bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1);
+
+ orr x14,x14,x15 // see if result is zero
+ orr x16,x16,x17
+ orr x14,x14,x16 // ~is_equal(U1,U2)
+
+ mvn x27,x24 // -1/0 -> 0/-1
+ mvn x28,x25 // -1/0 -> 0/-1
+ orr x14,x14,x27
+ orr x14,x14,x28
+ orr x14,x14,x26
+ cbnz x14,Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
+
+Ladd_double:
+ mov x1,x22
+ mov x0,x21
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ add sp,sp,#256 // #256 is from #32*(12-4). difference in stack frames
+ b Ldouble_shortcut
+
+.align 4
+Ladd_proceed:
+ add x0,sp,#192
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);
+
+ ldr x3,[x22,#64]
+ ldp x4,x5,[sp,#96]
+ ldp x6,x7,[sp,#96+16]
+ add x2,x22,#64
+ add x0,sp,#64
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);
+
+ ldp x4,x5,[sp,#96]
+ ldp x6,x7,[sp,#96+16]
+ add x0,sp,#128
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);
+
+ ldr x3,[x23,#64]
+ ldp x4,x5,[sp,#64]
+ ldp x6,x7,[sp,#64+16]
+ add x2,x23,#64
+ add x0,sp,#64
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z);
+
+ ldr x3,[sp,#96]
+ ldp x4,x5,[sp,#128]
+ ldp x6,x7,[sp,#128+16]
+ add x2,sp,#96
+ add x0,sp,#224
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);
+
+ ldr x3,[sp,#128]
+ ldp x4,x5,[sp,#256]
+ ldp x6,x7,[sp,#256+16]
+ add x2,sp,#128
+ add x0,sp,#288
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr);
+
+ mov x8,x14
+ mov x9,x15
+ mov x10,x16
+ mov x11,x17
+ add x0,sp,#128
+ bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2);
+
+ add x2,sp,#192
+ add x0,sp,#0
+ bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);
+
+ add x2,sp,#224
+ bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);
+
+ add x2,sp,#288
+ ldr x3,[sp,#224] // forward load for p256_mul_mont
+ ldp x4,x5,[sp,#320]
+ ldp x6,x7,[sp,#320+16]
+ add x0,sp,#32
+ bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);
+
+ add x2,sp,#224
+ add x0,sp,#352
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub);
+
+ ldr x3,[sp,#160]
+ ldp x4,x5,[sp,#32]
+ ldp x6,x7,[sp,#32+16]
+ add x2,sp,#160
+ add x0,sp,#32
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);
+
+ add x2,sp,#352
+ bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);
+
+ ldp x4,x5,[sp,#0] // res
+ ldp x6,x7,[sp,#0+16]
+ ldp x8,x9,[x23] // in2
+ ldp x10,x11,[x23,#16]
+ ldp x14,x15,[x22,#0] // in1
+ cmp x24,#0 // ~, remember?
+ ldp x16,x17,[x22,#0+16]
+ csel x8,x4,x8,ne
+ csel x9,x5,x9,ne
+ ldp x4,x5,[sp,#0+0+32] // res
+ csel x10,x6,x10,ne
+ csel x11,x7,x11,ne
+ cmp x25,#0 // ~, remember?
+ ldp x6,x7,[sp,#0+0+48]
+ csel x14,x8,x14,ne
+ csel x15,x9,x15,ne
+ ldp x8,x9,[x23,#0+32] // in2
+ csel x16,x10,x16,ne
+ csel x17,x11,x17,ne
+ ldp x10,x11,[x23,#0+48]
+ stp x14,x15,[x21,#0]
+ stp x16,x17,[x21,#0+16]
+ ldp x14,x15,[x22,#32] // in1
+ cmp x24,#0 // ~, remember?
+ ldp x16,x17,[x22,#32+16]
+ csel x8,x4,x8,ne
+ csel x9,x5,x9,ne
+ ldp x4,x5,[sp,#0+32+32] // res
+ csel x10,x6,x10,ne
+ csel x11,x7,x11,ne
+ cmp x25,#0 // ~, remember?
+ ldp x6,x7,[sp,#0+32+48]
+ csel x14,x8,x14,ne
+ csel x15,x9,x15,ne
+ ldp x8,x9,[x23,#32+32] // in2
+ csel x16,x10,x16,ne
+ csel x17,x11,x17,ne
+ ldp x10,x11,[x23,#32+48]
+ stp x14,x15,[x21,#32]
+ stp x16,x17,[x21,#32+16]
+ ldp x14,x15,[x22,#64] // in1
+ cmp x24,#0 // ~, remember?
+ ldp x16,x17,[x22,#64+16]
+ csel x8,x4,x8,ne
+ csel x9,x5,x9,ne
+ csel x10,x6,x10,ne
+ csel x11,x7,x11,ne
+ cmp x25,#0 // ~, remember?
+ csel x14,x8,x14,ne
+ csel x15,x9,x15,ne
+ csel x16,x10,x16,ne
+ csel x17,x11,x17,ne
+ stp x14,x15,[x21,#64]
+ stp x16,x17,[x21,#64+16]
+
+Ladd_done:
+ add sp,x29,#0 // destroy frame
+ ldp x19,x20,[x29,#16]
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.globl ecp_nistz256_point_add_affine
+
+.def ecp_nistz256_point_add_affine
+ .type 32
+.endef
+.align 5
+ecp_nistz256_point_add_affine:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ sub sp,sp,#32*10
+
+ mov x21,x0
+ mov x22,x1
+ mov x23,x2
+ adrp x13,Lpoly
+ add x13,x13,:lo12:Lpoly
+ ldr x12,[x13,#8]
+ ldr x13,[x13,#24]
+
+ ldp x4,x5,[x1,#64] // in1_z
+ ldp x6,x7,[x1,#64+16]
+ orr x8,x4,x5
+ orr x10,x6,x7
+ orr x24,x8,x10
+ cmp x24,#0
+ csetm x24,ne // ~in1infty
+
+ ldp x14,x15,[x2] // in2_x
+ ldp x16,x17,[x2,#16]
+ ldp x8,x9,[x2,#32] // in2_y
+ ldp x10,x11,[x2,#48]
+ orr x14,x14,x15
+ orr x16,x16,x17
+ orr x8,x8,x9
+ orr x10,x10,x11
+ orr x14,x14,x16
+ orr x8,x8,x10
+ orr x25,x14,x8
+ cmp x25,#0
+ csetm x25,ne // ~in2infty
+
+ add x0,sp,#128
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);
+
+ mov x4,x14
+ mov x5,x15
+ mov x6,x16
+ mov x7,x17
+ ldr x3,[x23]
+ add x2,x23,#0
+ add x0,sp,#96
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x);
+
+ add x2,x22,#0
+ ldr x3,[x22,#64] // forward load for p256_mul_mont
+ ldp x4,x5,[sp,#128]
+ ldp x6,x7,[sp,#128+16]
+ add x0,sp,#160
+ bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x);
+
+ add x2,x22,#64
+ add x0,sp,#128
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);
+
+ ldr x3,[x22,#64]
+ ldp x4,x5,[sp,#160]
+ ldp x6,x7,[sp,#160+16]
+ add x2,x22,#64
+ add x0,sp,#64
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);
+
+ ldr x3,[x23,#32]
+ ldp x4,x5,[sp,#128]
+ ldp x6,x7,[sp,#128+16]
+ add x2,x23,#32
+ add x0,sp,#128
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);
+
+ add x2,x22,#32
+ ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont
+ ldp x6,x7,[sp,#160+16]
+ add x0,sp,#192
+ bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y);
+
+ add x0,sp,#224
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);
+
+ ldp x4,x5,[sp,#192]
+ ldp x6,x7,[sp,#192+16]
+ add x0,sp,#288
+ bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);
+
+ ldr x3,[sp,#160]
+ ldp x4,x5,[sp,#224]
+ ldp x6,x7,[sp,#224+16]
+ add x2,sp,#160
+ add x0,sp,#256
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);
+
+ ldr x3,[x22]
+ ldp x4,x5,[sp,#224]
+ ldp x6,x7,[sp,#224+16]
+ add x2,x22,#0
+ add x0,sp,#96
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr);
+
+ mov x8,x14
+ mov x9,x15
+ mov x10,x16
+ mov x11,x17
+ add x0,sp,#224
+ bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2);
+
+ add x2,sp,#288
+ add x0,sp,#0
+ bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);
+
+ add x2,sp,#256
+ bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);
+
+ add x2,sp,#96
+ ldr x3,[x22,#32] // forward load for p256_mul_mont
+ ldp x4,x5,[sp,#256]
+ ldp x6,x7,[sp,#256+16]
+ add x0,sp,#32
+ bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);
+
+ add x2,x22,#32
+ add x0,sp,#128
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub);
+
+ ldr x3,[sp,#192]
+ ldp x4,x5,[sp,#32]
+ ldp x6,x7,[sp,#32+16]
+ add x2,sp,#192
+ add x0,sp,#32
+ bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);
+
+ add x2,sp,#128
+ bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);
+
+ ldp x4,x5,[sp,#0] // res
+ ldp x6,x7,[sp,#0+16]
+ ldp x8,x9,[x23] // in2
+ ldp x10,x11,[x23,#16]
+ ldp x14,x15,[x22,#0] // in1
+ cmp x24,#0 // ~, remember?
+ ldp x16,x17,[x22,#0+16]
+ csel x8,x4,x8,ne
+ csel x9,x5,x9,ne
+ ldp x4,x5,[sp,#0+0+32] // res
+ csel x10,x6,x10,ne
+ csel x11,x7,x11,ne
+ cmp x25,#0 // ~, remember?
+ ldp x6,x7,[sp,#0+0+48]
+ csel x14,x8,x14,ne
+ csel x15,x9,x15,ne
+ ldp x8,x9,[x23,#0+32] // in2
+ csel x16,x10,x16,ne
+ csel x17,x11,x17,ne
+ ldp x10,x11,[x23,#0+48]
+ stp x14,x15,[x21,#0]
+ stp x16,x17,[x21,#0+16]
+ adrp x23,Lone_mont-64
+ add x23,x23,:lo12:Lone_mont-64
+ ldp x14,x15,[x22,#32] // in1
+ cmp x24,#0 // ~, remember?
+ ldp x16,x17,[x22,#32+16]
+ csel x8,x4,x8,ne
+ csel x9,x5,x9,ne
+ ldp x4,x5,[sp,#0+32+32] // res
+ csel x10,x6,x10,ne
+ csel x11,x7,x11,ne
+ cmp x25,#0 // ~, remember?
+ ldp x6,x7,[sp,#0+32+48]
+ csel x14,x8,x14,ne
+ csel x15,x9,x15,ne
+ ldp x8,x9,[x23,#32+32] // in2
+ csel x16,x10,x16,ne
+ csel x17,x11,x17,ne
+ ldp x10,x11,[x23,#32+48]
+ stp x14,x15,[x21,#32]
+ stp x16,x17,[x21,#32+16]
+ ldp x14,x15,[x22,#64] // in1
+ cmp x24,#0 // ~, remember?
+ ldp x16,x17,[x22,#64+16]
+ csel x8,x4,x8,ne
+ csel x9,x5,x9,ne
+ csel x10,x6,x10,ne
+ csel x11,x7,x11,ne
+ cmp x25,#0 // ~, remember?
+ csel x14,x8,x14,ne
+ csel x15,x9,x15,ne
+ csel x16,x10,x16,ne
+ csel x17,x11,x17,ne
+ stp x14,x15,[x21,#64]
+ stp x16,x17,[x21,#64+16]
+
+ add sp,x29,#0 // destroy frame
+ ldp x19,x20,[x29,#16]
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x29,x30,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
+// uint64_t b[4]);
+.globl ecp_nistz256_ord_mul_mont
+
+.def ecp_nistz256_ord_mul_mont
+ .type 32
+.endef
+.align 4
+ecp_nistz256_ord_mul_mont:
+ AARCH64_VALID_CALL_TARGET
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+
+ adrp x23,Lord
+ add x23,x23,:lo12:Lord
+ ldr x3,[x2] // bp[0]
+ ldp x4,x5,[x1]
+ ldp x6,x7,[x1,#16]
+
+ ldp x12,x13,[x23,#0]
+ ldp x21,x22,[x23,#16]
+ ldr x23,[x23,#32]
+
+ mul x14,x4,x3 // a[0]*b[0]
+ umulh x8,x4,x3
+
+ mul x15,x5,x3 // a[1]*b[0]
+ umulh x9,x5,x3
+
+ mul x16,x6,x3 // a[2]*b[0]
+ umulh x10,x6,x3
+
+ mul x17,x7,x3 // a[3]*b[0]
+ umulh x19,x7,x3
+
+ mul x24,x14,x23
+
+ adds x15,x15,x8 // accumulate high parts of multiplication
+ adcs x16,x16,x9
+ adcs x17,x17,x10
+ adc x19,x19,xzr
+ mov x20,xzr
+ ldr x3,[x2,#8*1] // b[i]
+
+ lsl x8,x24,#32
+ subs x16,x16,x24
+ lsr x9,x24,#32
+ sbcs x17,x17,x8
+ sbcs x19,x19,x9
+ sbc x20,x20,xzr
+
+ subs xzr,x14,#1
+ umulh x9,x12,x24
+ mul x10,x13,x24
+ umulh x11,x13,x24
+
+ adcs x10,x10,x9
+ mul x8,x4,x3
+ adc x11,x11,xzr
+ mul x9,x5,x3
+
+ adds x14,x15,x10
+ mul x10,x6,x3
+ adcs x15,x16,x11
+ mul x11,x7,x3
+ adcs x16,x17,x24
+ adcs x17,x19,x24
+ adc x19,x20,xzr
+
+ adds x14,x14,x8 // accumulate low parts
+ umulh x8,x4,x3
+ adcs x15,x15,x9
+ umulh x9,x5,x3
+ adcs x16,x16,x10
+ umulh x10,x6,x3
+ adcs x17,x17,x11
+ umulh x11,x7,x3
+ adc x19,x19,xzr
+ mul x24,x14,x23
+ adds x15,x15,x8 // accumulate high parts
+ adcs x16,x16,x9
+ adcs x17,x17,x10
+ adcs x19,x19,x11
+ adc x20,xzr,xzr
+ ldr x3,[x2,#8*2] // b[i]
+
+ lsl x8,x24,#32
+ subs x16,x16,x24
+ lsr x9,x24,#32
+ sbcs x17,x17,x8
+ sbcs x19,x19,x9
+ sbc x20,x20,xzr
+
+ subs xzr,x14,#1
+ umulh x9,x12,x24
+ mul x10,x13,x24
+ umulh x11,x13,x24
+
+ adcs x10,x10,x9
+ mul x8,x4,x3
+ adc x11,x11,xzr
+ mul x9,x5,x3
+
+ adds x14,x15,x10
+ mul x10,x6,x3
+ adcs x15,x16,x11
+ mul x11,x7,x3
+ adcs x16,x17,x24
+ adcs x17,x19,x24
+ adc x19,x20,xzr
+
+ adds x14,x14,x8 // accumulate low parts
+ umulh x8,x4,x3
+ adcs x15,x15,x9
+ umulh x9,x5,x3
+ adcs x16,x16,x10
+ umulh x10,x6,x3
+ adcs x17,x17,x11
+ umulh x11,x7,x3
+ adc x19,x19,xzr
+ mul x24,x14,x23
+ adds x15,x15,x8 // accumulate high parts
+ adcs x16,x16,x9
+ adcs x17,x17,x10
+ adcs x19,x19,x11
+ adc x20,xzr,xzr
+ ldr x3,[x2,#8*3] // b[i]
+
+ lsl x8,x24,#32
+ subs x16,x16,x24
+ lsr x9,x24,#32
+ sbcs x17,x17,x8
+ sbcs x19,x19,x9
+ sbc x20,x20,xzr
+
+ subs xzr,x14,#1
+ umulh x9,x12,x24
+ mul x10,x13,x24
+ umulh x11,x13,x24
+
+ adcs x10,x10,x9
+ mul x8,x4,x3
+ adc x11,x11,xzr
+ mul x9,x5,x3
+
+ adds x14,x15,x10
+ mul x10,x6,x3
+ adcs x15,x16,x11
+ mul x11,x7,x3
+ adcs x16,x17,x24
+ adcs x17,x19,x24
+ adc x19,x20,xzr
+
+ adds x14,x14,x8 // accumulate low parts
+ umulh x8,x4,x3
+ adcs x15,x15,x9
+ umulh x9,x5,x3
+ adcs x16,x16,x10
+ umulh x10,x6,x3
+ adcs x17,x17,x11
+ umulh x11,x7,x3
+ adc x19,x19,xzr
+ mul x24,x14,x23
+ adds x15,x15,x8 // accumulate high parts
+ adcs x16,x16,x9
+ adcs x17,x17,x10
+ adcs x19,x19,x11
+ adc x20,xzr,xzr
+ lsl x8,x24,#32 // last reduction
+ subs x16,x16,x24
+ lsr x9,x24,#32
+ sbcs x17,x17,x8
+ sbcs x19,x19,x9
+ sbc x20,x20,xzr
+
+ subs xzr,x14,#1
+ umulh x9,x12,x24
+ mul x10,x13,x24
+ umulh x11,x13,x24
+
+ adcs x10,x10,x9
+ adc x11,x11,xzr
+
+ adds x14,x15,x10
+ adcs x15,x16,x11
+ adcs x16,x17,x24
+ adcs x17,x19,x24
+ adc x19,x20,xzr
+
+ subs x8,x14,x12 // ret -= modulus
+ sbcs x9,x15,x13
+ sbcs x10,x16,x21
+ sbcs x11,x17,x22
+ sbcs xzr,x19,xzr
+
+ csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
+ csel x15,x15,x9,lo
+ csel x16,x16,x10,lo
+ stp x14,x15,[x0]
+ csel x17,x17,x11,lo
+ stp x16,x17,[x0,#16]
+
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldr x29,[sp],#64
+ ret
+
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
+// uint64_t rep);
+.globl ecp_nistz256_ord_sqr_mont
+
+.def ecp_nistz256_ord_sqr_mont
+ .type 32
+.endef
+.align 4
+ecp_nistz256_ord_sqr_mont:
+ AARCH64_VALID_CALL_TARGET
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+
+ adrp x23,Lord
+ add x23,x23,:lo12:Lord
+ ldp x4,x5,[x1]
+ ldp x6,x7,[x1,#16]
+
+ ldp x12,x13,[x23,#0]
+ ldp x21,x22,[x23,#16]
+ ldr x23,[x23,#32]
+ b Loop_ord_sqr
+
+.align 4
+Loop_ord_sqr:
+ sub x2,x2,#1
+ ////////////////////////////////////////////////////////////////
+ // | | | | | |a1*a0| |
+ // | | | | |a2*a0| | |
+ // | |a3*a2|a3*a0| | | |
+ // | | | |a2*a1| | | |
+ // | | |a3*a1| | | | |
+ // *| | | | | | | | 2|
+ // +|a3*a3|a2*a2|a1*a1|a0*a0|
+ // |--+--+--+--+--+--+--+--|
+	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is the x-th 64-bit accumulator word
+ //
+ // "can't overflow" below mark carrying into high part of
+ // multiplication result, which can't overflow, because it
+ // can never be all ones.
+
+ mul x15,x5,x4 // a[1]*a[0]
+ umulh x9,x5,x4
+ mul x16,x6,x4 // a[2]*a[0]
+ umulh x10,x6,x4
+ mul x17,x7,x4 // a[3]*a[0]
+ umulh x19,x7,x4
+
+ adds x16,x16,x9 // accumulate high parts of multiplication
+ mul x8,x6,x5 // a[2]*a[1]
+ umulh x9,x6,x5
+ adcs x17,x17,x10
+ mul x10,x7,x5 // a[3]*a[1]
+ umulh x11,x7,x5
+ adc x19,x19,xzr // can't overflow
+
+ mul x20,x7,x6 // a[3]*a[2]
+ umulh x1,x7,x6
+
+ adds x9,x9,x10 // accumulate high parts of multiplication
+ mul x14,x4,x4 // a[0]*a[0]
+ adc x10,x11,xzr // can't overflow
+
+ adds x17,x17,x8 // accumulate low parts of multiplication
+ umulh x4,x4,x4
+ adcs x19,x19,x9
+ mul x9,x5,x5 // a[1]*a[1]
+ adcs x20,x20,x10
+ umulh x5,x5,x5
+ adc x1,x1,xzr // can't overflow
+
+ adds x15,x15,x15 // acc[1-6]*=2
+ mul x10,x6,x6 // a[2]*a[2]
+ adcs x16,x16,x16
+ umulh x6,x6,x6
+ adcs x17,x17,x17
+ mul x11,x7,x7 // a[3]*a[3]
+ adcs x19,x19,x19
+ umulh x7,x7,x7
+ adcs x20,x20,x20
+ adcs x1,x1,x1
+ adc x3,xzr,xzr
+
+ adds x15,x15,x4 // +a[i]*a[i]
+ mul x24,x14,x23
+ adcs x16,x16,x9
+ adcs x17,x17,x5
+ adcs x19,x19,x10
+ adcs x20,x20,x6
+ adcs x1,x1,x11
+ adc x3,x3,x7
+ subs xzr,x14,#1
+ umulh x9,x12,x24
+ mul x10,x13,x24
+ umulh x11,x13,x24
+
+ adcs x10,x10,x9
+ adc x11,x11,xzr
+
+ adds x14,x15,x10
+ adcs x15,x16,x11
+ adcs x16,x17,x24
+ adc x17,xzr,x24 // can't overflow
+ mul x11,x14,x23
+ lsl x8,x24,#32
+ subs x15,x15,x24
+ lsr x9,x24,#32
+ sbcs x16,x16,x8
+ sbc x17,x17,x9 // can't borrow
+ subs xzr,x14,#1
+ umulh x9,x12,x11
+ mul x10,x13,x11
+ umulh x24,x13,x11
+
+ adcs x10,x10,x9
+ adc x24,x24,xzr
+
+ adds x14,x15,x10
+ adcs x15,x16,x24
+ adcs x16,x17,x11
+ adc x17,xzr,x11 // can't overflow
+ mul x24,x14,x23
+ lsl x8,x11,#32
+ subs x15,x15,x11
+ lsr x9,x11,#32
+ sbcs x16,x16,x8
+ sbc x17,x17,x9 // can't borrow
+ subs xzr,x14,#1
+ umulh x9,x12,x24
+ mul x10,x13,x24
+ umulh x11,x13,x24
+
+ adcs x10,x10,x9
+ adc x11,x11,xzr
+
+ adds x14,x15,x10
+ adcs x15,x16,x11
+ adcs x16,x17,x24
+ adc x17,xzr,x24 // can't overflow
+ mul x11,x14,x23
+ lsl x8,x24,#32
+ subs x15,x15,x24
+ lsr x9,x24,#32
+ sbcs x16,x16,x8
+ sbc x17,x17,x9 // can't borrow
+ subs xzr,x14,#1
+ umulh x9,x12,x11
+ mul x10,x13,x11
+ umulh x24,x13,x11
+
+ adcs x10,x10,x9
+ adc x24,x24,xzr
+
+ adds x14,x15,x10
+ adcs x15,x16,x24
+ adcs x16,x17,x11
+ adc x17,xzr,x11 // can't overflow
+ lsl x8,x11,#32
+ subs x15,x15,x11
+ lsr x9,x11,#32
+ sbcs x16,x16,x8
+ sbc x17,x17,x9 // can't borrow
+ adds x14,x14,x19 // accumulate upper half
+ adcs x15,x15,x20
+ adcs x16,x16,x1
+ adcs x17,x17,x3
+ adc x19,xzr,xzr
+
+ subs x8,x14,x12 // ret -= modulus
+ sbcs x9,x15,x13
+ sbcs x10,x16,x21
+ sbcs x11,x17,x22
+ sbcs xzr,x19,xzr
+
+ csel x4,x14,x8,lo // ret = borrow ? ret : ret-modulus
+ csel x5,x15,x9,lo
+ csel x6,x16,x10,lo
+ csel x7,x17,x11,lo
+
+ cbnz x2,Loop_ord_sqr
+
+ stp x4,x5,[x0]
+ stp x6,x7,[x0,#16]
+
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldr x29,[sp],#64
+ ret
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_select_w5
+
+.def ecp_nistz256_select_w5
+ .type 32
+.endef
+.align 4
+ecp_nistz256_select_w5:
+ AARCH64_VALID_CALL_TARGET
+
+ // x10 := x0
+ // w9 := 0; loop counter and incremented internal index
+ mov x10, x0
+ mov w9, #0
+
+ // [v16-v21] := 0
+ movi v16.16b, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
+ movi v19.16b, #0
+ movi v20.16b, #0
+ movi v21.16b, #0
+
+Lselect_w5_loop:
+ // Loop 16 times.
+
+ // Increment index (loop counter); tested at the end of the loop
+ add w9, w9, #1
+
+ // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1
+ // and advance x1 to point to the next entry
+ ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
+
+ // x11 := (w9 == w2)? All 1s : All 0s
+ cmp w9, w2
+ csetm x11, eq
+
+ // continue loading ...
+ ld1 {v26.2d, v27.2d}, [x1],#32
+
+ // duplicate mask_64 into Mask (all 0s or all 1s)
+ dup v3.2d, x11
+
+ // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
+ // i.e., values in output registers will remain the same if w9 != w2
+ bit v16.16b, v22.16b, v3.16b
+ bit v17.16b, v23.16b, v3.16b
+
+ bit v18.16b, v24.16b, v3.16b
+ bit v19.16b, v25.16b, v3.16b
+
+ bit v20.16b, v26.16b, v3.16b
+ bit v21.16b, v27.16b, v3.16b
+
+	// If bit #4 is 0 (i.e. idx_ctr < 16) loop back
+ tbz w9, #4, Lselect_w5_loop
+
+ // Write [v16-v21] to memory at the output pointer
+ st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64
+ st1 {v20.2d, v21.2d}, [x10]
+
+ ret
+
+
+
+////////////////////////////////////////////////////////////////////////
+// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_select_w7
+
+.def ecp_nistz256_select_w7
+ .type 32
+.endef
+.align 4
+ecp_nistz256_select_w7:
+ AARCH64_VALID_CALL_TARGET
+
+ // w9 := 0; loop counter and incremented internal index
+ mov w9, #0
+
+ // [v16-v21] := 0
+ movi v16.16b, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
+ movi v19.16b, #0
+
+Lselect_w7_loop:
+ // Loop 64 times.
+
+ // Increment index (loop counter); tested at the end of the loop
+ add w9, w9, #1
+
+ // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1
+ // and advance x1 to point to the next entry
+ ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
+
+ // x11 := (w9 == w2)? All 1s : All 0s
+ cmp w9, w2
+ csetm x11, eq
+
+ // duplicate mask_64 into Mask (all 0s or all 1s)
+ dup v3.2d, x11
+
+ // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
+ // i.e., values in output registers will remain the same if w9 != w2
+ bit v16.16b, v22.16b, v3.16b
+ bit v17.16b, v23.16b, v3.16b
+
+ bit v18.16b, v24.16b, v3.16b
+ bit v19.16b, v25.16b, v3.16b
+
+	// If bit #6 is 0 (i.e. idx_ctr < 64) loop back
+ tbz w9, #6, Lselect_w7_loop
+
+ // Write [v16-v19] to memory at the output pointer
+ st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0]
+
+ ret
+
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/p256-x86_64-asm-apple.S b/gen/bcm/p256-x86_64-asm-apple.S
new file mode 100644
index 0000000..81cb582
--- /dev/null
+++ b/gen/bcm/p256-x86_64-asm-apple.S
@@ -0,0 +1,4473 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text
+
+
+
+.section __DATA,__const
+.p2align 6
+L$poly:
+.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
+
+L$One:
+.long 1,1,1,1,1,1,1,1
+L$Two:
+.long 2,2,2,2,2,2,2,2
+L$Three:
+.long 3,3,3,3,3,3,3,3
+L$ONE_mont:
+.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
+
+
+L$ord:
+.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
+L$ordK:
+.quad 0xccd1c8aaee00bc4f
+.text
+
+
+
+.globl _ecp_nistz256_neg
+.private_extern _ecp_nistz256_neg
+
+.p2align 5
+_ecp_nistz256_neg:
+
+_CET_ENDBR
+ pushq %r12
+
+ pushq %r13
+
+L$neg_body:
+
+ xorq %r8,%r8
+ xorq %r9,%r9
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %r13,%r13
+
+ subq 0(%rsi),%r8
+ sbbq 8(%rsi),%r9
+ sbbq 16(%rsi),%r10
+ movq %r8,%rax
+ sbbq 24(%rsi),%r11
+ leaq L$poly(%rip),%rsi
+ movq %r9,%rdx
+ sbbq $0,%r13
+
+ addq 0(%rsi),%r8
+ movq %r10,%rcx
+ adcq 8(%rsi),%r9
+ adcq 16(%rsi),%r10
+ movq %r11,%r12
+ adcq 24(%rsi),%r11
+ testq %r13,%r13
+
+ cmovzq %rax,%r8
+ cmovzq %rdx,%r9
+ movq %r8,0(%rdi)
+ cmovzq %rcx,%r10
+ movq %r9,8(%rdi)
+ cmovzq %r12,%r11
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+
+ movq 0(%rsp),%r13
+
+ movq 8(%rsp),%r12
+
+ leaq 16(%rsp),%rsp
+
+L$neg_epilogue:
+ ret
+
+
+
+
+
+
+
+
+.globl _ecp_nistz256_ord_mul_mont
+.private_extern _ecp_nistz256_ord_mul_mont
+
+.p2align 5
+_ecp_nistz256_ord_mul_mont:
+
+_CET_ENDBR
+ leaq _OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ je L$ecp_nistz256_ord_mul_montx
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+L$ord_mul_body:
+
+ movq 0(%rdx),%rax
+ movq %rdx,%rbx
+ leaq L$ord(%rip),%r14
+ movq L$ordK(%rip),%r15
+
+
+ movq %rax,%rcx
+ mulq 0(%rsi)
+ movq %rax,%r8
+ movq %rcx,%rax
+ movq %rdx,%r9
+
+ mulq 8(%rsi)
+ addq %rax,%r9
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq 16(%rsi)
+ addq %rax,%r10
+ movq %rcx,%rax
+ adcq $0,%rdx
+
+ movq %r8,%r13
+ imulq %r15,%r8
+
+ movq %rdx,%r11
+ mulq 24(%rsi)
+ addq %rax,%r11
+ movq %r8,%rax
+ adcq $0,%rdx
+ movq %rdx,%r12
+
+
+ mulq 0(%r14)
+ movq %r8,%rbp
+ addq %rax,%r13
+ movq %r8,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ subq %r8,%r10
+ sbbq $0,%r8
+
+ mulq 8(%r14)
+ addq %rcx,%r9
+ adcq $0,%rdx
+ addq %rax,%r9
+ movq %rbp,%rax
+ adcq %rdx,%r10
+ movq %rbp,%rdx
+ adcq $0,%r8
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r11
+ movq 8(%rbx),%rax
+ sbbq %rdx,%rbp
+
+ addq %r8,%r11
+ adcq %rbp,%r12
+ adcq $0,%r13
+
+
+ movq %rax,%rcx
+ mulq 0(%rsi)
+ addq %rax,%r9
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 8(%rsi)
+ addq %rbp,%r10
+ adcq $0,%rdx
+ addq %rax,%r10
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 16(%rsi)
+ addq %rbp,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %rcx,%rax
+ adcq $0,%rdx
+
+ movq %r9,%rcx
+ imulq %r15,%r9
+
+ movq %rdx,%rbp
+ mulq 24(%rsi)
+ addq %rbp,%r12
+ adcq $0,%rdx
+ xorq %r8,%r8
+ addq %rax,%r12
+ movq %r9,%rax
+ adcq %rdx,%r13
+ adcq $0,%r8
+
+
+ mulq 0(%r14)
+ movq %r9,%rbp
+ addq %rax,%rcx
+ movq %r9,%rax
+ adcq %rdx,%rcx
+
+ subq %r9,%r11
+ sbbq $0,%r9
+
+ mulq 8(%r14)
+ addq %rcx,%r10
+ adcq $0,%rdx
+ addq %rax,%r10
+ movq %rbp,%rax
+ adcq %rdx,%r11
+ movq %rbp,%rdx
+ adcq $0,%r9
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r12
+ movq 16(%rbx),%rax
+ sbbq %rdx,%rbp
+
+ addq %r9,%r12
+ adcq %rbp,%r13
+ adcq $0,%r8
+
+
+ movq %rax,%rcx
+ mulq 0(%rsi)
+ addq %rax,%r10
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 8(%rsi)
+ addq %rbp,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 16(%rsi)
+ addq %rbp,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %rcx,%rax
+ adcq $0,%rdx
+
+ movq %r10,%rcx
+ imulq %r15,%r10
+
+ movq %rdx,%rbp
+ mulq 24(%rsi)
+ addq %rbp,%r13
+ adcq $0,%rdx
+ xorq %r9,%r9
+ addq %rax,%r13
+ movq %r10,%rax
+ adcq %rdx,%r8
+ adcq $0,%r9
+
+
+ mulq 0(%r14)
+ movq %r10,%rbp
+ addq %rax,%rcx
+ movq %r10,%rax
+ adcq %rdx,%rcx
+
+ subq %r10,%r12
+ sbbq $0,%r10
+
+ mulq 8(%r14)
+ addq %rcx,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %rbp,%rax
+ adcq %rdx,%r12
+ movq %rbp,%rdx
+ adcq $0,%r10
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r13
+ movq 24(%rbx),%rax
+ sbbq %rdx,%rbp
+
+ addq %r10,%r13
+ adcq %rbp,%r8
+ adcq $0,%r9
+
+
+ movq %rax,%rcx
+ mulq 0(%rsi)
+ addq %rax,%r11
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 8(%rsi)
+ addq %rbp,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 16(%rsi)
+ addq %rbp,%r13
+ adcq $0,%rdx
+ addq %rax,%r13
+ movq %rcx,%rax
+ adcq $0,%rdx
+
+ movq %r11,%rcx
+ imulq %r15,%r11
+
+ movq %rdx,%rbp
+ mulq 24(%rsi)
+ addq %rbp,%r8
+ adcq $0,%rdx
+ xorq %r10,%r10
+ addq %rax,%r8
+ movq %r11,%rax
+ adcq %rdx,%r9
+ adcq $0,%r10
+
+
+ mulq 0(%r14)
+ movq %r11,%rbp
+ addq %rax,%rcx
+ movq %r11,%rax
+ adcq %rdx,%rcx
+
+ subq %r11,%r13
+ sbbq $0,%r11
+
+ mulq 8(%r14)
+ addq %rcx,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %rbp,%rax
+ adcq %rdx,%r13
+ movq %rbp,%rdx
+ adcq $0,%r11
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r8
+ sbbq %rdx,%rbp
+
+ addq %r11,%r8
+ adcq %rbp,%r9
+ adcq $0,%r10
+
+
+ movq %r12,%rsi
+ subq 0(%r14),%r12
+ movq %r13,%r11
+ sbbq 8(%r14),%r13
+ movq %r8,%rcx
+ sbbq 16(%r14),%r8
+ movq %r9,%rbp
+ sbbq 24(%r14),%r9
+ sbbq $0,%r10
+
+ cmovcq %rsi,%r12
+ cmovcq %r11,%r13
+ cmovcq %rcx,%r8
+ cmovcq %rbp,%r9
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ movq 0(%rsp),%r15
+
+ movq 8(%rsp),%r14
+
+ movq 16(%rsp),%r13
+
+ movq 24(%rsp),%r12
+
+ movq 32(%rsp),%rbx
+
+ movq 40(%rsp),%rbp
+
+ leaq 48(%rsp),%rsp
+
+L$ord_mul_epilogue:
+ ret
+
+
+
+
+
+
+
+
+
+.globl _ecp_nistz256_ord_sqr_mont
+.private_extern _ecp_nistz256_ord_sqr_mont
+
+.p2align 5
+_ecp_nistz256_ord_sqr_mont:
+
+_CET_ENDBR
+ leaq _OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ je L$ecp_nistz256_ord_sqr_montx
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+L$ord_sqr_body:
+
+ movq 0(%rsi),%r8
+ movq 8(%rsi),%rax
+ movq 16(%rsi),%r14
+ movq 24(%rsi),%r15
+ leaq L$ord(%rip),%rsi
+ movq %rdx,%rbx
+ jmp L$oop_ord_sqr
+
+.p2align 5
+L$oop_ord_sqr:
+
+ movq %rax,%rbp
+ mulq %r8
+ movq %rax,%r9
+.byte 102,72,15,110,205
+ movq %r14,%rax
+ movq %rdx,%r10
+
+ mulq %r8
+ addq %rax,%r10
+ movq %r15,%rax
+.byte 102,73,15,110,214
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %r8
+ addq %rax,%r11
+ movq %r15,%rax
+.byte 102,73,15,110,223
+ adcq $0,%rdx
+ movq %rdx,%r12
+
+
+ mulq %r14
+ movq %rax,%r13
+ movq %r14,%rax
+ movq %rdx,%r14
+
+
+ mulq %rbp
+ addq %rax,%r11
+ movq %r15,%rax
+ adcq $0,%rdx
+ movq %rdx,%r15
+
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+
+ addq %r15,%r12
+ adcq %rdx,%r13
+ adcq $0,%r14
+
+
+ xorq %r15,%r15
+ movq %r8,%rax
+ addq %r9,%r9
+ adcq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq $0,%r15
+
+
+ mulq %rax
+ movq %rax,%r8
+.byte 102,72,15,126,200
+ movq %rdx,%rbp
+
+ mulq %rax
+ addq %rbp,%r9
+ adcq %rax,%r10
+.byte 102,72,15,126,208
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq %rax
+ addq %rbp,%r11
+ adcq %rax,%r12
+.byte 102,72,15,126,216
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ movq %r8,%rcx
+ imulq 32(%rsi),%r8
+
+ mulq %rax
+ addq %rbp,%r13
+ adcq %rax,%r14
+ movq 0(%rsi),%rax
+ adcq %rdx,%r15
+
+
+ mulq %r8
+ movq %r8,%rbp
+ addq %rax,%rcx
+ movq 8(%rsi),%rax
+ adcq %rdx,%rcx
+
+ subq %r8,%r10
+ sbbq $0,%rbp
+
+ mulq %r8
+ addq %rcx,%r9
+ adcq $0,%rdx
+ addq %rax,%r9
+ movq %r8,%rax
+ adcq %rdx,%r10
+ movq %r8,%rdx
+ adcq $0,%rbp
+
+ movq %r9,%rcx
+ imulq 32(%rsi),%r9
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r11
+ movq 0(%rsi),%rax
+ sbbq %rdx,%r8
+
+ addq %rbp,%r11
+ adcq $0,%r8
+
+
+ mulq %r9
+ movq %r9,%rbp
+ addq %rax,%rcx
+ movq 8(%rsi),%rax
+ adcq %rdx,%rcx
+
+ subq %r9,%r11
+ sbbq $0,%rbp
+
+ mulq %r9
+ addq %rcx,%r10
+ adcq $0,%rdx
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+ movq %r9,%rdx
+ adcq $0,%rbp
+
+ movq %r10,%rcx
+ imulq 32(%rsi),%r10
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r8
+ movq 0(%rsi),%rax
+ sbbq %rdx,%r9
+
+ addq %rbp,%r8
+ adcq $0,%r9
+
+
+ mulq %r10
+ movq %r10,%rbp
+ addq %rax,%rcx
+ movq 8(%rsi),%rax
+ adcq %rdx,%rcx
+
+ subq %r10,%r8
+ sbbq $0,%rbp
+
+ mulq %r10
+ addq %rcx,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %r10,%rax
+ adcq %rdx,%r8
+ movq %r10,%rdx
+ adcq $0,%rbp
+
+ movq %r11,%rcx
+ imulq 32(%rsi),%r11
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r9
+ movq 0(%rsi),%rax
+ sbbq %rdx,%r10
+
+ addq %rbp,%r9
+ adcq $0,%r10
+
+
+ mulq %r11
+ movq %r11,%rbp
+ addq %rax,%rcx
+ movq 8(%rsi),%rax
+ adcq %rdx,%rcx
+
+ subq %r11,%r9
+ sbbq $0,%rbp
+
+ mulq %r11
+ addq %rcx,%r8
+ adcq $0,%rdx
+ addq %rax,%r8
+ movq %r11,%rax
+ adcq %rdx,%r9
+ movq %r11,%rdx
+ adcq $0,%rbp
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r10
+ sbbq %rdx,%r11
+
+ addq %rbp,%r10
+ adcq $0,%r11
+
+
+ xorq %rdx,%rdx
+ addq %r12,%r8
+ adcq %r13,%r9
+ movq %r8,%r12
+ adcq %r14,%r10
+ adcq %r15,%r11
+ movq %r9,%rax
+ adcq $0,%rdx
+
+
+ subq 0(%rsi),%r8
+ movq %r10,%r14
+ sbbq 8(%rsi),%r9
+ sbbq 16(%rsi),%r10
+ movq %r11,%r15
+ sbbq 24(%rsi),%r11
+ sbbq $0,%rdx
+
+ cmovcq %r12,%r8
+ cmovncq %r9,%rax
+ cmovncq %r10,%r14
+ cmovncq %r11,%r15
+
+ decq %rbx
+ jnz L$oop_ord_sqr
+
+ movq %r8,0(%rdi)
+ movq %rax,8(%rdi)
+ pxor %xmm1,%xmm1
+ movq %r14,16(%rdi)
+ pxor %xmm2,%xmm2
+ movq %r15,24(%rdi)
+ pxor %xmm3,%xmm3
+
+ movq 0(%rsp),%r15
+
+ movq 8(%rsp),%r14
+
+ movq 16(%rsp),%r13
+
+ movq 24(%rsp),%r12
+
+ movq 32(%rsp),%rbx
+
+ movq 40(%rsp),%rbp
+
+ leaq 48(%rsp),%rsp
+
+L$ord_sqr_epilogue:
+ ret
+
+
+
+
+.p2align 5
+ecp_nistz256_ord_mul_montx:
+
+L$ecp_nistz256_ord_mul_montx:
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+L$ord_mulx_body:
+
+ movq %rdx,%rbx
+ movq 0(%rdx),%rdx
+ movq 0(%rsi),%r9
+ movq 8(%rsi),%r10
+ movq 16(%rsi),%r11
+ movq 24(%rsi),%r12
+ leaq -128(%rsi),%rsi
+ leaq L$ord-128(%rip),%r14
+ movq L$ordK(%rip),%r15
+
+
+ mulxq %r9,%r8,%r9
+ mulxq %r10,%rcx,%r10
+ mulxq %r11,%rbp,%r11
+ addq %rcx,%r9
+ mulxq %r12,%rcx,%r12
+ movq %r8,%rdx
+ mulxq %r15,%rdx,%rax
+ adcq %rbp,%r10
+ adcq %rcx,%r11
+ adcq $0,%r12
+
+
+ xorq %r13,%r13
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 8(%rbx),%rdx
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+ adcxq %r8,%r12
+ adoxq %r8,%r13
+ adcq $0,%r13
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r9,%rdx
+ mulxq %r15,%rdx,%rax
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ adcxq %r8,%r13
+ adoxq %r8,%r8
+ adcq $0,%r8
+
+
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 16(%rbx),%rdx
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+ adcxq %r9,%r13
+ adoxq %r9,%r8
+ adcq $0,%r8
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r10,%rdx
+ mulxq %r15,%rdx,%rax
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ adcxq %r9,%r8
+ adoxq %r9,%r9
+ adcq $0,%r9
+
+
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 24(%rbx),%rdx
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+ adcxq %r10,%r8
+ adoxq %r10,%r9
+ adcq $0,%r9
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r11,%rdx
+ mulxq %r15,%rdx,%rax
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+
+ adcxq %r10,%r9
+ adoxq %r10,%r10
+ adcq $0,%r10
+
+
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ leaq 128(%r14),%r14
+ movq %r12,%rbx
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ movq %r13,%rdx
+ adcxq %r11,%r9
+ adoxq %r11,%r10
+ adcq $0,%r10
+
+
+
+ movq %r8,%rcx
+ subq 0(%r14),%r12
+ sbbq 8(%r14),%r13
+ sbbq 16(%r14),%r8
+ movq %r9,%rbp
+ sbbq 24(%r14),%r9
+ sbbq $0,%r10
+
+ cmovcq %rbx,%r12
+ cmovcq %rdx,%r13
+ cmovcq %rcx,%r8
+ cmovcq %rbp,%r9
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ movq 0(%rsp),%r15
+
+ movq 8(%rsp),%r14
+
+ movq 16(%rsp),%r13
+
+ movq 24(%rsp),%r12
+
+ movq 32(%rsp),%rbx
+
+ movq 40(%rsp),%rbp
+
+ leaq 48(%rsp),%rsp
+
+L$ord_mulx_epilogue:
+ ret
+
+
+
+
+.p2align 5
+ecp_nistz256_ord_sqr_montx:
+
+L$ecp_nistz256_ord_sqr_montx:
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+L$ord_sqrx_body:
+
+ movq %rdx,%rbx
+ movq 0(%rsi),%rdx
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r15
+ movq 24(%rsi),%r8
+ leaq L$ord(%rip),%rsi
+ jmp L$oop_ord_sqrx
+
+.p2align 5
+L$oop_ord_sqrx:
+ mulxq %r14,%r9,%r10
+ mulxq %r15,%rcx,%r11
+ movq %rdx,%rax
+.byte 102,73,15,110,206
+ mulxq %r8,%rbp,%r12
+ movq %r14,%rdx
+ addq %rcx,%r10
+.byte 102,73,15,110,215
+ adcq %rbp,%r11
+ adcq $0,%r12
+ xorq %r13,%r13
+
+ mulxq %r15,%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq %r8,%rcx,%rbp
+ movq %r15,%rdx
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+ adcq $0,%r13
+
+ mulxq %r8,%rcx,%r14
+ movq %rax,%rdx
+.byte 102,73,15,110,216
+ xorq %r15,%r15
+ adcxq %r9,%r9
+ adoxq %rcx,%r13
+ adcxq %r10,%r10
+ adoxq %r15,%r14
+
+
+ mulxq %rdx,%r8,%rbp
+.byte 102,72,15,126,202
+ adcxq %r11,%r11
+ adoxq %rbp,%r9
+ adcxq %r12,%r12
+ mulxq %rdx,%rcx,%rax
+.byte 102,72,15,126,210
+ adcxq %r13,%r13
+ adoxq %rcx,%r10
+ adcxq %r14,%r14
+ mulxq %rdx,%rcx,%rbp
+.byte 0x67
+.byte 102,72,15,126,218
+ adoxq %rax,%r11
+ adcxq %r15,%r15
+ adoxq %rcx,%r12
+ adoxq %rbp,%r13
+ mulxq %rdx,%rcx,%rax
+ adoxq %rcx,%r14
+ adoxq %rax,%r15
+
+
+ movq %r8,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ xorq %rax,%rax
+ mulxq 0(%rsi),%rcx,%rbp
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ mulxq 8(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+ mulxq 16(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+ mulxq 24(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r8
+ adcxq %rax,%r8
+
+
+ movq %r9,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ mulxq 0(%rsi),%rcx,%rbp
+ adoxq %rcx,%r9
+ adcxq %rbp,%r10
+ mulxq 8(%rsi),%rcx,%rbp
+ adoxq %rcx,%r10
+ adcxq %rbp,%r11
+ mulxq 16(%rsi),%rcx,%rbp
+ adoxq %rcx,%r11
+ adcxq %rbp,%r8
+ mulxq 24(%rsi),%rcx,%rbp
+ adoxq %rcx,%r8
+ adcxq %rbp,%r9
+ adoxq %rax,%r9
+
+
+ movq %r10,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ mulxq 0(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+ mulxq 8(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r8
+ mulxq 16(%rsi),%rcx,%rbp
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ mulxq 24(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+ adcxq %rax,%r10
+
+
+ movq %r11,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ mulxq 0(%rsi),%rcx,%rbp
+ adoxq %rcx,%r11
+ adcxq %rbp,%r8
+ mulxq 8(%rsi),%rcx,%rbp
+ adoxq %rcx,%r8
+ adcxq %rbp,%r9
+ mulxq 16(%rsi),%rcx,%rbp
+ adoxq %rcx,%r9
+ adcxq %rbp,%r10
+ mulxq 24(%rsi),%rcx,%rbp
+ adoxq %rcx,%r10
+ adcxq %rbp,%r11
+ adoxq %rax,%r11
+
+
+ addq %r8,%r12
+ adcq %r13,%r9
+ movq %r12,%rdx
+ adcq %r14,%r10
+ adcq %r15,%r11
+ movq %r9,%r14
+ adcq $0,%rax
+
+
+ subq 0(%rsi),%r12
+ movq %r10,%r15
+ sbbq 8(%rsi),%r9
+ sbbq 16(%rsi),%r10
+ movq %r11,%r8
+ sbbq 24(%rsi),%r11
+ sbbq $0,%rax
+
+ cmovncq %r12,%rdx
+ cmovncq %r9,%r14
+ cmovncq %r10,%r15
+ cmovncq %r11,%r8
+
+ decq %rbx
+ jnz L$oop_ord_sqrx
+
+ movq %rdx,0(%rdi)
+ movq %r14,8(%rdi)
+ pxor %xmm1,%xmm1
+ movq %r15,16(%rdi)
+ pxor %xmm2,%xmm2
+ movq %r8,24(%rdi)
+ pxor %xmm3,%xmm3
+
+ movq 0(%rsp),%r15
+
+ movq 8(%rsp),%r14
+
+ movq 16(%rsp),%r13
+
+ movq 24(%rsp),%r12
+
+ movq 32(%rsp),%rbx
+
+ movq 40(%rsp),%rbp
+
+ leaq 48(%rsp),%rsp
+
+L$ord_sqrx_epilogue:
+ ret
+
+
+
+
+
+
+
+
+.globl _ecp_nistz256_mul_mont
+.private_extern _ecp_nistz256_mul_mont
+
+.p2align 5
+_ecp_nistz256_mul_mont:
+
+_CET_ENDBR
+ leaq _OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+L$mul_mont:
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+L$mul_body:
+ cmpl $0x80100,%ecx
+ je L$mul_montx
+ movq %rdx,%rbx
+ movq 0(%rdx),%rax
+ movq 0(%rsi),%r9
+ movq 8(%rsi),%r10
+ movq 16(%rsi),%r11
+ movq 24(%rsi),%r12
+
+ call __ecp_nistz256_mul_montq
+ jmp L$mul_mont_done
+
+.p2align 5
+L$mul_montx:
+ movq %rdx,%rbx
+ movq 0(%rdx),%rdx
+ movq 0(%rsi),%r9
+ movq 8(%rsi),%r10
+ movq 16(%rsi),%r11
+ movq 24(%rsi),%r12
+ leaq -128(%rsi),%rsi
+
+ call __ecp_nistz256_mul_montx
+L$mul_mont_done:
+ movq 0(%rsp),%r15
+
+ movq 8(%rsp),%r14
+
+ movq 16(%rsp),%r13
+
+ movq 24(%rsp),%r12
+
+ movq 32(%rsp),%rbx
+
+ movq 40(%rsp),%rbp
+
+ leaq 48(%rsp),%rsp
+
+L$mul_epilogue:
+ ret
+
+
+
+
+.p2align 5
+__ecp_nistz256_mul_montq:
+
+
+
+ movq %rax,%rbp
+ mulq %r9
+ movq L$poly+8(%rip),%r14
+ movq %rax,%r8
+ movq %rbp,%rax
+ movq %rdx,%r9
+
+ mulq %r10
+ movq L$poly+24(%rip),%r15
+ addq %rax,%r9
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %r11
+ addq %rax,%r10
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %r12
+ addq %rax,%r11
+ movq %r8,%rax
+ adcq $0,%rdx
+ xorq %r13,%r13
+ movq %rdx,%r12
+
+
+
+
+
+
+
+
+
+
+ movq %r8,%rbp
+ shlq $32,%r8
+ mulq %r15
+ shrq $32,%rbp
+ addq %r8,%r9
+ adcq %rbp,%r10
+ adcq %rax,%r11
+ movq 8(%rbx),%rax
+ adcq %rdx,%r12
+ adcq $0,%r13
+ xorq %r8,%r8
+
+
+
+ movq %rax,%rbp
+ mulq 0(%rsi)
+ addq %rax,%r9
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 8(%rsi)
+ addq %rcx,%r10
+ adcq $0,%rdx
+ addq %rax,%r10
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 16(%rsi)
+ addq %rcx,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 24(%rsi)
+ addq %rcx,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %r9,%rax
+ adcq %rdx,%r13
+ adcq $0,%r8
+
+
+
+ movq %r9,%rbp
+ shlq $32,%r9
+ mulq %r15
+ shrq $32,%rbp
+ addq %r9,%r10
+ adcq %rbp,%r11
+ adcq %rax,%r12
+ movq 16(%rbx),%rax
+ adcq %rdx,%r13
+ adcq $0,%r8
+ xorq %r9,%r9
+
+
+
+ movq %rax,%rbp
+ mulq 0(%rsi)
+ addq %rax,%r10
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 8(%rsi)
+ addq %rcx,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 16(%rsi)
+ addq %rcx,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 24(%rsi)
+ addq %rcx,%r13
+ adcq $0,%rdx
+ addq %rax,%r13
+ movq %r10,%rax
+ adcq %rdx,%r8
+ adcq $0,%r9
+
+
+
+ movq %r10,%rbp
+ shlq $32,%r10
+ mulq %r15
+ shrq $32,%rbp
+ addq %r10,%r11
+ adcq %rbp,%r12
+ adcq %rax,%r13
+ movq 24(%rbx),%rax
+ adcq %rdx,%r8
+ adcq $0,%r9
+ xorq %r10,%r10
+
+
+
+ movq %rax,%rbp
+ mulq 0(%rsi)
+ addq %rax,%r11
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 8(%rsi)
+ addq %rcx,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 16(%rsi)
+ addq %rcx,%r13
+ adcq $0,%rdx
+ addq %rax,%r13
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 24(%rsi)
+ addq %rcx,%r8
+ adcq $0,%rdx
+ addq %rax,%r8
+ movq %r11,%rax
+ adcq %rdx,%r9
+ adcq $0,%r10
+
+
+
+ movq %r11,%rbp
+ shlq $32,%r11
+ mulq %r15
+ shrq $32,%rbp
+ addq %r11,%r12
+ adcq %rbp,%r13
+ movq %r12,%rcx
+ adcq %rax,%r8
+ adcq %rdx,%r9
+ movq %r13,%rbp
+ adcq $0,%r10
+
+
+
+ subq $-1,%r12
+ movq %r8,%rbx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%rdx
+ sbbq %r15,%r9
+ sbbq $0,%r10
+
+ cmovcq %rcx,%r12
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rbx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %rdx,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ ret
+
+
+
+
+
+
+
+
+
+
+.globl _ecp_nistz256_sqr_mont
+.private_extern _ecp_nistz256_sqr_mont
+
+.p2align 5
+_ecp_nistz256_sqr_mont:
+
+_CET_ENDBR
+ leaq _OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+L$sqr_body:
+ cmpl $0x80100,%ecx
+ je L$sqr_montx
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r15
+ movq 24(%rsi),%r8
+
+ call __ecp_nistz256_sqr_montq
+ jmp L$sqr_mont_done
+
+.p2align 5
+L$sqr_montx:
+ movq 0(%rsi),%rdx
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r15
+ movq 24(%rsi),%r8
+ leaq -128(%rsi),%rsi
+
+ call __ecp_nistz256_sqr_montx
+L$sqr_mont_done:
+ movq 0(%rsp),%r15
+
+ movq 8(%rsp),%r14
+
+ movq 16(%rsp),%r13
+
+ movq 24(%rsp),%r12
+
+ movq 32(%rsp),%rbx
+
+ movq 40(%rsp),%rbp
+
+ leaq 48(%rsp),%rsp
+
+L$sqr_epilogue:
+ ret
+
+
+
+
+.p2align 5
+__ecp_nistz256_sqr_montq:
+
+ movq %rax,%r13
+ mulq %r14
+ movq %rax,%r9
+ movq %r15,%rax
+ movq %rdx,%r10
+
+ mulq %r13
+ addq %rax,%r10
+ movq %r8,%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %r13
+ addq %rax,%r11
+ movq %r15,%rax
+ adcq $0,%rdx
+ movq %rdx,%r12
+
+
+ mulq %r14
+ addq %rax,%r11
+ movq %r8,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq %r14
+ addq %rax,%r12
+ movq %r8,%rax
+ adcq $0,%rdx
+ addq %rbp,%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+
+ mulq %r15
+ xorq %r15,%r15
+ addq %rax,%r13
+ movq 0(%rsi),%rax
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ addq %r9,%r9
+ adcq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq $0,%r15
+
+ mulq %rax
+ movq %rax,%r8
+ movq 8(%rsi),%rax
+ movq %rdx,%rcx
+
+ mulq %rax
+ addq %rcx,%r9
+ adcq %rax,%r10
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq %rax
+ addq %rcx,%r11
+ adcq %rax,%r12
+ movq 24(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq %rax
+ addq %rcx,%r13
+ adcq %rax,%r14
+ movq %r8,%rax
+ adcq %rdx,%r15
+
+ movq L$poly+8(%rip),%rsi
+ movq L$poly+24(%rip),%rbp
+
+
+
+
+ movq %r8,%rcx
+ shlq $32,%r8
+ mulq %rbp
+ shrq $32,%rcx
+ addq %r8,%r9
+ adcq %rcx,%r10
+ adcq %rax,%r11
+ movq %r9,%rax
+ adcq $0,%rdx
+
+
+
+ movq %r9,%rcx
+ shlq $32,%r9
+ movq %rdx,%r8
+ mulq %rbp
+ shrq $32,%rcx
+ addq %r9,%r10
+ adcq %rcx,%r11
+ adcq %rax,%r8
+ movq %r10,%rax
+ adcq $0,%rdx
+
+
+
+ movq %r10,%rcx
+ shlq $32,%r10
+ movq %rdx,%r9
+ mulq %rbp
+ shrq $32,%rcx
+ addq %r10,%r11
+ adcq %rcx,%r8
+ adcq %rax,%r9
+ movq %r11,%rax
+ adcq $0,%rdx
+
+
+
+ movq %r11,%rcx
+ shlq $32,%r11
+ movq %rdx,%r10
+ mulq %rbp
+ shrq $32,%rcx
+ addq %r11,%r8
+ adcq %rcx,%r9
+ adcq %rax,%r10
+ adcq $0,%rdx
+ xorq %r11,%r11
+
+
+
+ addq %r8,%r12
+ adcq %r9,%r13
+ movq %r12,%r8
+ adcq %r10,%r14
+ adcq %rdx,%r15
+ movq %r13,%r9
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r14,%r10
+ sbbq %rsi,%r13
+ sbbq $0,%r14
+ movq %r15,%rcx
+ sbbq %rbp,%r15
+ sbbq $0,%r11
+
+ cmovcq %r8,%r12
+ cmovcq %r9,%r13
+ movq %r12,0(%rdi)
+ cmovcq %r10,%r14
+ movq %r13,8(%rdi)
+ cmovcq %rcx,%r15
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+
+ ret
+
+
+
+.p2align 5
+__ecp_nistz256_mul_montx:
+
+
+
+ mulxq %r9,%r8,%r9
+ mulxq %r10,%rcx,%r10
+ movq $32,%r14
+ xorq %r13,%r13
+ mulxq %r11,%rbp,%r11
+ movq L$poly+24(%rip),%r15
+ adcq %rcx,%r9
+ mulxq %r12,%rcx,%r12
+ movq %r8,%rdx
+ adcq %rbp,%r10
+ shlxq %r14,%r8,%rbp
+ adcq %rcx,%r11
+ shrxq %r14,%r8,%rcx
+ adcq $0,%r12
+
+
+
+ addq %rbp,%r9
+ adcq %rcx,%r10
+
+ mulxq %r15,%rcx,%rbp
+ movq 8(%rbx),%rdx
+ adcq %rcx,%r11
+ adcq %rbp,%r12
+ adcq $0,%r13
+ xorq %r8,%r8
+
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r9,%rdx
+ adcxq %rcx,%r12
+ shlxq %r14,%r9,%rcx
+ adoxq %rbp,%r13
+ shrxq %r14,%r9,%rbp
+
+ adcxq %r8,%r13
+ adoxq %r8,%r8
+ adcq $0,%r8
+
+
+
+ addq %rcx,%r10
+ adcq %rbp,%r11
+
+ mulxq %r15,%rcx,%rbp
+ movq 16(%rbx),%rdx
+ adcq %rcx,%r12
+ adcq %rbp,%r13
+ adcq $0,%r8
+ xorq %r9,%r9
+
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r10,%rdx
+ adcxq %rcx,%r13
+ shlxq %r14,%r10,%rcx
+ adoxq %rbp,%r8
+ shrxq %r14,%r10,%rbp
+
+ adcxq %r9,%r8
+ adoxq %r9,%r9
+ adcq $0,%r9
+
+
+
+ addq %rcx,%r11
+ adcq %rbp,%r12
+
+ mulxq %r15,%rcx,%rbp
+ movq 24(%rbx),%rdx
+ adcq %rcx,%r13
+ adcq %rbp,%r8
+ adcq $0,%r9
+ xorq %r10,%r10
+
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r11,%rdx
+ adcxq %rcx,%r8
+ shlxq %r14,%r11,%rcx
+ adoxq %rbp,%r9
+ shrxq %r14,%r11,%rbp
+
+ adcxq %r10,%r9
+ adoxq %r10,%r10
+ adcq $0,%r10
+
+
+
+ addq %rcx,%r12
+ adcq %rbp,%r13
+
+ mulxq %r15,%rcx,%rbp
+ movq %r12,%rbx
+ movq L$poly+8(%rip),%r14
+ adcq %rcx,%r8
+ movq %r13,%rdx
+ adcq %rbp,%r9
+ adcq $0,%r10
+
+
+
+ xorl %eax,%eax
+ movq %r8,%rcx
+ sbbq $-1,%r12
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%rbp
+ sbbq %r15,%r9
+ sbbq $0,%r10
+
+ cmovcq %rbx,%r12
+ cmovcq %rdx,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %rbp,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ ret
+
+
+
+
+.p2align 5
+__ecp_nistz256_sqr_montx:
+
+ mulxq %r14,%r9,%r10
+ mulxq %r15,%rcx,%r11
+ xorl %eax,%eax
+ adcq %rcx,%r10
+ mulxq %r8,%rbp,%r12
+ movq %r14,%rdx
+ adcq %rbp,%r11
+ adcq $0,%r12
+ xorq %r13,%r13
+
+
+ mulxq %r15,%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq %r8,%rcx,%rbp
+ movq %r15,%rdx
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+ adcq $0,%r13
+
+
+ mulxq %r8,%rcx,%r14
+ movq 0+128(%rsi),%rdx
+ xorq %r15,%r15
+ adcxq %r9,%r9
+ adoxq %rcx,%r13
+ adcxq %r10,%r10
+ adoxq %r15,%r14
+
+ mulxq %rdx,%r8,%rbp
+ movq 8+128(%rsi),%rdx
+ adcxq %r11,%r11
+ adoxq %rbp,%r9
+ adcxq %r12,%r12
+ mulxq %rdx,%rcx,%rax
+ movq 16+128(%rsi),%rdx
+ adcxq %r13,%r13
+ adoxq %rcx,%r10
+ adcxq %r14,%r14
+.byte 0x67
+ mulxq %rdx,%rcx,%rbp
+ movq 24+128(%rsi),%rdx
+ adoxq %rax,%r11
+ adcxq %r15,%r15
+ adoxq %rcx,%r12
+ movq $32,%rsi
+ adoxq %rbp,%r13
+.byte 0x67,0x67
+ mulxq %rdx,%rcx,%rax
+ movq L$poly+24(%rip),%rdx
+ adoxq %rcx,%r14
+ shlxq %rsi,%r8,%rcx
+ adoxq %rax,%r15
+ shrxq %rsi,%r8,%rax
+ movq %rdx,%rbp
+
+
+ addq %rcx,%r9
+ adcq %rax,%r10
+
+ mulxq %r8,%rcx,%r8
+ adcq %rcx,%r11
+ shlxq %rsi,%r9,%rcx
+ adcq $0,%r8
+ shrxq %rsi,%r9,%rax
+
+
+ addq %rcx,%r10
+ adcq %rax,%r11
+
+ mulxq %r9,%rcx,%r9
+ adcq %rcx,%r8
+ shlxq %rsi,%r10,%rcx
+ adcq $0,%r9
+ shrxq %rsi,%r10,%rax
+
+
+ addq %rcx,%r11
+ adcq %rax,%r8
+
+ mulxq %r10,%rcx,%r10
+ adcq %rcx,%r9
+ shlxq %rsi,%r11,%rcx
+ adcq $0,%r10
+ shrxq %rsi,%r11,%rax
+
+
+ addq %rcx,%r8
+ adcq %rax,%r9
+
+ mulxq %r11,%rcx,%r11
+ adcq %rcx,%r10
+ adcq $0,%r11
+
+ xorq %rdx,%rdx
+ addq %r8,%r12
+ movq L$poly+8(%rip),%rsi
+ adcq %r9,%r13
+ movq %r12,%r8
+ adcq %r10,%r14
+ adcq %r11,%r15
+ movq %r13,%r9
+ adcq $0,%rdx
+
+ subq $-1,%r12
+ movq %r14,%r10
+ sbbq %rsi,%r13
+ sbbq $0,%r14
+ movq %r15,%r11
+ sbbq %rbp,%r15
+ sbbq $0,%rdx
+
+ cmovcq %r8,%r12
+ cmovcq %r9,%r13
+ movq %r12,0(%rdi)
+ cmovcq %r10,%r14
+ movq %r13,8(%rdi)
+ cmovcq %r11,%r15
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+
+ ret
+
+
+
+
+.globl _ecp_nistz256_select_w5
+.private_extern _ecp_nistz256_select_w5
+
+.p2align 5
+_ecp_nistz256_select_w5:
+
+_CET_ENDBR
+ leaq _OPENSSL_ia32cap_P(%rip),%rax
+ movq 8(%rax),%rax
+ testl $32,%eax
+ jnz L$avx2_select_w5
+ movdqa L$One(%rip),%xmm0
+ movd %edx,%xmm1
+
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+
+ movdqa %xmm0,%xmm8
+ pshufd $0,%xmm1,%xmm1
+
+ movq $16,%rax
+L$select_loop_sse_w5:
+
+ movdqa %xmm8,%xmm15
+ paddd %xmm0,%xmm8
+ pcmpeqd %xmm1,%xmm15
+
+ movdqa 0(%rsi),%xmm9
+ movdqa 16(%rsi),%xmm10
+ movdqa 32(%rsi),%xmm11
+ movdqa 48(%rsi),%xmm12
+ movdqa 64(%rsi),%xmm13
+ movdqa 80(%rsi),%xmm14
+ leaq 96(%rsi),%rsi
+
+ pand %xmm15,%xmm9
+ pand %xmm15,%xmm10
+ por %xmm9,%xmm2
+ pand %xmm15,%xmm11
+ por %xmm10,%xmm3
+ pand %xmm15,%xmm12
+ por %xmm11,%xmm4
+ pand %xmm15,%xmm13
+ por %xmm12,%xmm5
+ pand %xmm15,%xmm14
+ por %xmm13,%xmm6
+ por %xmm14,%xmm7
+
+ decq %rax
+ jnz L$select_loop_sse_w5
+
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+ movdqu %xmm4,32(%rdi)
+ movdqu %xmm5,48(%rdi)
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm7,80(%rdi)
+ ret
+
+L$SEH_end_ecp_nistz256_select_w5:
+
+
+
+
+.globl _ecp_nistz256_select_w7
+.private_extern _ecp_nistz256_select_w7
+
+.p2align 5
+_ecp_nistz256_select_w7:
+
+_CET_ENDBR
+ leaq _OPENSSL_ia32cap_P(%rip),%rax
+ movq 8(%rax),%rax
+ testl $32,%eax
+ jnz L$avx2_select_w7
+ movdqa L$One(%rip),%xmm8
+ movd %edx,%xmm1
+
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+
+ movdqa %xmm8,%xmm0
+ pshufd $0,%xmm1,%xmm1
+ movq $64,%rax
+
+L$select_loop_sse_w7:
+ movdqa %xmm8,%xmm15
+ paddd %xmm0,%xmm8
+ movdqa 0(%rsi),%xmm9
+ movdqa 16(%rsi),%xmm10
+ pcmpeqd %xmm1,%xmm15
+ movdqa 32(%rsi),%xmm11
+ movdqa 48(%rsi),%xmm12
+ leaq 64(%rsi),%rsi
+
+ pand %xmm15,%xmm9
+ pand %xmm15,%xmm10
+ por %xmm9,%xmm2
+ pand %xmm15,%xmm11
+ por %xmm10,%xmm3
+ pand %xmm15,%xmm12
+ por %xmm11,%xmm4
+ prefetcht0 255(%rsi)
+ por %xmm12,%xmm5
+
+ decq %rax
+ jnz L$select_loop_sse_w7
+
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+ movdqu %xmm4,32(%rdi)
+ movdqu %xmm5,48(%rdi)
+ ret
+
+L$SEH_end_ecp_nistz256_select_w7:
+
+
+
+
+.p2align 5
+ecp_nistz256_avx2_select_w5:
+
+L$avx2_select_w5:
+ vzeroupper
+ vmovdqa L$Two(%rip),%ymm0
+
+ vpxor %ymm2,%ymm2,%ymm2
+ vpxor %ymm3,%ymm3,%ymm3
+ vpxor %ymm4,%ymm4,%ymm4
+
+ vmovdqa L$One(%rip),%ymm5
+ vmovdqa L$Two(%rip),%ymm10
+
+ vmovd %edx,%xmm1
+ vpermd %ymm1,%ymm2,%ymm1
+
+ movq $8,%rax
+L$select_loop_avx2_w5:
+
+ vmovdqa 0(%rsi),%ymm6
+ vmovdqa 32(%rsi),%ymm7
+ vmovdqa 64(%rsi),%ymm8
+
+ vmovdqa 96(%rsi),%ymm11
+ vmovdqa 128(%rsi),%ymm12
+ vmovdqa 160(%rsi),%ymm13
+
+ vpcmpeqd %ymm1,%ymm5,%ymm9
+ vpcmpeqd %ymm1,%ymm10,%ymm14
+
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm10,%ymm10
+ leaq 192(%rsi),%rsi
+
+ vpand %ymm9,%ymm6,%ymm6
+ vpand %ymm9,%ymm7,%ymm7
+ vpand %ymm9,%ymm8,%ymm8
+ vpand %ymm14,%ymm11,%ymm11
+ vpand %ymm14,%ymm12,%ymm12
+ vpand %ymm14,%ymm13,%ymm13
+
+ vpxor %ymm6,%ymm2,%ymm2
+ vpxor %ymm7,%ymm3,%ymm3
+ vpxor %ymm8,%ymm4,%ymm4
+ vpxor %ymm11,%ymm2,%ymm2
+ vpxor %ymm12,%ymm3,%ymm3
+ vpxor %ymm13,%ymm4,%ymm4
+
+ decq %rax
+ jnz L$select_loop_avx2_w5
+
+ vmovdqu %ymm2,0(%rdi)
+ vmovdqu %ymm3,32(%rdi)
+ vmovdqu %ymm4,64(%rdi)
+ vzeroupper
+ ret
+
+L$SEH_end_ecp_nistz256_avx2_select_w5:
+
+
+
+
+.globl _ecp_nistz256_avx2_select_w7
+.private_extern _ecp_nistz256_avx2_select_w7
+
+.p2align 5
+_ecp_nistz256_avx2_select_w7:
+
+L$avx2_select_w7:
+_CET_ENDBR
+ vzeroupper
+ vmovdqa L$Three(%rip),%ymm0
+
+ vpxor %ymm2,%ymm2,%ymm2
+ vpxor %ymm3,%ymm3,%ymm3
+
+ vmovdqa L$One(%rip),%ymm4
+ vmovdqa L$Two(%rip),%ymm8
+ vmovdqa L$Three(%rip),%ymm12
+
+ vmovd %edx,%xmm1
+ vpermd %ymm1,%ymm2,%ymm1
+
+
+ movq $21,%rax
+L$select_loop_avx2_w7:
+
+ vmovdqa 0(%rsi),%ymm5
+ vmovdqa 32(%rsi),%ymm6
+
+ vmovdqa 64(%rsi),%ymm9
+ vmovdqa 96(%rsi),%ymm10
+
+ vmovdqa 128(%rsi),%ymm13
+ vmovdqa 160(%rsi),%ymm14
+
+ vpcmpeqd %ymm1,%ymm4,%ymm7
+ vpcmpeqd %ymm1,%ymm8,%ymm11
+ vpcmpeqd %ymm1,%ymm12,%ymm15
+
+ vpaddd %ymm0,%ymm4,%ymm4
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpaddd %ymm0,%ymm12,%ymm12
+ leaq 192(%rsi),%rsi
+
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+ vpand %ymm11,%ymm9,%ymm9
+ vpand %ymm11,%ymm10,%ymm10
+ vpand %ymm15,%ymm13,%ymm13
+ vpand %ymm15,%ymm14,%ymm14
+
+ vpxor %ymm5,%ymm2,%ymm2
+ vpxor %ymm6,%ymm3,%ymm3
+ vpxor %ymm9,%ymm2,%ymm2
+ vpxor %ymm10,%ymm3,%ymm3
+ vpxor %ymm13,%ymm2,%ymm2
+ vpxor %ymm14,%ymm3,%ymm3
+
+ decq %rax
+ jnz L$select_loop_avx2_w7
+
+
+ vmovdqa 0(%rsi),%ymm5
+ vmovdqa 32(%rsi),%ymm6
+
+ vpcmpeqd %ymm1,%ymm4,%ymm7
+
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+
+ vpxor %ymm5,%ymm2,%ymm2
+ vpxor %ymm6,%ymm3,%ymm3
+
+ vmovdqu %ymm2,0(%rdi)
+ vmovdqu %ymm3,32(%rdi)
+ vzeroupper
+ ret
+
+L$SEH_end_ecp_nistz256_avx2_select_w7:
+
+
+.p2align 5
+__ecp_nistz256_add_toq:
+
+ xorq %r11,%r11
+ addq 0(%rbx),%r12
+ adcq 8(%rbx),%r13
+ movq %r12,%rax
+ adcq 16(%rbx),%r8
+ adcq 24(%rbx),%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ ret
+
+
+
+
+.p2align 5
+__ecp_nistz256_sub_fromq:
+
+ subq 0(%rbx),%r12
+ sbbq 8(%rbx),%r13
+ movq %r12,%rax
+ sbbq 16(%rbx),%r8
+ sbbq 24(%rbx),%r9
+ movq %r13,%rbp
+ sbbq %r11,%r11
+
+ addq $-1,%r12
+ movq %r8,%rcx
+ adcq %r14,%r13
+ adcq $0,%r8
+ movq %r9,%r10
+ adcq %r15,%r9
+ testq %r11,%r11
+
+ cmovzq %rax,%r12
+ cmovzq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovzq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovzq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ ret
+
+
+
+
+.p2align 5
+__ecp_nistz256_subq:
+
+ subq %r12,%rax
+ sbbq %r13,%rbp
+ movq %rax,%r12
+ sbbq %r8,%rcx
+ sbbq %r9,%r10
+ movq %rbp,%r13
+ sbbq %r11,%r11
+
+ addq $-1,%rax
+ movq %rcx,%r8
+ adcq %r14,%rbp
+ adcq $0,%rcx
+ movq %r10,%r9
+ adcq %r15,%r10
+ testq %r11,%r11
+
+ cmovnzq %rax,%r12
+ cmovnzq %rbp,%r13
+ cmovnzq %rcx,%r8
+ cmovnzq %r10,%r9
+
+ ret
+
+
+
+
+.p2align 5
+__ecp_nistz256_mul_by_2q:
+
+ xorq %r11,%r11
+ addq %r12,%r12
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ ret
+
+
+.globl _ecp_nistz256_point_double
+.private_extern _ecp_nistz256_point_double
+
+.p2align 5
+_ecp_nistz256_point_double:
+
+_CET_ENDBR
+ leaq _OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ je L$point_doublex
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ subq $160+8,%rsp
+
+L$point_doubleq_body:
+
+L$point_double_shortcutq:
+ movdqu 0(%rsi),%xmm0
+ movq %rsi,%rbx
+ movdqu 16(%rsi),%xmm1
+ movq 32+0(%rsi),%r12
+ movq 32+8(%rsi),%r13
+ movq 32+16(%rsi),%r8
+ movq 32+24(%rsi),%r9
+ movq L$poly+8(%rip),%r14
+ movq L$poly+24(%rip),%r15
+ movdqa %xmm0,96(%rsp)
+ movdqa %xmm1,96+16(%rsp)
+ leaq 32(%rdi),%r10
+ leaq 64(%rdi),%r11
+.byte 102,72,15,110,199
+.byte 102,73,15,110,202
+.byte 102,73,15,110,211
+
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2q
+
+ movq 64+0(%rsi),%rax
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ leaq 64-0(%rsi),%rsi
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 0+0(%rsp),%rax
+ movq 8+0(%rsp),%r14
+ leaq 0+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 32(%rbx),%rax
+ movq 64+0(%rbx),%r9
+ movq 64+8(%rbx),%r10
+ movq 64+16(%rbx),%r11
+ movq 64+24(%rbx),%r12
+ leaq 64-0(%rbx),%rsi
+ leaq 32(%rbx),%rbx
+.byte 102,72,15,126,215
+ call __ecp_nistz256_mul_montq
+ call __ecp_nistz256_mul_by_2q
+
+ movq 96+0(%rsp),%r12
+ movq 96+8(%rsp),%r13
+ leaq 64(%rsp),%rbx
+ movq 96+16(%rsp),%r8
+ movq 96+24(%rsp),%r9
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_add_toq
+
+ movq 96+0(%rsp),%r12
+ movq 96+8(%rsp),%r13
+ leaq 64(%rsp),%rbx
+ movq 96+16(%rsp),%r8
+ movq 96+24(%rsp),%r9
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ movq 0+0(%rsp),%rax
+ movq 8+0(%rsp),%r14
+ leaq 0+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+.byte 102,72,15,126,207
+ call __ecp_nistz256_sqr_montq
+ xorq %r9,%r9
+ movq %r12,%rax
+ addq $-1,%r12
+ movq %r13,%r10
+ adcq %rsi,%r13
+ movq %r14,%rcx
+ adcq $0,%r14
+ movq %r15,%r8
+ adcq %rbp,%r15
+ adcq $0,%r9
+ xorq %rsi,%rsi
+ testq $1,%rax
+
+ cmovzq %rax,%r12
+ cmovzq %r10,%r13
+ cmovzq %rcx,%r14
+ cmovzq %r8,%r15
+ cmovzq %rsi,%r9
+
+ movq %r13,%rax
+ shrq $1,%r12
+ shlq $63,%rax
+ movq %r14,%r10
+ shrq $1,%r13
+ orq %rax,%r12
+ shlq $63,%r10
+ movq %r15,%rcx
+ shrq $1,%r14
+ orq %r10,%r13
+ shlq $63,%rcx
+ movq %r12,0(%rdi)
+ shrq $1,%r15
+ movq %r13,8(%rdi)
+ shlq $63,%r9
+ orq %rcx,%r14
+ orq %r9,%r15
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ movq 64(%rsp),%rax
+ leaq 64(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2q
+
+ leaq 32(%rsp),%rbx
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_add_toq
+
+ movq 96(%rsp),%rax
+ leaq 96(%rsp),%rbx
+ movq 0+0(%rsp),%r9
+ movq 8+0(%rsp),%r10
+ leaq 0+0(%rsp),%rsi
+ movq 16+0(%rsp),%r11
+ movq 24+0(%rsp),%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2q
+
+ movq 0+32(%rsp),%rax
+ movq 8+32(%rsp),%r14
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r15
+ movq 24+32(%rsp),%r8
+.byte 102,72,15,126,199
+ call __ecp_nistz256_sqr_montq
+
+ leaq 128(%rsp),%rbx
+ movq %r14,%r8
+ movq %r15,%r9
+ movq %rsi,%r14
+ movq %rbp,%r15
+ call __ecp_nistz256_sub_fromq
+
+ movq 0+0(%rsp),%rax
+ movq 0+8(%rsp),%rbp
+ movq 0+16(%rsp),%rcx
+ movq 0+24(%rsp),%r10
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_subq
+
+ movq 32(%rsp),%rax
+ leaq 32(%rsp),%rbx
+ movq %r12,%r14
+ xorl %ecx,%ecx
+ movq %r12,0+0(%rsp)
+ movq %r13,%r10
+ movq %r13,0+8(%rsp)
+ cmovzq %r8,%r11
+ movq %r8,0+16(%rsp)
+ leaq 0-0(%rsp),%rsi
+ cmovzq %r9,%r12
+ movq %r9,0+24(%rsp)
+ movq %r14,%r9
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+.byte 102,72,15,126,203
+.byte 102,72,15,126,207
+ call __ecp_nistz256_sub_fromq
+
+ leaq 160+56(%rsp),%rsi
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbx
+
+ movq -8(%rsi),%rbp
+
+ leaq (%rsi),%rsp
+
+L$point_doubleq_epilogue:
+ ret
+
+
+.globl _ecp_nistz256_point_add
+.private_extern _ecp_nistz256_point_add
+
+.p2align 5
+_ecp_nistz256_point_add:
+
+_CET_ENDBR
+ leaq _OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ je L$point_addx
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ subq $576+8,%rsp
+
+L$point_addq_body:
+
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqu 64(%rsi),%xmm4
+ movdqu 80(%rsi),%xmm5
+ movq %rsi,%rbx
+ movq %rdx,%rsi
+ movdqa %xmm0,384(%rsp)
+ movdqa %xmm1,384+16(%rsp)
+ movdqa %xmm2,416(%rsp)
+ movdqa %xmm3,416+16(%rsp)
+ movdqa %xmm4,448(%rsp)
+ movdqa %xmm5,448+16(%rsp)
+ por %xmm4,%xmm5
+
+ movdqu 0(%rsi),%xmm0
+ pshufd $0xb1,%xmm5,%xmm3
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ por %xmm3,%xmm5
+ movdqu 48(%rsi),%xmm3
+ movq 64+0(%rsi),%rax
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ movdqa %xmm0,480(%rsp)
+ pshufd $0x1e,%xmm5,%xmm4
+ movdqa %xmm1,480+16(%rsp)
+ movdqu 64(%rsi),%xmm0
+ movdqu 80(%rsi),%xmm1
+ movdqa %xmm2,512(%rsp)
+ movdqa %xmm3,512+16(%rsp)
+ por %xmm4,%xmm5
+ pxor %xmm4,%xmm4
+ por %xmm0,%xmm1
+.byte 102,72,15,110,199
+
+ leaq 64-0(%rsi),%rsi
+ movq %rax,544+0(%rsp)
+ movq %r14,544+8(%rsp)
+ movq %r15,544+16(%rsp)
+ movq %r8,544+24(%rsp)
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ pcmpeqd %xmm4,%xmm5
+ pshufd $0xb1,%xmm1,%xmm4
+ por %xmm1,%xmm4
+ pshufd $0,%xmm5,%xmm5
+ pshufd $0x1e,%xmm4,%xmm3
+ por %xmm3,%xmm4
+ pxor %xmm3,%xmm3
+ pcmpeqd %xmm3,%xmm4
+ pshufd $0,%xmm4,%xmm4
+ movq 64+0(%rbx),%rax
+ movq 64+8(%rbx),%r14
+ movq 64+16(%rbx),%r15
+ movq 64+24(%rbx),%r8
+.byte 102,72,15,110,203
+
+ leaq 64-0(%rbx),%rsi
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 544(%rsp),%rax
+ leaq 544(%rsp),%rbx
+ movq 0+96(%rsp),%r9
+ movq 8+96(%rsp),%r10
+ leaq 0+96(%rsp),%rsi
+ movq 16+96(%rsp),%r11
+ movq 24+96(%rsp),%r12
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 448(%rsp),%rax
+ leaq 448(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 416(%rsp),%rax
+ leaq 416(%rsp),%rbx
+ movq 0+224(%rsp),%r9
+ movq 8+224(%rsp),%r10
+ leaq 0+224(%rsp),%rsi
+ movq 16+224(%rsp),%r11
+ movq 24+224(%rsp),%r12
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 512(%rsp),%rax
+ leaq 512(%rsp),%rbx
+ movq 0+256(%rsp),%r9
+ movq 8+256(%rsp),%r10
+ leaq 0+256(%rsp),%rsi
+ movq 16+256(%rsp),%r11
+ movq 24+256(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 224(%rsp),%rbx
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ orq %r13,%r12
+ movdqa %xmm4,%xmm2
+ orq %r8,%r12
+ orq %r9,%r12
+ por %xmm5,%xmm2
+.byte 102,73,15,110,220
+
+ movq 384(%rsp),%rax
+ leaq 384(%rsp),%rbx
+ movq 0+96(%rsp),%r9
+ movq 8+96(%rsp),%r10
+ leaq 0+96(%rsp),%rsi
+ movq 16+96(%rsp),%r11
+ movq 24+96(%rsp),%r12
+ leaq 160(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 480(%rsp),%rax
+ leaq 480(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 160(%rsp),%rbx
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ orq %r13,%r12
+ orq %r8,%r12
+ orq %r9,%r12
+
+.byte 102,73,15,126,208
+.byte 102,73,15,126,217
+ orq %r8,%r12
+.byte 0x3e
+ jnz L$add_proceedq
+
+
+
+ testq %r9,%r9
+ jz L$add_doubleq
+
+
+
+
+
+
+.byte 102,72,15,126,199
+ pxor %xmm0,%xmm0
+ movdqu %xmm0,0(%rdi)
+ movdqu %xmm0,16(%rdi)
+ movdqu %xmm0,32(%rdi)
+ movdqu %xmm0,48(%rdi)
+ movdqu %xmm0,64(%rdi)
+ movdqu %xmm0,80(%rdi)
+ jmp L$add_doneq
+
+.p2align 5
+L$add_doubleq:
+.byte 102,72,15,126,206
+.byte 102,72,15,126,199
+ addq $416,%rsp
+
+ jmp L$point_double_shortcutq
+
+
+.p2align 5
+L$add_proceedq:
+ movq 0+64(%rsp),%rax
+ movq 8+64(%rsp),%r14
+ leaq 0+64(%rsp),%rsi
+ movq 16+64(%rsp),%r15
+ movq 24+64(%rsp),%r8
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 448(%rsp),%rax
+ leaq 448(%rsp),%rbx
+ movq 0+0(%rsp),%r9
+ movq 8+0(%rsp),%r10
+ leaq 0+0(%rsp),%rsi
+ movq 16+0(%rsp),%r11
+ movq 24+0(%rsp),%r12
+ leaq 352(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 0+0(%rsp),%rax
+ movq 8+0(%rsp),%r14
+ leaq 0+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 544(%rsp),%rax
+ leaq 544(%rsp),%rbx
+ movq 0+352(%rsp),%r9
+ movq 8+352(%rsp),%r10
+ leaq 0+352(%rsp),%rsi
+ movq 16+352(%rsp),%r11
+ movq 24+352(%rsp),%r12
+ leaq 352(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 0(%rsp),%rax
+ leaq 0(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 160(%rsp),%rax
+ leaq 160(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+
+
+
+ xorq %r11,%r11
+ addq %r12,%r12
+ leaq 96(%rsp),%rsi
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ movq 0(%rsi),%rax
+ cmovcq %rbp,%r13
+ movq 8(%rsi),%rbp
+ cmovcq %rcx,%r8
+ movq 16(%rsi),%rcx
+ cmovcq %r10,%r9
+ movq 24(%rsi),%r10
+
+ call __ecp_nistz256_subq
+
+ leaq 128(%rsp),%rbx
+ leaq 288(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ movq 192+0(%rsp),%rax
+ movq 192+8(%rsp),%rbp
+ movq 192+16(%rsp),%rcx
+ movq 192+24(%rsp),%r10
+ leaq 320(%rsp),%rdi
+
+ call __ecp_nistz256_subq
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+ movq 128(%rsp),%rax
+ leaq 128(%rsp),%rbx
+ movq 0+224(%rsp),%r9
+ movq 8+224(%rsp),%r10
+ leaq 0+224(%rsp),%rsi
+ movq 16+224(%rsp),%r11
+ movq 24+224(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 320(%rsp),%rax
+ leaq 320(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq 0+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 320(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 256(%rsp),%rbx
+ leaq 320(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+.byte 102,72,15,126,199
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 352(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 352+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 544(%rsp),%xmm2
+ pand 544+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 448(%rsp),%xmm2
+ pand 448+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,64(%rdi)
+ movdqu %xmm3,80(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 288(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 288+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 480(%rsp),%xmm2
+ pand 480+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 384(%rsp),%xmm2
+ pand 384+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 320(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 320+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 512(%rsp),%xmm2
+ pand 512+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 416(%rsp),%xmm2
+ pand 416+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+
+L$add_doneq:
+ leaq 576+56(%rsp),%rsi
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbx
+
+ movq -8(%rsi),%rbp
+
+ leaq (%rsi),%rsp
+
+L$point_addq_epilogue:
+ ret
+
+
+.globl _ecp_nistz256_point_add_affine
+.private_extern _ecp_nistz256_point_add_affine
+
+.p2align 5
+_ecp_nistz256_point_add_affine:
+
+_CET_ENDBR
+ leaq _OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ je L$point_add_affinex
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ subq $480+8,%rsp
+
+L$add_affineq_body:
+
+ movdqu 0(%rsi),%xmm0
+ movq %rdx,%rbx
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqu 64(%rsi),%xmm4
+ movdqu 80(%rsi),%xmm5
+ movq 64+0(%rsi),%rax
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ movdqa %xmm0,320(%rsp)
+ movdqa %xmm1,320+16(%rsp)
+ movdqa %xmm2,352(%rsp)
+ movdqa %xmm3,352+16(%rsp)
+ movdqa %xmm4,384(%rsp)
+ movdqa %xmm5,384+16(%rsp)
+ por %xmm4,%xmm5
+
+ movdqu 0(%rbx),%xmm0
+ pshufd $0xb1,%xmm5,%xmm3
+ movdqu 16(%rbx),%xmm1
+ movdqu 32(%rbx),%xmm2
+ por %xmm3,%xmm5
+ movdqu 48(%rbx),%xmm3
+ movdqa %xmm0,416(%rsp)
+ pshufd $0x1e,%xmm5,%xmm4
+ movdqa %xmm1,416+16(%rsp)
+ por %xmm0,%xmm1
+.byte 102,72,15,110,199
+ movdqa %xmm2,448(%rsp)
+ movdqa %xmm3,448+16(%rsp)
+ por %xmm2,%xmm3
+ por %xmm4,%xmm5
+ pxor %xmm4,%xmm4
+ por %xmm1,%xmm3
+
+ leaq 64-0(%rsi),%rsi
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ pcmpeqd %xmm4,%xmm5
+ pshufd $0xb1,%xmm3,%xmm4
+ movq 0(%rbx),%rax
+
+ movq %r12,%r9
+ por %xmm3,%xmm4
+ pshufd $0,%xmm5,%xmm5
+ pshufd $0x1e,%xmm4,%xmm3
+ movq %r13,%r10
+ por %xmm3,%xmm4
+ pxor %xmm3,%xmm3
+ movq %r14,%r11
+ pcmpeqd %xmm3,%xmm4
+ pshufd $0,%xmm4,%xmm4
+
+ leaq 32-0(%rsp),%rsi
+ movq %r15,%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 320(%rsp),%rbx
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ movq 384(%rsp),%rax
+ leaq 384(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 384(%rsp),%rax
+ leaq 384(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq 0+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 288(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 448(%rsp),%rax
+ leaq 448(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 352(%rsp),%rbx
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ movq 0+64(%rsp),%rax
+ movq 8+64(%rsp),%r14
+ leaq 0+64(%rsp),%rsi
+ movq 16+64(%rsp),%r15
+ movq 24+64(%rsp),%r8
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 0+96(%rsp),%rax
+ movq 8+96(%rsp),%r14
+ leaq 0+96(%rsp),%rsi
+ movq 16+96(%rsp),%r15
+ movq 24+96(%rsp),%r8
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 128(%rsp),%rax
+ leaq 128(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq 0+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 160(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 320(%rsp),%rax
+ leaq 320(%rsp),%rbx
+ movq 0+128(%rsp),%r9
+ movq 8+128(%rsp),%r10
+ leaq 0+128(%rsp),%rsi
+ movq 16+128(%rsp),%r11
+ movq 24+128(%rsp),%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+
+
+
+ xorq %r11,%r11
+ addq %r12,%r12
+ leaq 192(%rsp),%rsi
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ movq 0(%rsi),%rax
+ cmovcq %rbp,%r13
+ movq 8(%rsi),%rbp
+ cmovcq %rcx,%r8
+ movq 16(%rsi),%rcx
+ cmovcq %r10,%r9
+ movq 24(%rsi),%r10
+
+ call __ecp_nistz256_subq
+
+ leaq 160(%rsp),%rbx
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ movq 0+0(%rsp),%rax
+ movq 0+8(%rsp),%rbp
+ movq 0+16(%rsp),%rcx
+ movq 0+24(%rsp),%r10
+ leaq 64(%rsp),%rdi
+
+ call __ecp_nistz256_subq
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+ movq 352(%rsp),%rax
+ leaq 352(%rsp),%rbx
+ movq 0+160(%rsp),%r9
+ movq 8+160(%rsp),%r10
+ leaq 0+160(%rsp),%rsi
+ movq 16+160(%rsp),%r11
+ movq 24+160(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 96(%rsp),%rax
+ leaq 96(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq 0+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 32(%rsp),%rbx
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+.byte 102,72,15,126,199
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 288(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 288+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand L$ONE_mont(%rip),%xmm2
+ pand L$ONE_mont+16(%rip),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 384(%rsp),%xmm2
+ pand 384+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,64(%rdi)
+ movdqu %xmm3,80(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 224(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 224+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 416(%rsp),%xmm2
+ pand 416+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 320(%rsp),%xmm2
+ pand 320+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 256(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 256+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 448(%rsp),%xmm2
+ pand 448+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 352(%rsp),%xmm2
+ pand 352+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+
+ leaq 480+56(%rsp),%rsi
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbx
+
+ movq -8(%rsi),%rbp
+
+ leaq (%rsi),%rsp
+
+L$add_affineq_epilogue:
+ ret
+
+
+
+.p2align 5
+__ecp_nistz256_add_tox:
+
+ xorq %r11,%r11
+ adcq 0(%rbx),%r12
+ adcq 8(%rbx),%r13
+ movq %r12,%rax
+ adcq 16(%rbx),%r8
+ adcq 24(%rbx),%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ xorq %r10,%r10
+ sbbq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ ret
+
+
+
+
+.p2align 5
+__ecp_nistz256_sub_fromx:
+
+ xorq %r11,%r11
+ sbbq 0(%rbx),%r12
+ sbbq 8(%rbx),%r13
+ movq %r12,%rax
+ sbbq 16(%rbx),%r8
+ sbbq 24(%rbx),%r9
+ movq %r13,%rbp
+ sbbq $0,%r11
+
+ xorq %r10,%r10
+ adcq $-1,%r12
+ movq %r8,%rcx
+ adcq %r14,%r13
+ adcq $0,%r8
+ movq %r9,%r10
+ adcq %r15,%r9
+
+ btq $0,%r11
+ cmovncq %rax,%r12
+ cmovncq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovncq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovncq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ ret
+
+
+
+
+.p2align 5
+__ecp_nistz256_subx:
+
+ xorq %r11,%r11
+ sbbq %r12,%rax
+ sbbq %r13,%rbp
+ movq %rax,%r12
+ sbbq %r8,%rcx
+ sbbq %r9,%r10
+ movq %rbp,%r13
+ sbbq $0,%r11
+
+ xorq %r9,%r9
+ adcq $-1,%rax
+ movq %rcx,%r8
+ adcq %r14,%rbp
+ adcq $0,%rcx
+ movq %r10,%r9
+ adcq %r15,%r10
+
+ btq $0,%r11
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ cmovcq %rcx,%r8
+ cmovcq %r10,%r9
+
+ ret
+
+
+
+
+.p2align 5
+__ecp_nistz256_mul_by_2x:
+
+ xorq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ xorq %r10,%r10
+ sbbq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ ret
+
+
+
+.p2align 5
+ecp_nistz256_point_doublex:
+
+L$point_doublex:
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ subq $160+8,%rsp
+
+L$point_doublex_body:
+
+L$point_double_shortcutx:
+ movdqu 0(%rsi),%xmm0
+ movq %rsi,%rbx
+ movdqu 16(%rsi),%xmm1
+ movq 32+0(%rsi),%r12
+ movq 32+8(%rsi),%r13
+ movq 32+16(%rsi),%r8
+ movq 32+24(%rsi),%r9
+ movq L$poly+8(%rip),%r14
+ movq L$poly+24(%rip),%r15
+ movdqa %xmm0,96(%rsp)
+ movdqa %xmm1,96+16(%rsp)
+ leaq 32(%rdi),%r10
+ leaq 64(%rdi),%r11
+.byte 102,72,15,110,199
+.byte 102,73,15,110,202
+.byte 102,73,15,110,211
+
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2x
+
+ movq 64+0(%rsi),%rdx
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ leaq 64-128(%rsi),%rsi
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 0+0(%rsp),%rdx
+ movq 8+0(%rsp),%r14
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 32(%rbx),%rdx
+ movq 64+0(%rbx),%r9
+ movq 64+8(%rbx),%r10
+ movq 64+16(%rbx),%r11
+ movq 64+24(%rbx),%r12
+ leaq 64-128(%rbx),%rsi
+ leaq 32(%rbx),%rbx
+.byte 102,72,15,126,215
+ call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_by_2x
+
+ movq 96+0(%rsp),%r12
+ movq 96+8(%rsp),%r13
+ leaq 64(%rsp),%rbx
+ movq 96+16(%rsp),%r8
+ movq 96+24(%rsp),%r9
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_add_tox
+
+ movq 96+0(%rsp),%r12
+ movq 96+8(%rsp),%r13
+ leaq 64(%rsp),%rbx
+ movq 96+16(%rsp),%r8
+ movq 96+24(%rsp),%r9
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+0(%rsp),%rdx
+ movq 8+0(%rsp),%r14
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+.byte 102,72,15,126,207
+ call __ecp_nistz256_sqr_montx
+ xorq %r9,%r9
+ movq %r12,%rax
+ addq $-1,%r12
+ movq %r13,%r10
+ adcq %rsi,%r13
+ movq %r14,%rcx
+ adcq $0,%r14
+ movq %r15,%r8
+ adcq %rbp,%r15
+ adcq $0,%r9
+ xorq %rsi,%rsi
+ testq $1,%rax
+
+ cmovzq %rax,%r12
+ cmovzq %r10,%r13
+ cmovzq %rcx,%r14
+ cmovzq %r8,%r15
+ cmovzq %rsi,%r9
+
+ movq %r13,%rax
+ shrq $1,%r12
+ shlq $63,%rax
+ movq %r14,%r10
+ shrq $1,%r13
+ orq %rax,%r12
+ shlq $63,%r10
+ movq %r15,%rcx
+ shrq $1,%r14
+ orq %r10,%r13
+ shlq $63,%rcx
+ movq %r12,0(%rdi)
+ shrq $1,%r15
+ movq %r13,8(%rdi)
+ shlq $63,%r9
+ orq %rcx,%r14
+ orq %r9,%r15
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ movq 64(%rsp),%rdx
+ leaq 64(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2x
+
+ leaq 32(%rsp),%rbx
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_add_tox
+
+ movq 96(%rsp),%rdx
+ leaq 96(%rsp),%rbx
+ movq 0+0(%rsp),%r9
+ movq 8+0(%rsp),%r10
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r11
+ movq 24+0(%rsp),%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2x
+
+ movq 0+32(%rsp),%rdx
+ movq 8+32(%rsp),%r14
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r15
+ movq 24+32(%rsp),%r8
+.byte 102,72,15,126,199
+ call __ecp_nistz256_sqr_montx
+
+ leaq 128(%rsp),%rbx
+ movq %r14,%r8
+ movq %r15,%r9
+ movq %rsi,%r14
+ movq %rbp,%r15
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+0(%rsp),%rax
+ movq 0+8(%rsp),%rbp
+ movq 0+16(%rsp),%rcx
+ movq 0+24(%rsp),%r10
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_subx
+
+ movq 32(%rsp),%rdx
+ leaq 32(%rsp),%rbx
+ movq %r12,%r14
+ xorl %ecx,%ecx
+ movq %r12,0+0(%rsp)
+ movq %r13,%r10
+ movq %r13,0+8(%rsp)
+ cmovzq %r8,%r11
+ movq %r8,0+16(%rsp)
+ leaq 0-128(%rsp),%rsi
+ cmovzq %r9,%r12
+ movq %r9,0+24(%rsp)
+ movq %r14,%r9
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+.byte 102,72,15,126,203
+.byte 102,72,15,126,207
+ call __ecp_nistz256_sub_fromx
+
+ leaq 160+56(%rsp),%rsi
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbx
+
+ movq -8(%rsi),%rbp
+
+ leaq (%rsi),%rsp
+
+L$point_doublex_epilogue:
+ ret
+
+
+
+.p2align 5
+ecp_nistz256_point_addx:
+
+L$point_addx:
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ subq $576+8,%rsp
+
+L$point_addx_body:
+
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqu 64(%rsi),%xmm4
+ movdqu 80(%rsi),%xmm5
+ movq %rsi,%rbx
+ movq %rdx,%rsi
+ movdqa %xmm0,384(%rsp)
+ movdqa %xmm1,384+16(%rsp)
+ movdqa %xmm2,416(%rsp)
+ movdqa %xmm3,416+16(%rsp)
+ movdqa %xmm4,448(%rsp)
+ movdqa %xmm5,448+16(%rsp)
+ por %xmm4,%xmm5
+
+ movdqu 0(%rsi),%xmm0
+ pshufd $0xb1,%xmm5,%xmm3
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ por %xmm3,%xmm5
+ movdqu 48(%rsi),%xmm3
+ movq 64+0(%rsi),%rdx
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ movdqa %xmm0,480(%rsp)
+ pshufd $0x1e,%xmm5,%xmm4
+ movdqa %xmm1,480+16(%rsp)
+ movdqu 64(%rsi),%xmm0
+ movdqu 80(%rsi),%xmm1
+ movdqa %xmm2,512(%rsp)
+ movdqa %xmm3,512+16(%rsp)
+ por %xmm4,%xmm5
+ pxor %xmm4,%xmm4
+ por %xmm0,%xmm1
+.byte 102,72,15,110,199
+
+ leaq 64-128(%rsi),%rsi
+ movq %rdx,544+0(%rsp)
+ movq %r14,544+8(%rsp)
+ movq %r15,544+16(%rsp)
+ movq %r8,544+24(%rsp)
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ pcmpeqd %xmm4,%xmm5
+ pshufd $0xb1,%xmm1,%xmm4
+ por %xmm1,%xmm4
+ pshufd $0,%xmm5,%xmm5
+ pshufd $0x1e,%xmm4,%xmm3
+ por %xmm3,%xmm4
+ pxor %xmm3,%xmm3
+ pcmpeqd %xmm3,%xmm4
+ pshufd $0,%xmm4,%xmm4
+ movq 64+0(%rbx),%rdx
+ movq 64+8(%rbx),%r14
+ movq 64+16(%rbx),%r15
+ movq 64+24(%rbx),%r8
+.byte 102,72,15,110,203
+
+ leaq 64-128(%rbx),%rsi
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 544(%rsp),%rdx
+ leaq 544(%rsp),%rbx
+ movq 0+96(%rsp),%r9
+ movq 8+96(%rsp),%r10
+ leaq -128+96(%rsp),%rsi
+ movq 16+96(%rsp),%r11
+ movq 24+96(%rsp),%r12
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 448(%rsp),%rdx
+ leaq 448(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 416(%rsp),%rdx
+ leaq 416(%rsp),%rbx
+ movq 0+224(%rsp),%r9
+ movq 8+224(%rsp),%r10
+ leaq -128+224(%rsp),%rsi
+ movq 16+224(%rsp),%r11
+ movq 24+224(%rsp),%r12
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 512(%rsp),%rdx
+ leaq 512(%rsp),%rbx
+ movq 0+256(%rsp),%r9
+ movq 8+256(%rsp),%r10
+ leaq -128+256(%rsp),%rsi
+ movq 16+256(%rsp),%r11
+ movq 24+256(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 224(%rsp),%rbx
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ orq %r13,%r12
+ movdqa %xmm4,%xmm2
+ orq %r8,%r12
+ orq %r9,%r12
+ por %xmm5,%xmm2
+.byte 102,73,15,110,220
+
+ movq 384(%rsp),%rdx
+ leaq 384(%rsp),%rbx
+ movq 0+96(%rsp),%r9
+ movq 8+96(%rsp),%r10
+ leaq -128+96(%rsp),%rsi
+ movq 16+96(%rsp),%r11
+ movq 24+96(%rsp),%r12
+ leaq 160(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 480(%rsp),%rdx
+ leaq 480(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 160(%rsp),%rbx
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ orq %r13,%r12
+ orq %r8,%r12
+ orq %r9,%r12
+
+.byte 102,73,15,126,208
+.byte 102,73,15,126,217
+ orq %r8,%r12
+.byte 0x3e
+ jnz L$add_proceedx
+
+
+
+ testq %r9,%r9
+ jz L$add_doublex
+
+
+
+
+
+
+.byte 102,72,15,126,199
+ pxor %xmm0,%xmm0
+ movdqu %xmm0,0(%rdi)
+ movdqu %xmm0,16(%rdi)
+ movdqu %xmm0,32(%rdi)
+ movdqu %xmm0,48(%rdi)
+ movdqu %xmm0,64(%rdi)
+ movdqu %xmm0,80(%rdi)
+ jmp L$add_donex
+
+.p2align 5
+L$add_doublex:
+.byte 102,72,15,126,206
+.byte 102,72,15,126,199
+ addq $416,%rsp
+
+ jmp L$point_double_shortcutx
+
+
+.p2align 5
+L$add_proceedx:
+ movq 0+64(%rsp),%rdx
+ movq 8+64(%rsp),%r14
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r15
+ movq 24+64(%rsp),%r8
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 448(%rsp),%rdx
+ leaq 448(%rsp),%rbx
+ movq 0+0(%rsp),%r9
+ movq 8+0(%rsp),%r10
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r11
+ movq 24+0(%rsp),%r12
+ leaq 352(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 0+0(%rsp),%rdx
+ movq 8+0(%rsp),%r14
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 544(%rsp),%rdx
+ leaq 544(%rsp),%rbx
+ movq 0+352(%rsp),%r9
+ movq 8+352(%rsp),%r10
+ leaq -128+352(%rsp),%rsi
+ movq 16+352(%rsp),%r11
+ movq 24+352(%rsp),%r12
+ leaq 352(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 0(%rsp),%rdx
+ leaq 0(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 160(%rsp),%rdx
+ leaq 160(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+
+
+
+ xorq %r11,%r11
+ addq %r12,%r12
+ leaq 96(%rsp),%rsi
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ movq 0(%rsi),%rax
+ cmovcq %rbp,%r13
+ movq 8(%rsi),%rbp
+ cmovcq %rcx,%r8
+ movq 16(%rsi),%rcx
+ cmovcq %r10,%r9
+ movq 24(%rsi),%r10
+
+ call __ecp_nistz256_subx
+
+ leaq 128(%rsp),%rbx
+ leaq 288(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 192+0(%rsp),%rax
+ movq 192+8(%rsp),%rbp
+ movq 192+16(%rsp),%rcx
+ movq 192+24(%rsp),%r10
+ leaq 320(%rsp),%rdi
+
+ call __ecp_nistz256_subx
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+ movq 128(%rsp),%rdx
+ leaq 128(%rsp),%rbx
+ movq 0+224(%rsp),%r9
+ movq 8+224(%rsp),%r10
+ leaq -128+224(%rsp),%rsi
+ movq 16+224(%rsp),%r11
+ movq 24+224(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 320(%rsp),%rdx
+ leaq 320(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 320(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 256(%rsp),%rbx
+ leaq 320(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+.byte 102,72,15,126,199
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 352(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 352+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 544(%rsp),%xmm2
+ pand 544+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 448(%rsp),%xmm2
+ pand 448+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,64(%rdi)
+ movdqu %xmm3,80(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 288(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 288+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 480(%rsp),%xmm2
+ pand 480+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 384(%rsp),%xmm2
+ pand 384+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 320(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 320+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 512(%rsp),%xmm2
+ pand 512+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 416(%rsp),%xmm2
+ pand 416+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+
+L$add_donex:
+ leaq 576+56(%rsp),%rsi
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbx
+
+ movq -8(%rsi),%rbp
+
+ leaq (%rsi),%rsp
+
+L$point_addx_epilogue:
+ ret
+
+
+
+.p2align 5
+ecp_nistz256_point_add_affinex:
+
+L$point_add_affinex:
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ subq $480+8,%rsp
+
+L$add_affinex_body:
+
+ movdqu 0(%rsi),%xmm0
+ movq %rdx,%rbx
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqu 64(%rsi),%xmm4
+ movdqu 80(%rsi),%xmm5
+ movq 64+0(%rsi),%rdx
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ movdqa %xmm0,320(%rsp)
+ movdqa %xmm1,320+16(%rsp)
+ movdqa %xmm2,352(%rsp)
+ movdqa %xmm3,352+16(%rsp)
+ movdqa %xmm4,384(%rsp)
+ movdqa %xmm5,384+16(%rsp)
+ por %xmm4,%xmm5
+
+ movdqu 0(%rbx),%xmm0
+ pshufd $0xb1,%xmm5,%xmm3
+ movdqu 16(%rbx),%xmm1
+ movdqu 32(%rbx),%xmm2
+ por %xmm3,%xmm5
+ movdqu 48(%rbx),%xmm3
+ movdqa %xmm0,416(%rsp)
+ pshufd $0x1e,%xmm5,%xmm4
+ movdqa %xmm1,416+16(%rsp)
+ por %xmm0,%xmm1
+.byte 102,72,15,110,199
+ movdqa %xmm2,448(%rsp)
+ movdqa %xmm3,448+16(%rsp)
+ por %xmm2,%xmm3
+ por %xmm4,%xmm5
+ pxor %xmm4,%xmm4
+ por %xmm1,%xmm3
+
+ leaq 64-128(%rsi),%rsi
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ pcmpeqd %xmm4,%xmm5
+ pshufd $0xb1,%xmm3,%xmm4
+ movq 0(%rbx),%rdx
+
+ movq %r12,%r9
+ por %xmm3,%xmm4
+ pshufd $0,%xmm5,%xmm5
+ pshufd $0x1e,%xmm4,%xmm3
+ movq %r13,%r10
+ por %xmm3,%xmm4
+ pxor %xmm3,%xmm3
+ movq %r14,%r11
+ pcmpeqd %xmm3,%xmm4
+ pshufd $0,%xmm4,%xmm4
+
+ leaq 32-128(%rsp),%rsi
+ movq %r15,%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 320(%rsp),%rbx
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 384(%rsp),%rdx
+ leaq 384(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 384(%rsp),%rdx
+ leaq 384(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 288(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 448(%rsp),%rdx
+ leaq 448(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 352(%rsp),%rbx
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+64(%rsp),%rdx
+ movq 8+64(%rsp),%r14
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r15
+ movq 24+64(%rsp),%r8
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 0+96(%rsp),%rdx
+ movq 8+96(%rsp),%r14
+ leaq -128+96(%rsp),%rsi
+ movq 16+96(%rsp),%r15
+ movq 24+96(%rsp),%r8
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 128(%rsp),%rdx
+ leaq 128(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 160(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 320(%rsp),%rdx
+ leaq 320(%rsp),%rbx
+ movq 0+128(%rsp),%r9
+ movq 8+128(%rsp),%r10
+ leaq -128+128(%rsp),%rsi
+ movq 16+128(%rsp),%r11
+ movq 24+128(%rsp),%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+
+
+
+ xorq %r11,%r11
+ addq %r12,%r12
+ leaq 192(%rsp),%rsi
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ movq 0(%rsi),%rax
+ cmovcq %rbp,%r13
+ movq 8(%rsi),%rbp
+ cmovcq %rcx,%r8
+ movq 16(%rsi),%rcx
+ cmovcq %r10,%r9
+ movq 24(%rsi),%r10
+
+ call __ecp_nistz256_subx
+
+ leaq 160(%rsp),%rbx
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+0(%rsp),%rax
+ movq 0+8(%rsp),%rbp
+ movq 0+16(%rsp),%rcx
+ movq 0+24(%rsp),%r10
+ leaq 64(%rsp),%rdi
+
+ call __ecp_nistz256_subx
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+ movq 352(%rsp),%rdx
+ leaq 352(%rsp),%rbx
+ movq 0+160(%rsp),%r9
+ movq 8+160(%rsp),%r10
+ leaq -128+160(%rsp),%rsi
+ movq 16+160(%rsp),%r11
+ movq 24+160(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 96(%rsp),%rdx
+ leaq 96(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 32(%rsp),%rbx
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+.byte 102,72,15,126,199
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 288(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 288+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand L$ONE_mont(%rip),%xmm2
+ pand L$ONE_mont+16(%rip),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 384(%rsp),%xmm2
+ pand 384+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,64(%rdi)
+ movdqu %xmm3,80(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 224(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 224+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 416(%rsp),%xmm2
+ pand 416+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 320(%rsp),%xmm2
+ pand 320+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 256(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 256+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 448(%rsp),%xmm2
+ pand 448+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 352(%rsp),%xmm2
+ pand 352+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+
+ leaq 480+56(%rsp),%rsi
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbx
+
+ movq -8(%rsi),%rbp
+
+ leaq (%rsi),%rsp
+
+L$add_affinex_epilogue:
+ ret
+
+
+#endif
diff --git a/gen/bcm/p256-x86_64-asm-linux.S b/gen/bcm/p256-x86_64-asm-linux.S
new file mode 100644
index 0000000..b285543
--- /dev/null
+++ b/gen/bcm/p256-x86_64-asm-linux.S
@@ -0,0 +1,4548 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text
+.extern OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+
+
+.section .rodata
+.align 64
+.Lpoly:
+.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
+
+.LOne:
+.long 1,1,1,1,1,1,1,1
+.LTwo:
+.long 2,2,2,2,2,2,2,2
+.LThree:
+.long 3,3,3,3,3,3,3,3
+.LONE_mont:
+.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
+
+
+.Lord:
+.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
+.LordK:
+.quad 0xccd1c8aaee00bc4f
+.text
+
+
+
+.globl ecp_nistz256_neg
+.hidden ecp_nistz256_neg
+.type ecp_nistz256_neg,@function
+.align 32
+ecp_nistz256_neg:
+.cfi_startproc
+_CET_ENDBR
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-16
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-24
+.Lneg_body:
+
+ xorq %r8,%r8
+ xorq %r9,%r9
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %r13,%r13
+
+ subq 0(%rsi),%r8
+ sbbq 8(%rsi),%r9
+ sbbq 16(%rsi),%r10
+ movq %r8,%rax
+ sbbq 24(%rsi),%r11
+ leaq .Lpoly(%rip),%rsi
+ movq %r9,%rdx
+ sbbq $0,%r13
+
+ addq 0(%rsi),%r8
+ movq %r10,%rcx
+ adcq 8(%rsi),%r9
+ adcq 16(%rsi),%r10
+ movq %r11,%r12
+ adcq 24(%rsi),%r11
+ testq %r13,%r13
+
+ cmovzq %rax,%r8
+ cmovzq %rdx,%r9
+ movq %r8,0(%rdi)
+ cmovzq %rcx,%r10
+ movq %r9,8(%rdi)
+ cmovzq %r12,%r11
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+
+ movq 0(%rsp),%r13
+.cfi_restore %r13
+ movq 8(%rsp),%r12
+.cfi_restore %r12
+ leaq 16(%rsp),%rsp
+.cfi_adjust_cfa_offset -16
+.Lneg_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_neg,.-ecp_nistz256_neg
+
+
+
+
+
+
+.globl ecp_nistz256_ord_mul_mont
+.hidden ecp_nistz256_ord_mul_mont
+.type ecp_nistz256_ord_mul_mont,@function
+.align 32
+ecp_nistz256_ord_mul_mont:
+.cfi_startproc
+_CET_ENDBR
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ je .Lecp_nistz256_ord_mul_montx
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lord_mul_body:
+
+ movq 0(%rdx),%rax
+ movq %rdx,%rbx
+ leaq .Lord(%rip),%r14
+ movq .LordK(%rip),%r15
+
+
+ movq %rax,%rcx
+ mulq 0(%rsi)
+ movq %rax,%r8
+ movq %rcx,%rax
+ movq %rdx,%r9
+
+ mulq 8(%rsi)
+ addq %rax,%r9
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq 16(%rsi)
+ addq %rax,%r10
+ movq %rcx,%rax
+ adcq $0,%rdx
+
+ movq %r8,%r13
+ imulq %r15,%r8
+
+ movq %rdx,%r11
+ mulq 24(%rsi)
+ addq %rax,%r11
+ movq %r8,%rax
+ adcq $0,%rdx
+ movq %rdx,%r12
+
+
+ mulq 0(%r14)
+ movq %r8,%rbp
+ addq %rax,%r13
+ movq %r8,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ subq %r8,%r10
+ sbbq $0,%r8
+
+ mulq 8(%r14)
+ addq %rcx,%r9
+ adcq $0,%rdx
+ addq %rax,%r9
+ movq %rbp,%rax
+ adcq %rdx,%r10
+ movq %rbp,%rdx
+ adcq $0,%r8
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r11
+ movq 8(%rbx),%rax
+ sbbq %rdx,%rbp
+
+ addq %r8,%r11
+ adcq %rbp,%r12
+ adcq $0,%r13
+
+
+ movq %rax,%rcx
+ mulq 0(%rsi)
+ addq %rax,%r9
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 8(%rsi)
+ addq %rbp,%r10
+ adcq $0,%rdx
+ addq %rax,%r10
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 16(%rsi)
+ addq %rbp,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %rcx,%rax
+ adcq $0,%rdx
+
+ movq %r9,%rcx
+ imulq %r15,%r9
+
+ movq %rdx,%rbp
+ mulq 24(%rsi)
+ addq %rbp,%r12
+ adcq $0,%rdx
+ xorq %r8,%r8
+ addq %rax,%r12
+ movq %r9,%rax
+ adcq %rdx,%r13
+ adcq $0,%r8
+
+
+ mulq 0(%r14)
+ movq %r9,%rbp
+ addq %rax,%rcx
+ movq %r9,%rax
+ adcq %rdx,%rcx
+
+ subq %r9,%r11
+ sbbq $0,%r9
+
+ mulq 8(%r14)
+ addq %rcx,%r10
+ adcq $0,%rdx
+ addq %rax,%r10
+ movq %rbp,%rax
+ adcq %rdx,%r11
+ movq %rbp,%rdx
+ adcq $0,%r9
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r12
+ movq 16(%rbx),%rax
+ sbbq %rdx,%rbp
+
+ addq %r9,%r12
+ adcq %rbp,%r13
+ adcq $0,%r8
+
+
+ movq %rax,%rcx
+ mulq 0(%rsi)
+ addq %rax,%r10
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 8(%rsi)
+ addq %rbp,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 16(%rsi)
+ addq %rbp,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %rcx,%rax
+ adcq $0,%rdx
+
+ movq %r10,%rcx
+ imulq %r15,%r10
+
+ movq %rdx,%rbp
+ mulq 24(%rsi)
+ addq %rbp,%r13
+ adcq $0,%rdx
+ xorq %r9,%r9
+ addq %rax,%r13
+ movq %r10,%rax
+ adcq %rdx,%r8
+ adcq $0,%r9
+
+
+ mulq 0(%r14)
+ movq %r10,%rbp
+ addq %rax,%rcx
+ movq %r10,%rax
+ adcq %rdx,%rcx
+
+ subq %r10,%r12
+ sbbq $0,%r10
+
+ mulq 8(%r14)
+ addq %rcx,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %rbp,%rax
+ adcq %rdx,%r12
+ movq %rbp,%rdx
+ adcq $0,%r10
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r13
+ movq 24(%rbx),%rax
+ sbbq %rdx,%rbp
+
+ addq %r10,%r13
+ adcq %rbp,%r8
+ adcq $0,%r9
+
+
+ movq %rax,%rcx
+ mulq 0(%rsi)
+ addq %rax,%r11
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 8(%rsi)
+ addq %rbp,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 16(%rsi)
+ addq %rbp,%r13
+ adcq $0,%rdx
+ addq %rax,%r13
+ movq %rcx,%rax
+ adcq $0,%rdx
+
+ movq %r11,%rcx
+ imulq %r15,%r11
+
+ movq %rdx,%rbp
+ mulq 24(%rsi)
+ addq %rbp,%r8
+ adcq $0,%rdx
+ xorq %r10,%r10
+ addq %rax,%r8
+ movq %r11,%rax
+ adcq %rdx,%r9
+ adcq $0,%r10
+
+
+ mulq 0(%r14)
+ movq %r11,%rbp
+ addq %rax,%rcx
+ movq %r11,%rax
+ adcq %rdx,%rcx
+
+ subq %r11,%r13
+ sbbq $0,%r11
+
+ mulq 8(%r14)
+ addq %rcx,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %rbp,%rax
+ adcq %rdx,%r13
+ movq %rbp,%rdx
+ adcq $0,%r11
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r8
+ sbbq %rdx,%rbp
+
+ addq %r11,%r8
+ adcq %rbp,%r9
+ adcq $0,%r10
+
+
+ movq %r12,%rsi
+ subq 0(%r14),%r12
+ movq %r13,%r11
+ sbbq 8(%r14),%r13
+ movq %r8,%rcx
+ sbbq 16(%r14),%r8
+ movq %r9,%rbp
+ sbbq 24(%r14),%r9
+ sbbq $0,%r10
+
+ cmovcq %rsi,%r12
+ cmovcq %r11,%r13
+ cmovcq %rcx,%r8
+ cmovcq %rbp,%r9
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbx
+.cfi_restore %rbx
+ movq 40(%rsp),%rbp
+.cfi_restore %rbp
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_mul_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
+
+
+
+
+
+
+
+.globl ecp_nistz256_ord_sqr_mont
+.hidden ecp_nistz256_ord_sqr_mont
+.type ecp_nistz256_ord_sqr_mont,@function
+.align 32
+ecp_nistz256_ord_sqr_mont:
+.cfi_startproc
+_CET_ENDBR
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ je .Lecp_nistz256_ord_sqr_montx
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lord_sqr_body:
+
+ movq 0(%rsi),%r8
+ movq 8(%rsi),%rax
+ movq 16(%rsi),%r14
+ movq 24(%rsi),%r15
+ leaq .Lord(%rip),%rsi
+ movq %rdx,%rbx
+ jmp .Loop_ord_sqr
+
+.align 32
+.Loop_ord_sqr:
+
+ movq %rax,%rbp
+ mulq %r8
+ movq %rax,%r9
+.byte 102,72,15,110,205
+ movq %r14,%rax
+ movq %rdx,%r10
+
+ mulq %r8
+ addq %rax,%r10
+ movq %r15,%rax
+.byte 102,73,15,110,214
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %r8
+ addq %rax,%r11
+ movq %r15,%rax
+.byte 102,73,15,110,223
+ adcq $0,%rdx
+ movq %rdx,%r12
+
+
+ mulq %r14
+ movq %rax,%r13
+ movq %r14,%rax
+ movq %rdx,%r14
+
+
+ mulq %rbp
+ addq %rax,%r11
+ movq %r15,%rax
+ adcq $0,%rdx
+ movq %rdx,%r15
+
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+
+ addq %r15,%r12
+ adcq %rdx,%r13
+ adcq $0,%r14
+
+
+ xorq %r15,%r15
+ movq %r8,%rax
+ addq %r9,%r9
+ adcq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq $0,%r15
+
+
+ mulq %rax
+ movq %rax,%r8
+.byte 102,72,15,126,200
+ movq %rdx,%rbp
+
+ mulq %rax
+ addq %rbp,%r9
+ adcq %rax,%r10
+.byte 102,72,15,126,208
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq %rax
+ addq %rbp,%r11
+ adcq %rax,%r12
+.byte 102,72,15,126,216
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ movq %r8,%rcx
+ imulq 32(%rsi),%r8
+
+ mulq %rax
+ addq %rbp,%r13
+ adcq %rax,%r14
+ movq 0(%rsi),%rax
+ adcq %rdx,%r15
+
+
+ mulq %r8
+ movq %r8,%rbp
+ addq %rax,%rcx
+ movq 8(%rsi),%rax
+ adcq %rdx,%rcx
+
+ subq %r8,%r10
+ sbbq $0,%rbp
+
+ mulq %r8
+ addq %rcx,%r9
+ adcq $0,%rdx
+ addq %rax,%r9
+ movq %r8,%rax
+ adcq %rdx,%r10
+ movq %r8,%rdx
+ adcq $0,%rbp
+
+ movq %r9,%rcx
+ imulq 32(%rsi),%r9
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r11
+ movq 0(%rsi),%rax
+ sbbq %rdx,%r8
+
+ addq %rbp,%r11
+ adcq $0,%r8
+
+
+ mulq %r9
+ movq %r9,%rbp
+ addq %rax,%rcx
+ movq 8(%rsi),%rax
+ adcq %rdx,%rcx
+
+ subq %r9,%r11
+ sbbq $0,%rbp
+
+ mulq %r9
+ addq %rcx,%r10
+ adcq $0,%rdx
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+ movq %r9,%rdx
+ adcq $0,%rbp
+
+ movq %r10,%rcx
+ imulq 32(%rsi),%r10
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r8
+ movq 0(%rsi),%rax
+ sbbq %rdx,%r9
+
+ addq %rbp,%r8
+ adcq $0,%r9
+
+
+ mulq %r10
+ movq %r10,%rbp
+ addq %rax,%rcx
+ movq 8(%rsi),%rax
+ adcq %rdx,%rcx
+
+ subq %r10,%r8
+ sbbq $0,%rbp
+
+ mulq %r10
+ addq %rcx,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %r10,%rax
+ adcq %rdx,%r8
+ movq %r10,%rdx
+ adcq $0,%rbp
+
+ movq %r11,%rcx
+ imulq 32(%rsi),%r11
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r9
+ movq 0(%rsi),%rax
+ sbbq %rdx,%r10
+
+ addq %rbp,%r9
+ adcq $0,%r10
+
+
+ mulq %r11
+ movq %r11,%rbp
+ addq %rax,%rcx
+ movq 8(%rsi),%rax
+ adcq %rdx,%rcx
+
+ subq %r11,%r9
+ sbbq $0,%rbp
+
+ mulq %r11
+ addq %rcx,%r8
+ adcq $0,%rdx
+ addq %rax,%r8
+ movq %r11,%rax
+ adcq %rdx,%r9
+ movq %r11,%rdx
+ adcq $0,%rbp
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r10
+ sbbq %rdx,%r11
+
+ addq %rbp,%r10
+ adcq $0,%r11
+
+
+ xorq %rdx,%rdx
+ addq %r12,%r8
+ adcq %r13,%r9
+ movq %r8,%r12
+ adcq %r14,%r10
+ adcq %r15,%r11
+ movq %r9,%rax
+ adcq $0,%rdx
+
+
+ subq 0(%rsi),%r8
+ movq %r10,%r14
+ sbbq 8(%rsi),%r9
+ sbbq 16(%rsi),%r10
+ movq %r11,%r15
+ sbbq 24(%rsi),%r11
+ sbbq $0,%rdx
+
+ cmovcq %r12,%r8
+ cmovncq %r9,%rax
+ cmovncq %r10,%r14
+ cmovncq %r11,%r15
+
+ decq %rbx
+ jnz .Loop_ord_sqr
+
+ movq %r8,0(%rdi)
+ movq %rax,8(%rdi)
+ pxor %xmm1,%xmm1
+ movq %r14,16(%rdi)
+ pxor %xmm2,%xmm2
+ movq %r15,24(%rdi)
+ pxor %xmm3,%xmm3
+
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbx
+.cfi_restore %rbx
+ movq 40(%rsp),%rbp
+.cfi_restore %rbp
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_sqr_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+
+.type ecp_nistz256_ord_mul_montx,@function
+.align 32
+ecp_nistz256_ord_mul_montx:
+.cfi_startproc
+.Lecp_nistz256_ord_mul_montx:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lord_mulx_body:
+
+ movq %rdx,%rbx
+ movq 0(%rdx),%rdx
+ movq 0(%rsi),%r9
+ movq 8(%rsi),%r10
+ movq 16(%rsi),%r11
+ movq 24(%rsi),%r12
+ leaq -128(%rsi),%rsi
+ leaq .Lord-128(%rip),%r14
+ movq .LordK(%rip),%r15
+
+
+ mulxq %r9,%r8,%r9
+ mulxq %r10,%rcx,%r10
+ mulxq %r11,%rbp,%r11
+ addq %rcx,%r9
+ mulxq %r12,%rcx,%r12
+ movq %r8,%rdx
+ mulxq %r15,%rdx,%rax
+ adcq %rbp,%r10
+ adcq %rcx,%r11
+ adcq $0,%r12
+
+
+ xorq %r13,%r13
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 8(%rbx),%rdx
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+ adcxq %r8,%r12
+ adoxq %r8,%r13
+ adcq $0,%r13
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r9,%rdx
+ mulxq %r15,%rdx,%rax
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ adcxq %r8,%r13
+ adoxq %r8,%r8
+ adcq $0,%r8
+
+
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 16(%rbx),%rdx
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+ adcxq %r9,%r13
+ adoxq %r9,%r8
+ adcq $0,%r8
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r10,%rdx
+ mulxq %r15,%rdx,%rax
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ adcxq %r9,%r8
+ adoxq %r9,%r9
+ adcq $0,%r9
+
+
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 24(%rbx),%rdx
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+ adcxq %r10,%r8
+ adoxq %r10,%r9
+ adcq $0,%r9
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r11,%rdx
+ mulxq %r15,%rdx,%rax
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+
+ adcxq %r10,%r9
+ adoxq %r10,%r10
+ adcq $0,%r10
+
+
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ leaq 128(%r14),%r14
+ movq %r12,%rbx
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ movq %r13,%rdx
+ adcxq %r11,%r9
+ adoxq %r11,%r10
+ adcq $0,%r10
+
+
+
+ movq %r8,%rcx
+ subq 0(%r14),%r12
+ sbbq 8(%r14),%r13
+ sbbq 16(%r14),%r8
+ movq %r9,%rbp
+ sbbq 24(%r14),%r9
+ sbbq $0,%r10
+
+ cmovcq %rbx,%r12
+ cmovcq %rdx,%r13
+ cmovcq %rcx,%r8
+ cmovcq %rbp,%r9
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbx
+.cfi_restore %rbx
+ movq 40(%rsp),%rbp
+.cfi_restore %rbp
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_mulx_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
+
+.type ecp_nistz256_ord_sqr_montx,@function
+.align 32
+ecp_nistz256_ord_sqr_montx:
+.cfi_startproc
+.Lecp_nistz256_ord_sqr_montx:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lord_sqrx_body:
+
+ movq %rdx,%rbx
+ movq 0(%rsi),%rdx
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r15
+ movq 24(%rsi),%r8
+ leaq .Lord(%rip),%rsi
+ jmp .Loop_ord_sqrx
+
+.align 32
+.Loop_ord_sqrx:
+ mulxq %r14,%r9,%r10
+ mulxq %r15,%rcx,%r11
+ movq %rdx,%rax
+.byte 102,73,15,110,206
+ mulxq %r8,%rbp,%r12
+ movq %r14,%rdx
+ addq %rcx,%r10
+.byte 102,73,15,110,215
+ adcq %rbp,%r11
+ adcq $0,%r12
+ xorq %r13,%r13
+
+ mulxq %r15,%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq %r8,%rcx,%rbp
+ movq %r15,%rdx
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+ adcq $0,%r13
+
+ mulxq %r8,%rcx,%r14
+ movq %rax,%rdx
+.byte 102,73,15,110,216
+ xorq %r15,%r15
+ adcxq %r9,%r9
+ adoxq %rcx,%r13
+ adcxq %r10,%r10
+ adoxq %r15,%r14
+
+
+ mulxq %rdx,%r8,%rbp
+.byte 102,72,15,126,202
+ adcxq %r11,%r11
+ adoxq %rbp,%r9
+ adcxq %r12,%r12
+ mulxq %rdx,%rcx,%rax
+.byte 102,72,15,126,210
+ adcxq %r13,%r13
+ adoxq %rcx,%r10
+ adcxq %r14,%r14
+ mulxq %rdx,%rcx,%rbp
+.byte 0x67
+.byte 102,72,15,126,218
+ adoxq %rax,%r11
+ adcxq %r15,%r15
+ adoxq %rcx,%r12
+ adoxq %rbp,%r13
+ mulxq %rdx,%rcx,%rax
+ adoxq %rcx,%r14
+ adoxq %rax,%r15
+
+
+ movq %r8,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ xorq %rax,%rax
+ mulxq 0(%rsi),%rcx,%rbp
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ mulxq 8(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+ mulxq 16(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+ mulxq 24(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r8
+ adcxq %rax,%r8
+
+
+ movq %r9,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ mulxq 0(%rsi),%rcx,%rbp
+ adoxq %rcx,%r9
+ adcxq %rbp,%r10
+ mulxq 8(%rsi),%rcx,%rbp
+ adoxq %rcx,%r10
+ adcxq %rbp,%r11
+ mulxq 16(%rsi),%rcx,%rbp
+ adoxq %rcx,%r11
+ adcxq %rbp,%r8
+ mulxq 24(%rsi),%rcx,%rbp
+ adoxq %rcx,%r8
+ adcxq %rbp,%r9
+ adoxq %rax,%r9
+
+
+ movq %r10,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ mulxq 0(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+ mulxq 8(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r8
+ mulxq 16(%rsi),%rcx,%rbp
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ mulxq 24(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+ adcxq %rax,%r10
+
+
+ movq %r11,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ mulxq 0(%rsi),%rcx,%rbp
+ adoxq %rcx,%r11
+ adcxq %rbp,%r8
+ mulxq 8(%rsi),%rcx,%rbp
+ adoxq %rcx,%r8
+ adcxq %rbp,%r9
+ mulxq 16(%rsi),%rcx,%rbp
+ adoxq %rcx,%r9
+ adcxq %rbp,%r10
+ mulxq 24(%rsi),%rcx,%rbp
+ adoxq %rcx,%r10
+ adcxq %rbp,%r11
+ adoxq %rax,%r11
+
+
+ addq %r8,%r12
+ adcq %r13,%r9
+ movq %r12,%rdx
+ adcq %r14,%r10
+ adcq %r15,%r11
+ movq %r9,%r14
+ adcq $0,%rax
+
+
+ subq 0(%rsi),%r12
+ movq %r10,%r15
+ sbbq 8(%rsi),%r9
+ sbbq 16(%rsi),%r10
+ movq %r11,%r8
+ sbbq 24(%rsi),%r11
+ sbbq $0,%rax
+
+ cmovncq %r12,%rdx
+ cmovncq %r9,%r14
+ cmovncq %r10,%r15
+ cmovncq %r11,%r8
+
+ decq %rbx
+ jnz .Loop_ord_sqrx
+
+ movq %rdx,0(%rdi)
+ movq %r14,8(%rdi)
+ pxor %xmm1,%xmm1
+ movq %r15,16(%rdi)
+ pxor %xmm2,%xmm2
+ movq %r8,24(%rdi)
+ pxor %xmm3,%xmm3
+
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbx
+.cfi_restore %rbx
+ movq 40(%rsp),%rbp
+.cfi_restore %rbp
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_sqrx_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
+
+
+
+
+
+
+.globl ecp_nistz256_mul_mont
+.hidden ecp_nistz256_mul_mont
+.type ecp_nistz256_mul_mont,@function
+.align 32
+ecp_nistz256_mul_mont:
+.cfi_startproc
+_CET_ENDBR
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+.Lmul_mont:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lmul_body:
+ cmpl $0x80100,%ecx
+ je .Lmul_montx
+ movq %rdx,%rbx
+ movq 0(%rdx),%rax
+ movq 0(%rsi),%r9
+ movq 8(%rsi),%r10
+ movq 16(%rsi),%r11
+ movq 24(%rsi),%r12
+
+ call __ecp_nistz256_mul_montq
+ jmp .Lmul_mont_done
+
+.align 32
+.Lmul_montx:
+ movq %rdx,%rbx
+ movq 0(%rdx),%rdx
+ movq 0(%rsi),%r9
+ movq 8(%rsi),%r10
+ movq 16(%rsi),%r11
+ movq 24(%rsi),%r12
+ leaq -128(%rsi),%rsi
+
+ call __ecp_nistz256_mul_montx
+.Lmul_mont_done:
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbx
+.cfi_restore %rbx
+ movq 40(%rsp),%rbp
+.cfi_restore %rbp
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lmul_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
+
+.type __ecp_nistz256_mul_montq,@function
+.align 32
+__ecp_nistz256_mul_montq:
+.cfi_startproc
+
+
+ movq %rax,%rbp
+ mulq %r9
+ movq .Lpoly+8(%rip),%r14
+ movq %rax,%r8
+ movq %rbp,%rax
+ movq %rdx,%r9
+
+ mulq %r10
+ movq .Lpoly+24(%rip),%r15
+ addq %rax,%r9
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %r11
+ addq %rax,%r10
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %r12
+ addq %rax,%r11
+ movq %r8,%rax
+ adcq $0,%rdx
+ xorq %r13,%r13
+ movq %rdx,%r12
+
+
+
+
+
+
+
+
+
+
+ movq %r8,%rbp
+ shlq $32,%r8
+ mulq %r15
+ shrq $32,%rbp
+ addq %r8,%r9
+ adcq %rbp,%r10
+ adcq %rax,%r11
+ movq 8(%rbx),%rax
+ adcq %rdx,%r12
+ adcq $0,%r13
+ xorq %r8,%r8
+
+
+
+ movq %rax,%rbp
+ mulq 0(%rsi)
+ addq %rax,%r9
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 8(%rsi)
+ addq %rcx,%r10
+ adcq $0,%rdx
+ addq %rax,%r10
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 16(%rsi)
+ addq %rcx,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 24(%rsi)
+ addq %rcx,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %r9,%rax
+ adcq %rdx,%r13
+ adcq $0,%r8
+
+
+
+ movq %r9,%rbp
+ shlq $32,%r9
+ mulq %r15
+ shrq $32,%rbp
+ addq %r9,%r10
+ adcq %rbp,%r11
+ adcq %rax,%r12
+ movq 16(%rbx),%rax
+ adcq %rdx,%r13
+ adcq $0,%r8
+ xorq %r9,%r9
+
+
+
+ movq %rax,%rbp
+ mulq 0(%rsi)
+ addq %rax,%r10
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 8(%rsi)
+ addq %rcx,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 16(%rsi)
+ addq %rcx,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 24(%rsi)
+ addq %rcx,%r13
+ adcq $0,%rdx
+ addq %rax,%r13
+ movq %r10,%rax
+ adcq %rdx,%r8
+ adcq $0,%r9
+
+
+
+ movq %r10,%rbp
+ shlq $32,%r10
+ mulq %r15
+ shrq $32,%rbp
+ addq %r10,%r11
+ adcq %rbp,%r12
+ adcq %rax,%r13
+ movq 24(%rbx),%rax
+ adcq %rdx,%r8
+ adcq $0,%r9
+ xorq %r10,%r10
+
+
+
+ movq %rax,%rbp
+ mulq 0(%rsi)
+ addq %rax,%r11
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 8(%rsi)
+ addq %rcx,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 16(%rsi)
+ addq %rcx,%r13
+ adcq $0,%rdx
+ addq %rax,%r13
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 24(%rsi)
+ addq %rcx,%r8
+ adcq $0,%rdx
+ addq %rax,%r8
+ movq %r11,%rax
+ adcq %rdx,%r9
+ adcq $0,%r10
+
+
+
+ movq %r11,%rbp
+ shlq $32,%r11
+ mulq %r15
+ shrq $32,%rbp
+ addq %r11,%r12
+ adcq %rbp,%r13
+ movq %r12,%rcx
+ adcq %rax,%r8
+ adcq %rdx,%r9
+ movq %r13,%rbp
+ adcq $0,%r10
+
+
+
+ subq $-1,%r12
+ movq %r8,%rbx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%rdx
+ sbbq %r15,%r9
+ sbbq $0,%r10
+
+ cmovcq %rcx,%r12
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rbx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %rdx,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ ret
+.cfi_endproc
+.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
+
+
+
+
+
+
+
+
+.globl ecp_nistz256_sqr_mont
+.hidden ecp_nistz256_sqr_mont
+.type ecp_nistz256_sqr_mont,@function
+.align 32
+ecp_nistz256_sqr_mont:
+.cfi_startproc
+_CET_ENDBR
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lsqr_body:
+ cmpl $0x80100,%ecx
+ je .Lsqr_montx
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r15
+ movq 24(%rsi),%r8
+
+ call __ecp_nistz256_sqr_montq
+ jmp .Lsqr_mont_done
+
+.align 32
+.Lsqr_montx:
+ movq 0(%rsi),%rdx
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r15
+ movq 24(%rsi),%r8
+ leaq -128(%rsi),%rsi
+
+ call __ecp_nistz256_sqr_montx
+.Lsqr_mont_done:
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbx
+.cfi_restore %rbx
+ movq 40(%rsp),%rbp
+.cfi_restore %rbp
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lsqr_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
+
+.type __ecp_nistz256_sqr_montq,@function
+.align 32
+__ecp_nistz256_sqr_montq:
+.cfi_startproc
+ movq %rax,%r13
+ mulq %r14
+ movq %rax,%r9
+ movq %r15,%rax
+ movq %rdx,%r10
+
+ mulq %r13
+ addq %rax,%r10
+ movq %r8,%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %r13
+ addq %rax,%r11
+ movq %r15,%rax
+ adcq $0,%rdx
+ movq %rdx,%r12
+
+
+ mulq %r14
+ addq %rax,%r11
+ movq %r8,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq %r14
+ addq %rax,%r12
+ movq %r8,%rax
+ adcq $0,%rdx
+ addq %rbp,%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+
+ mulq %r15
+ xorq %r15,%r15
+ addq %rax,%r13
+ movq 0(%rsi),%rax
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ addq %r9,%r9
+ adcq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq $0,%r15
+
+ mulq %rax
+ movq %rax,%r8
+ movq 8(%rsi),%rax
+ movq %rdx,%rcx
+
+ mulq %rax
+ addq %rcx,%r9
+ adcq %rax,%r10
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq %rax
+ addq %rcx,%r11
+ adcq %rax,%r12
+ movq 24(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq %rax
+ addq %rcx,%r13
+ adcq %rax,%r14
+ movq %r8,%rax
+ adcq %rdx,%r15
+
+ movq .Lpoly+8(%rip),%rsi
+ movq .Lpoly+24(%rip),%rbp
+
+
+
+
+ movq %r8,%rcx
+ shlq $32,%r8
+ mulq %rbp
+ shrq $32,%rcx
+ addq %r8,%r9
+ adcq %rcx,%r10
+ adcq %rax,%r11
+ movq %r9,%rax
+ adcq $0,%rdx
+
+
+
+ movq %r9,%rcx
+ shlq $32,%r9
+ movq %rdx,%r8
+ mulq %rbp
+ shrq $32,%rcx
+ addq %r9,%r10
+ adcq %rcx,%r11
+ adcq %rax,%r8
+ movq %r10,%rax
+ adcq $0,%rdx
+
+
+
+ movq %r10,%rcx
+ shlq $32,%r10
+ movq %rdx,%r9
+ mulq %rbp
+ shrq $32,%rcx
+ addq %r10,%r11
+ adcq %rcx,%r8
+ adcq %rax,%r9
+ movq %r11,%rax
+ adcq $0,%rdx
+
+
+
+ movq %r11,%rcx
+ shlq $32,%r11
+ movq %rdx,%r10
+ mulq %rbp
+ shrq $32,%rcx
+ addq %r11,%r8
+ adcq %rcx,%r9
+ adcq %rax,%r10
+ adcq $0,%rdx
+ xorq %r11,%r11
+
+
+
+ addq %r8,%r12
+ adcq %r9,%r13
+ movq %r12,%r8
+ adcq %r10,%r14
+ adcq %rdx,%r15
+ movq %r13,%r9
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r14,%r10
+ sbbq %rsi,%r13
+ sbbq $0,%r14
+ movq %r15,%rcx
+ sbbq %rbp,%r15
+ sbbq $0,%r11
+
+ cmovcq %r8,%r12
+ cmovcq %r9,%r13
+ movq %r12,0(%rdi)
+ cmovcq %r10,%r14
+ movq %r13,8(%rdi)
+ cmovcq %rcx,%r15
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+
+ ret
+.cfi_endproc
+.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
+.type __ecp_nistz256_mul_montx,@function
+.align 32
+__ecp_nistz256_mul_montx:
+.cfi_startproc
+
+
+ mulxq %r9,%r8,%r9
+ mulxq %r10,%rcx,%r10
+ movq $32,%r14
+ xorq %r13,%r13
+ mulxq %r11,%rbp,%r11
+ movq .Lpoly+24(%rip),%r15
+ adcq %rcx,%r9
+ mulxq %r12,%rcx,%r12
+ movq %r8,%rdx
+ adcq %rbp,%r10
+ shlxq %r14,%r8,%rbp
+ adcq %rcx,%r11
+ shrxq %r14,%r8,%rcx
+ adcq $0,%r12
+
+
+
+ addq %rbp,%r9
+ adcq %rcx,%r10
+
+ mulxq %r15,%rcx,%rbp
+ movq 8(%rbx),%rdx
+ adcq %rcx,%r11
+ adcq %rbp,%r12
+ adcq $0,%r13
+ xorq %r8,%r8
+
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r9,%rdx
+ adcxq %rcx,%r12
+ shlxq %r14,%r9,%rcx
+ adoxq %rbp,%r13
+ shrxq %r14,%r9,%rbp
+
+ adcxq %r8,%r13
+ adoxq %r8,%r8
+ adcq $0,%r8
+
+
+
+ addq %rcx,%r10
+ adcq %rbp,%r11
+
+ mulxq %r15,%rcx,%rbp
+ movq 16(%rbx),%rdx
+ adcq %rcx,%r12
+ adcq %rbp,%r13
+ adcq $0,%r8
+ xorq %r9,%r9
+
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r10,%rdx
+ adcxq %rcx,%r13
+ shlxq %r14,%r10,%rcx
+ adoxq %rbp,%r8
+ shrxq %r14,%r10,%rbp
+
+ adcxq %r9,%r8
+ adoxq %r9,%r9
+ adcq $0,%r9
+
+
+
+ addq %rcx,%r11
+ adcq %rbp,%r12
+
+ mulxq %r15,%rcx,%rbp
+ movq 24(%rbx),%rdx
+ adcq %rcx,%r13
+ adcq %rbp,%r8
+ adcq $0,%r9
+ xorq %r10,%r10
+
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r11,%rdx
+ adcxq %rcx,%r8
+ shlxq %r14,%r11,%rcx
+ adoxq %rbp,%r9
+ shrxq %r14,%r11,%rbp
+
+ adcxq %r10,%r9
+ adoxq %r10,%r10
+ adcq $0,%r10
+
+
+
+ addq %rcx,%r12
+ adcq %rbp,%r13
+
+ mulxq %r15,%rcx,%rbp
+ movq %r12,%rbx
+ movq .Lpoly+8(%rip),%r14
+ adcq %rcx,%r8
+ movq %r13,%rdx
+ adcq %rbp,%r9
+ adcq $0,%r10
+
+
+
+ xorl %eax,%eax
+ movq %r8,%rcx
+ sbbq $-1,%r12
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%rbp
+ sbbq %r15,%r9
+ sbbq $0,%r10
+
+ cmovcq %rbx,%r12
+ cmovcq %rdx,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %rbp,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ ret
+.cfi_endproc
+.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
+
+.type __ecp_nistz256_sqr_montx,@function
+.align 32
+__ecp_nistz256_sqr_montx:
+.cfi_startproc
+ mulxq %r14,%r9,%r10
+ mulxq %r15,%rcx,%r11
+ xorl %eax,%eax
+ adcq %rcx,%r10
+ mulxq %r8,%rbp,%r12
+ movq %r14,%rdx
+ adcq %rbp,%r11
+ adcq $0,%r12
+ xorq %r13,%r13
+
+
+ mulxq %r15,%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq %r8,%rcx,%rbp
+ movq %r15,%rdx
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+ adcq $0,%r13
+
+
+ mulxq %r8,%rcx,%r14
+ movq 0+128(%rsi),%rdx
+ xorq %r15,%r15
+ adcxq %r9,%r9
+ adoxq %rcx,%r13
+ adcxq %r10,%r10
+ adoxq %r15,%r14
+
+ mulxq %rdx,%r8,%rbp
+ movq 8+128(%rsi),%rdx
+ adcxq %r11,%r11
+ adoxq %rbp,%r9
+ adcxq %r12,%r12
+ mulxq %rdx,%rcx,%rax
+ movq 16+128(%rsi),%rdx
+ adcxq %r13,%r13
+ adoxq %rcx,%r10
+ adcxq %r14,%r14
+.byte 0x67
+ mulxq %rdx,%rcx,%rbp
+ movq 24+128(%rsi),%rdx
+ adoxq %rax,%r11
+ adcxq %r15,%r15
+ adoxq %rcx,%r12
+ movq $32,%rsi
+ adoxq %rbp,%r13
+.byte 0x67,0x67
+ mulxq %rdx,%rcx,%rax
+ movq .Lpoly+24(%rip),%rdx
+ adoxq %rcx,%r14
+ shlxq %rsi,%r8,%rcx
+ adoxq %rax,%r15
+ shrxq %rsi,%r8,%rax
+ movq %rdx,%rbp
+
+
+ addq %rcx,%r9
+ adcq %rax,%r10
+
+ mulxq %r8,%rcx,%r8
+ adcq %rcx,%r11
+ shlxq %rsi,%r9,%rcx
+ adcq $0,%r8
+ shrxq %rsi,%r9,%rax
+
+
+ addq %rcx,%r10
+ adcq %rax,%r11
+
+ mulxq %r9,%rcx,%r9
+ adcq %rcx,%r8
+ shlxq %rsi,%r10,%rcx
+ adcq $0,%r9
+ shrxq %rsi,%r10,%rax
+
+
+ addq %rcx,%r11
+ adcq %rax,%r8
+
+ mulxq %r10,%rcx,%r10
+ adcq %rcx,%r9
+ shlxq %rsi,%r11,%rcx
+ adcq $0,%r10
+ shrxq %rsi,%r11,%rax
+
+
+ addq %rcx,%r8
+ adcq %rax,%r9
+
+ mulxq %r11,%rcx,%r11
+ adcq %rcx,%r10
+ adcq $0,%r11
+
+ xorq %rdx,%rdx
+ addq %r8,%r12
+ movq .Lpoly+8(%rip),%rsi
+ adcq %r9,%r13
+ movq %r12,%r8
+ adcq %r10,%r14
+ adcq %r11,%r15
+ movq %r13,%r9
+ adcq $0,%rdx
+
+ subq $-1,%r12
+ movq %r14,%r10
+ sbbq %rsi,%r13
+ sbbq $0,%r14
+ movq %r15,%r11
+ sbbq %rbp,%r15
+ sbbq $0,%rdx
+
+ cmovcq %r8,%r12
+ cmovcq %r9,%r13
+ movq %r12,0(%rdi)
+ cmovcq %r10,%r14
+ movq %r13,8(%rdi)
+ cmovcq %r11,%r15
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+
+ ret
+.cfi_endproc
+.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
+
+
+.globl ecp_nistz256_select_w5
+.hidden ecp_nistz256_select_w5
+.type ecp_nistz256_select_w5,@function
+.align 32
+ecp_nistz256_select_w5:
+.cfi_startproc
+_CET_ENDBR
+ leaq OPENSSL_ia32cap_P(%rip),%rax
+ movq 8(%rax),%rax
+ testl $32,%eax
+ jnz .Lavx2_select_w5
+ movdqa .LOne(%rip),%xmm0
+ movd %edx,%xmm1
+
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+
+ movdqa %xmm0,%xmm8
+ pshufd $0,%xmm1,%xmm1
+
+ movq $16,%rax
+.Lselect_loop_sse_w5:
+
+ movdqa %xmm8,%xmm15
+ paddd %xmm0,%xmm8
+ pcmpeqd %xmm1,%xmm15
+
+ movdqa 0(%rsi),%xmm9
+ movdqa 16(%rsi),%xmm10
+ movdqa 32(%rsi),%xmm11
+ movdqa 48(%rsi),%xmm12
+ movdqa 64(%rsi),%xmm13
+ movdqa 80(%rsi),%xmm14
+ leaq 96(%rsi),%rsi
+
+ pand %xmm15,%xmm9
+ pand %xmm15,%xmm10
+ por %xmm9,%xmm2
+ pand %xmm15,%xmm11
+ por %xmm10,%xmm3
+ pand %xmm15,%xmm12
+ por %xmm11,%xmm4
+ pand %xmm15,%xmm13
+ por %xmm12,%xmm5
+ pand %xmm15,%xmm14
+ por %xmm13,%xmm6
+ por %xmm14,%xmm7
+
+ decq %rax
+ jnz .Lselect_loop_sse_w5
+
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+ movdqu %xmm4,32(%rdi)
+ movdqu %xmm5,48(%rdi)
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm7,80(%rdi)
+ ret
+.cfi_endproc
+.LSEH_end_ecp_nistz256_select_w5:
+.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
+
+
+
+.globl ecp_nistz256_select_w7
+.hidden ecp_nistz256_select_w7
+.type ecp_nistz256_select_w7,@function
+.align 32
+ecp_nistz256_select_w7:
+.cfi_startproc
+_CET_ENDBR
+ leaq OPENSSL_ia32cap_P(%rip),%rax
+ movq 8(%rax),%rax
+ testl $32,%eax
+ jnz .Lavx2_select_w7
+ movdqa .LOne(%rip),%xmm8
+ movd %edx,%xmm1
+
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+
+ movdqa %xmm8,%xmm0
+ pshufd $0,%xmm1,%xmm1
+ movq $64,%rax
+
+.Lselect_loop_sse_w7:
+ movdqa %xmm8,%xmm15
+ paddd %xmm0,%xmm8
+ movdqa 0(%rsi),%xmm9
+ movdqa 16(%rsi),%xmm10
+ pcmpeqd %xmm1,%xmm15
+ movdqa 32(%rsi),%xmm11
+ movdqa 48(%rsi),%xmm12
+ leaq 64(%rsi),%rsi
+
+ pand %xmm15,%xmm9
+ pand %xmm15,%xmm10
+ por %xmm9,%xmm2
+ pand %xmm15,%xmm11
+ por %xmm10,%xmm3
+ pand %xmm15,%xmm12
+ por %xmm11,%xmm4
+ prefetcht0 255(%rsi)
+ por %xmm12,%xmm5
+
+ decq %rax
+ jnz .Lselect_loop_sse_w7
+
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+ movdqu %xmm4,32(%rdi)
+ movdqu %xmm5,48(%rdi)
+ ret
+.cfi_endproc
+.LSEH_end_ecp_nistz256_select_w7:
+.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
+
+
+.type ecp_nistz256_avx2_select_w5,@function
+.align 32
+ecp_nistz256_avx2_select_w5:
+.cfi_startproc
+.Lavx2_select_w5:
+ vzeroupper
+ vmovdqa .LTwo(%rip),%ymm0
+
+ vpxor %ymm2,%ymm2,%ymm2
+ vpxor %ymm3,%ymm3,%ymm3
+ vpxor %ymm4,%ymm4,%ymm4
+
+ vmovdqa .LOne(%rip),%ymm5
+ vmovdqa .LTwo(%rip),%ymm10
+
+ vmovd %edx,%xmm1
+ vpermd %ymm1,%ymm2,%ymm1
+
+ movq $8,%rax
+.Lselect_loop_avx2_w5:
+
+ vmovdqa 0(%rsi),%ymm6
+ vmovdqa 32(%rsi),%ymm7
+ vmovdqa 64(%rsi),%ymm8
+
+ vmovdqa 96(%rsi),%ymm11
+ vmovdqa 128(%rsi),%ymm12
+ vmovdqa 160(%rsi),%ymm13
+
+ vpcmpeqd %ymm1,%ymm5,%ymm9
+ vpcmpeqd %ymm1,%ymm10,%ymm14
+
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm10,%ymm10
+ leaq 192(%rsi),%rsi
+
+ vpand %ymm9,%ymm6,%ymm6
+ vpand %ymm9,%ymm7,%ymm7
+ vpand %ymm9,%ymm8,%ymm8
+ vpand %ymm14,%ymm11,%ymm11
+ vpand %ymm14,%ymm12,%ymm12
+ vpand %ymm14,%ymm13,%ymm13
+
+ vpxor %ymm6,%ymm2,%ymm2
+ vpxor %ymm7,%ymm3,%ymm3
+ vpxor %ymm8,%ymm4,%ymm4
+ vpxor %ymm11,%ymm2,%ymm2
+ vpxor %ymm12,%ymm3,%ymm3
+ vpxor %ymm13,%ymm4,%ymm4
+
+ decq %rax
+ jnz .Lselect_loop_avx2_w5
+
+ vmovdqu %ymm2,0(%rdi)
+ vmovdqu %ymm3,32(%rdi)
+ vmovdqu %ymm4,64(%rdi)
+ vzeroupper
+ ret
+.cfi_endproc
+.LSEH_end_ecp_nistz256_avx2_select_w5:
+.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
+
+
+
+.globl ecp_nistz256_avx2_select_w7
+.hidden ecp_nistz256_avx2_select_w7
+.type ecp_nistz256_avx2_select_w7,@function
+.align 32
+ecp_nistz256_avx2_select_w7:
+.cfi_startproc
+.Lavx2_select_w7:
+_CET_ENDBR
+ vzeroupper
+ vmovdqa .LThree(%rip),%ymm0
+
+ vpxor %ymm2,%ymm2,%ymm2
+ vpxor %ymm3,%ymm3,%ymm3
+
+ vmovdqa .LOne(%rip),%ymm4
+ vmovdqa .LTwo(%rip),%ymm8
+ vmovdqa .LThree(%rip),%ymm12
+
+ vmovd %edx,%xmm1
+ vpermd %ymm1,%ymm2,%ymm1
+
+
+ movq $21,%rax
+.Lselect_loop_avx2_w7:
+
+ vmovdqa 0(%rsi),%ymm5
+ vmovdqa 32(%rsi),%ymm6
+
+ vmovdqa 64(%rsi),%ymm9
+ vmovdqa 96(%rsi),%ymm10
+
+ vmovdqa 128(%rsi),%ymm13
+ vmovdqa 160(%rsi),%ymm14
+
+ vpcmpeqd %ymm1,%ymm4,%ymm7
+ vpcmpeqd %ymm1,%ymm8,%ymm11
+ vpcmpeqd %ymm1,%ymm12,%ymm15
+
+ vpaddd %ymm0,%ymm4,%ymm4
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpaddd %ymm0,%ymm12,%ymm12
+ leaq 192(%rsi),%rsi
+
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+ vpand %ymm11,%ymm9,%ymm9
+ vpand %ymm11,%ymm10,%ymm10
+ vpand %ymm15,%ymm13,%ymm13
+ vpand %ymm15,%ymm14,%ymm14
+
+ vpxor %ymm5,%ymm2,%ymm2
+ vpxor %ymm6,%ymm3,%ymm3
+ vpxor %ymm9,%ymm2,%ymm2
+ vpxor %ymm10,%ymm3,%ymm3
+ vpxor %ymm13,%ymm2,%ymm2
+ vpxor %ymm14,%ymm3,%ymm3
+
+ decq %rax
+ jnz .Lselect_loop_avx2_w7
+
+
+ vmovdqa 0(%rsi),%ymm5
+ vmovdqa 32(%rsi),%ymm6
+
+ vpcmpeqd %ymm1,%ymm4,%ymm7
+
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+
+ vpxor %ymm5,%ymm2,%ymm2
+ vpxor %ymm6,%ymm3,%ymm3
+
+ vmovdqu %ymm2,0(%rdi)
+ vmovdqu %ymm3,32(%rdi)
+ vzeroupper
+ ret
+.cfi_endproc
+.LSEH_end_ecp_nistz256_avx2_select_w7:
+.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
+.type __ecp_nistz256_add_toq,@function
+.align 32
+__ecp_nistz256_add_toq:
+.cfi_startproc
+ xorq %r11,%r11
+ addq 0(%rbx),%r12
+ adcq 8(%rbx),%r13
+ movq %r12,%rax
+ adcq 16(%rbx),%r8
+ adcq 24(%rbx),%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ ret
+.cfi_endproc
+.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
+
+.type __ecp_nistz256_sub_fromq,@function
+.align 32
+__ecp_nistz256_sub_fromq:
+.cfi_startproc
+ subq 0(%rbx),%r12
+ sbbq 8(%rbx),%r13
+ movq %r12,%rax
+ sbbq 16(%rbx),%r8
+ sbbq 24(%rbx),%r9
+ movq %r13,%rbp
+ sbbq %r11,%r11
+
+ addq $-1,%r12
+ movq %r8,%rcx
+ adcq %r14,%r13
+ adcq $0,%r8
+ movq %r9,%r10
+ adcq %r15,%r9
+ testq %r11,%r11
+
+ cmovzq %rax,%r12
+ cmovzq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovzq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovzq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ ret
+.cfi_endproc
+.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
+
+.type __ecp_nistz256_subq,@function
+.align 32
+__ecp_nistz256_subq:
+.cfi_startproc
+ subq %r12,%rax
+ sbbq %r13,%rbp
+ movq %rax,%r12
+ sbbq %r8,%rcx
+ sbbq %r9,%r10
+ movq %rbp,%r13
+ sbbq %r11,%r11
+
+ addq $-1,%rax
+ movq %rcx,%r8
+ adcq %r14,%rbp
+ adcq $0,%rcx
+ movq %r10,%r9
+ adcq %r15,%r10
+ testq %r11,%r11
+
+ cmovnzq %rax,%r12
+ cmovnzq %rbp,%r13
+ cmovnzq %rcx,%r8
+ cmovnzq %r10,%r9
+
+ ret
+.cfi_endproc
+.size __ecp_nistz256_subq,.-__ecp_nistz256_subq
+
+.type __ecp_nistz256_mul_by_2q,@function
+.align 32
+__ecp_nistz256_mul_by_2q:
+.cfi_startproc
+ xorq %r11,%r11
+ addq %r12,%r12
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ ret
+.cfi_endproc
+.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
+.globl ecp_nistz256_point_double
+.hidden ecp_nistz256_point_double
+.type ecp_nistz256_point_double,@function
+.align 32
+ecp_nistz256_point_double:
+.cfi_startproc
+_CET_ENDBR
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ je .Lpoint_doublex
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $160+8,%rsp
+.cfi_adjust_cfa_offset 32*5+8
+.Lpoint_doubleq_body:
+
+.Lpoint_double_shortcutq:
+ movdqu 0(%rsi),%xmm0
+ movq %rsi,%rbx
+ movdqu 16(%rsi),%xmm1
+ movq 32+0(%rsi),%r12
+ movq 32+8(%rsi),%r13
+ movq 32+16(%rsi),%r8
+ movq 32+24(%rsi),%r9
+ movq .Lpoly+8(%rip),%r14
+ movq .Lpoly+24(%rip),%r15
+ movdqa %xmm0,96(%rsp)
+ movdqa %xmm1,96+16(%rsp)
+ leaq 32(%rdi),%r10
+ leaq 64(%rdi),%r11
+.byte 102,72,15,110,199
+.byte 102,73,15,110,202
+.byte 102,73,15,110,211
+
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2q
+
+ movq 64+0(%rsi),%rax
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ leaq 64-0(%rsi),%rsi
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 0+0(%rsp),%rax
+ movq 8+0(%rsp),%r14
+ leaq 0+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 32(%rbx),%rax
+ movq 64+0(%rbx),%r9
+ movq 64+8(%rbx),%r10
+ movq 64+16(%rbx),%r11
+ movq 64+24(%rbx),%r12
+ leaq 64-0(%rbx),%rsi
+ leaq 32(%rbx),%rbx
+.byte 102,72,15,126,215
+ call __ecp_nistz256_mul_montq
+ call __ecp_nistz256_mul_by_2q
+
+ movq 96+0(%rsp),%r12
+ movq 96+8(%rsp),%r13
+ leaq 64(%rsp),%rbx
+ movq 96+16(%rsp),%r8
+ movq 96+24(%rsp),%r9
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_add_toq
+
+ movq 96+0(%rsp),%r12
+ movq 96+8(%rsp),%r13
+ leaq 64(%rsp),%rbx
+ movq 96+16(%rsp),%r8
+ movq 96+24(%rsp),%r9
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ movq 0+0(%rsp),%rax
+ movq 8+0(%rsp),%r14
+ leaq 0+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+.byte 102,72,15,126,207
+ call __ecp_nistz256_sqr_montq
+ xorq %r9,%r9
+ movq %r12,%rax
+ addq $-1,%r12
+ movq %r13,%r10
+ adcq %rsi,%r13
+ movq %r14,%rcx
+ adcq $0,%r14
+ movq %r15,%r8
+ adcq %rbp,%r15
+ adcq $0,%r9
+ xorq %rsi,%rsi
+ testq $1,%rax
+
+ cmovzq %rax,%r12
+ cmovzq %r10,%r13
+ cmovzq %rcx,%r14
+ cmovzq %r8,%r15
+ cmovzq %rsi,%r9
+
+ movq %r13,%rax
+ shrq $1,%r12
+ shlq $63,%rax
+ movq %r14,%r10
+ shrq $1,%r13
+ orq %rax,%r12
+ shlq $63,%r10
+ movq %r15,%rcx
+ shrq $1,%r14
+ orq %r10,%r13
+ shlq $63,%rcx
+ movq %r12,0(%rdi)
+ shrq $1,%r15
+ movq %r13,8(%rdi)
+ shlq $63,%r9
+ orq %rcx,%r14
+ orq %r9,%r15
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ movq 64(%rsp),%rax
+ leaq 64(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2q
+
+ leaq 32(%rsp),%rbx
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_add_toq
+
+ movq 96(%rsp),%rax
+ leaq 96(%rsp),%rbx
+ movq 0+0(%rsp),%r9
+ movq 8+0(%rsp),%r10
+ leaq 0+0(%rsp),%rsi
+ movq 16+0(%rsp),%r11
+ movq 24+0(%rsp),%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2q
+
+ movq 0+32(%rsp),%rax
+ movq 8+32(%rsp),%r14
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r15
+ movq 24+32(%rsp),%r8
+.byte 102,72,15,126,199
+ call __ecp_nistz256_sqr_montq
+
+ leaq 128(%rsp),%rbx
+ movq %r14,%r8
+ movq %r15,%r9
+ movq %rsi,%r14
+ movq %rbp,%r15
+ call __ecp_nistz256_sub_fromq
+
+ movq 0+0(%rsp),%rax
+ movq 0+8(%rsp),%rbp
+ movq 0+16(%rsp),%rcx
+ movq 0+24(%rsp),%r10
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_subq
+
+ movq 32(%rsp),%rax
+ leaq 32(%rsp),%rbx
+ movq %r12,%r14
+ xorl %ecx,%ecx
+ movq %r12,0+0(%rsp)
+ movq %r13,%r10
+ movq %r13,0+8(%rsp)
+ cmovzq %r8,%r11
+ movq %r8,0+16(%rsp)
+ leaq 0-0(%rsp),%rsi
+ cmovzq %r9,%r12
+ movq %r9,0+24(%rsp)
+ movq %r14,%r9
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+.byte 102,72,15,126,203
+.byte 102,72,15,126,207
+ call __ecp_nistz256_sub_fromq
+
+ leaq 160+56(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpoint_doubleq_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_point_double,.-ecp_nistz256_point_double
+.globl ecp_nistz256_point_add
+.hidden ecp_nistz256_point_add
+.type ecp_nistz256_point_add,@function
+.align 32
+ecp_nistz256_point_add:
+.cfi_startproc
+_CET_ENDBR
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ je .Lpoint_addx
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $576+8,%rsp
+.cfi_adjust_cfa_offset 32*18+8
+.Lpoint_addq_body:
+
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqu 64(%rsi),%xmm4
+ movdqu 80(%rsi),%xmm5
+ movq %rsi,%rbx
+ movq %rdx,%rsi
+ movdqa %xmm0,384(%rsp)
+ movdqa %xmm1,384+16(%rsp)
+ movdqa %xmm2,416(%rsp)
+ movdqa %xmm3,416+16(%rsp)
+ movdqa %xmm4,448(%rsp)
+ movdqa %xmm5,448+16(%rsp)
+ por %xmm4,%xmm5
+
+ movdqu 0(%rsi),%xmm0
+ pshufd $0xb1,%xmm5,%xmm3
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ por %xmm3,%xmm5
+ movdqu 48(%rsi),%xmm3
+ movq 64+0(%rsi),%rax
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ movdqa %xmm0,480(%rsp)
+ pshufd $0x1e,%xmm5,%xmm4
+ movdqa %xmm1,480+16(%rsp)
+ movdqu 64(%rsi),%xmm0
+ movdqu 80(%rsi),%xmm1
+ movdqa %xmm2,512(%rsp)
+ movdqa %xmm3,512+16(%rsp)
+ por %xmm4,%xmm5
+ pxor %xmm4,%xmm4
+ por %xmm0,%xmm1
+.byte 102,72,15,110,199
+
+ leaq 64-0(%rsi),%rsi
+ movq %rax,544+0(%rsp)
+ movq %r14,544+8(%rsp)
+ movq %r15,544+16(%rsp)
+ movq %r8,544+24(%rsp)
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ pcmpeqd %xmm4,%xmm5
+ pshufd $0xb1,%xmm1,%xmm4
+ por %xmm1,%xmm4
+ pshufd $0,%xmm5,%xmm5
+ pshufd $0x1e,%xmm4,%xmm3
+ por %xmm3,%xmm4
+ pxor %xmm3,%xmm3
+ pcmpeqd %xmm3,%xmm4
+ pshufd $0,%xmm4,%xmm4
+ movq 64+0(%rbx),%rax
+ movq 64+8(%rbx),%r14
+ movq 64+16(%rbx),%r15
+ movq 64+24(%rbx),%r8
+.byte 102,72,15,110,203
+
+ leaq 64-0(%rbx),%rsi
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 544(%rsp),%rax
+ leaq 544(%rsp),%rbx
+ movq 0+96(%rsp),%r9
+ movq 8+96(%rsp),%r10
+ leaq 0+96(%rsp),%rsi
+ movq 16+96(%rsp),%r11
+ movq 24+96(%rsp),%r12
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 448(%rsp),%rax
+ leaq 448(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 416(%rsp),%rax
+ leaq 416(%rsp),%rbx
+ movq 0+224(%rsp),%r9
+ movq 8+224(%rsp),%r10
+ leaq 0+224(%rsp),%rsi
+ movq 16+224(%rsp),%r11
+ movq 24+224(%rsp),%r12
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 512(%rsp),%rax
+ leaq 512(%rsp),%rbx
+ movq 0+256(%rsp),%r9
+ movq 8+256(%rsp),%r10
+ leaq 0+256(%rsp),%rsi
+ movq 16+256(%rsp),%r11
+ movq 24+256(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 224(%rsp),%rbx
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ orq %r13,%r12
+ movdqa %xmm4,%xmm2
+ orq %r8,%r12
+ orq %r9,%r12
+ por %xmm5,%xmm2
+.byte 102,73,15,110,220
+
+ movq 384(%rsp),%rax
+ leaq 384(%rsp),%rbx
+ movq 0+96(%rsp),%r9
+ movq 8+96(%rsp),%r10
+ leaq 0+96(%rsp),%rsi
+ movq 16+96(%rsp),%r11
+ movq 24+96(%rsp),%r12
+ leaq 160(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 480(%rsp),%rax
+ leaq 480(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 160(%rsp),%rbx
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ orq %r13,%r12
+ orq %r8,%r12
+ orq %r9,%r12
+
+.byte 102,73,15,126,208
+.byte 102,73,15,126,217
+ orq %r8,%r12
+.byte 0x3e
+ jnz .Ladd_proceedq
+
+
+
+ testq %r9,%r9
+ jz .Ladd_doubleq
+
+
+
+
+
+
+.byte 102,72,15,126,199
+ pxor %xmm0,%xmm0
+ movdqu %xmm0,0(%rdi)
+ movdqu %xmm0,16(%rdi)
+ movdqu %xmm0,32(%rdi)
+ movdqu %xmm0,48(%rdi)
+ movdqu %xmm0,64(%rdi)
+ movdqu %xmm0,80(%rdi)
+ jmp .Ladd_doneq
+
+.align 32
+.Ladd_doubleq:
+.byte 102,72,15,126,206
+.byte 102,72,15,126,199
+ addq $416,%rsp
+.cfi_adjust_cfa_offset -416
+ jmp .Lpoint_double_shortcutq
+.cfi_adjust_cfa_offset 416
+
+.align 32
+.Ladd_proceedq:
+ movq 0+64(%rsp),%rax
+ movq 8+64(%rsp),%r14
+ leaq 0+64(%rsp),%rsi
+ movq 16+64(%rsp),%r15
+ movq 24+64(%rsp),%r8
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 448(%rsp),%rax
+ leaq 448(%rsp),%rbx
+ movq 0+0(%rsp),%r9
+ movq 8+0(%rsp),%r10
+ leaq 0+0(%rsp),%rsi
+ movq 16+0(%rsp),%r11
+ movq 24+0(%rsp),%r12
+ leaq 352(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 0+0(%rsp),%rax
+ movq 8+0(%rsp),%r14
+ leaq 0+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 544(%rsp),%rax
+ leaq 544(%rsp),%rbx
+ movq 0+352(%rsp),%r9
+ movq 8+352(%rsp),%r10
+ leaq 0+352(%rsp),%rsi
+ movq 16+352(%rsp),%r11
+ movq 24+352(%rsp),%r12
+ leaq 352(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 0(%rsp),%rax
+ leaq 0(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 160(%rsp),%rax
+ leaq 160(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+
+
+
+ xorq %r11,%r11
+ addq %r12,%r12
+ leaq 96(%rsp),%rsi
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ movq 0(%rsi),%rax
+ cmovcq %rbp,%r13
+ movq 8(%rsi),%rbp
+ cmovcq %rcx,%r8
+ movq 16(%rsi),%rcx
+ cmovcq %r10,%r9
+ movq 24(%rsi),%r10
+
+ call __ecp_nistz256_subq
+
+ leaq 128(%rsp),%rbx
+ leaq 288(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ movq 192+0(%rsp),%rax
+ movq 192+8(%rsp),%rbp
+ movq 192+16(%rsp),%rcx
+ movq 192+24(%rsp),%r10
+ leaq 320(%rsp),%rdi
+
+ call __ecp_nistz256_subq
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+ movq 128(%rsp),%rax
+ leaq 128(%rsp),%rbx
+ movq 0+224(%rsp),%r9
+ movq 8+224(%rsp),%r10
+ leaq 0+224(%rsp),%rsi
+ movq 16+224(%rsp),%r11
+ movq 24+224(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 320(%rsp),%rax
+ leaq 320(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq 0+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 320(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 256(%rsp),%rbx
+ leaq 320(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+.byte 102,72,15,126,199
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 352(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 352+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 544(%rsp),%xmm2
+ pand 544+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 448(%rsp),%xmm2
+ pand 448+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,64(%rdi)
+ movdqu %xmm3,80(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 288(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 288+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 480(%rsp),%xmm2
+ pand 480+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 384(%rsp),%xmm2
+ pand 384+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 320(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 320+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 512(%rsp),%xmm2
+ pand 512+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 416(%rsp),%xmm2
+ pand 416+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+
+.Ladd_doneq:
+ leaq 576+56(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpoint_addq_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_point_add,.-ecp_nistz256_point_add
+.globl ecp_nistz256_point_add_affine
+.hidden ecp_nistz256_point_add_affine
+.type ecp_nistz256_point_add_affine,@function
+.align 32
+ecp_nistz256_point_add_affine:
+.cfi_startproc
+_CET_ENDBR
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ je .Lpoint_add_affinex
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $480+8,%rsp
+.cfi_adjust_cfa_offset 32*15+8
+.Ladd_affineq_body:
+
+ movdqu 0(%rsi),%xmm0
+ movq %rdx,%rbx
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqu 64(%rsi),%xmm4
+ movdqu 80(%rsi),%xmm5
+ movq 64+0(%rsi),%rax
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ movdqa %xmm0,320(%rsp)
+ movdqa %xmm1,320+16(%rsp)
+ movdqa %xmm2,352(%rsp)
+ movdqa %xmm3,352+16(%rsp)
+ movdqa %xmm4,384(%rsp)
+ movdqa %xmm5,384+16(%rsp)
+ por %xmm4,%xmm5
+
+ movdqu 0(%rbx),%xmm0
+ pshufd $0xb1,%xmm5,%xmm3
+ movdqu 16(%rbx),%xmm1
+ movdqu 32(%rbx),%xmm2
+ por %xmm3,%xmm5
+ movdqu 48(%rbx),%xmm3
+ movdqa %xmm0,416(%rsp)
+ pshufd $0x1e,%xmm5,%xmm4
+ movdqa %xmm1,416+16(%rsp)
+ por %xmm0,%xmm1
+.byte 102,72,15,110,199
+ movdqa %xmm2,448(%rsp)
+ movdqa %xmm3,448+16(%rsp)
+ por %xmm2,%xmm3
+ por %xmm4,%xmm5
+ pxor %xmm4,%xmm4
+ por %xmm1,%xmm3
+
+ leaq 64-0(%rsi),%rsi
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ pcmpeqd %xmm4,%xmm5
+ pshufd $0xb1,%xmm3,%xmm4
+ movq 0(%rbx),%rax
+
+ movq %r12,%r9
+ por %xmm3,%xmm4
+ pshufd $0,%xmm5,%xmm5
+ pshufd $0x1e,%xmm4,%xmm3
+ movq %r13,%r10
+ por %xmm3,%xmm4
+ pxor %xmm3,%xmm3
+ movq %r14,%r11
+ pcmpeqd %xmm3,%xmm4
+ pshufd $0,%xmm4,%xmm4
+
+ leaq 32-0(%rsp),%rsi
+ movq %r15,%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 320(%rsp),%rbx
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ movq 384(%rsp),%rax
+ leaq 384(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 384(%rsp),%rax
+ leaq 384(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq 0+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 288(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 448(%rsp),%rax
+ leaq 448(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 352(%rsp),%rbx
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ movq 0+64(%rsp),%rax
+ movq 8+64(%rsp),%r14
+ leaq 0+64(%rsp),%rsi
+ movq 16+64(%rsp),%r15
+ movq 24+64(%rsp),%r8
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 0+96(%rsp),%rax
+ movq 8+96(%rsp),%r14
+ leaq 0+96(%rsp),%rsi
+ movq 16+96(%rsp),%r15
+ movq 24+96(%rsp),%r8
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 128(%rsp),%rax
+ leaq 128(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq 0+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 160(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 320(%rsp),%rax
+ leaq 320(%rsp),%rbx
+ movq 0+128(%rsp),%r9
+ movq 8+128(%rsp),%r10
+ leaq 0+128(%rsp),%rsi
+ movq 16+128(%rsp),%r11
+ movq 24+128(%rsp),%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+
+
+
+ xorq %r11,%r11
+ addq %r12,%r12
+ leaq 192(%rsp),%rsi
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ movq 0(%rsi),%rax
+ cmovcq %rbp,%r13
+ movq 8(%rsi),%rbp
+ cmovcq %rcx,%r8
+ movq 16(%rsi),%rcx
+ cmovcq %r10,%r9
+ movq 24(%rsi),%r10
+
+ call __ecp_nistz256_subq
+
+ leaq 160(%rsp),%rbx
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ movq 0+0(%rsp),%rax
+ movq 0+8(%rsp),%rbp
+ movq 0+16(%rsp),%rcx
+ movq 0+24(%rsp),%r10
+ leaq 64(%rsp),%rdi
+
+ call __ecp_nistz256_subq
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+ movq 352(%rsp),%rax
+ leaq 352(%rsp),%rbx
+ movq 0+160(%rsp),%r9
+ movq 8+160(%rsp),%r10
+ leaq 0+160(%rsp),%rsi
+ movq 16+160(%rsp),%r11
+ movq 24+160(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 96(%rsp),%rax
+ leaq 96(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq 0+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 32(%rsp),%rbx
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+.byte 102,72,15,126,199
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 288(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 288+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand .LONE_mont(%rip),%xmm2
+ pand .LONE_mont+16(%rip),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 384(%rsp),%xmm2
+ pand 384+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,64(%rdi)
+ movdqu %xmm3,80(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 224(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 224+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 416(%rsp),%xmm2
+ pand 416+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 320(%rsp),%xmm2
+ pand 320+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 256(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 256+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 448(%rsp),%xmm2
+ pand 448+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 352(%rsp),%xmm2
+ pand 352+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+
+ leaq 480+56(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Ladd_affineq_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
+.type __ecp_nistz256_add_tox,@function
+.align 32
+__ecp_nistz256_add_tox:
+.cfi_startproc
+ xorq %r11,%r11
+ adcq 0(%rbx),%r12
+ adcq 8(%rbx),%r13
+ movq %r12,%rax
+ adcq 16(%rbx),%r8
+ adcq 24(%rbx),%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ xorq %r10,%r10
+ sbbq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ ret
+.cfi_endproc
+.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
+
+.type __ecp_nistz256_sub_fromx,@function
+.align 32
+__ecp_nistz256_sub_fromx:
+.cfi_startproc
+ xorq %r11,%r11
+ sbbq 0(%rbx),%r12
+ sbbq 8(%rbx),%r13
+ movq %r12,%rax
+ sbbq 16(%rbx),%r8
+ sbbq 24(%rbx),%r9
+ movq %r13,%rbp
+ sbbq $0,%r11
+
+ xorq %r10,%r10
+ adcq $-1,%r12
+ movq %r8,%rcx
+ adcq %r14,%r13
+ adcq $0,%r8
+ movq %r9,%r10
+ adcq %r15,%r9
+
+ btq $0,%r11
+ cmovncq %rax,%r12
+ cmovncq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovncq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovncq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ ret
+.cfi_endproc
+.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
+
+.type __ecp_nistz256_subx,@function
+.align 32
+__ecp_nistz256_subx:
+.cfi_startproc
+ xorq %r11,%r11
+ sbbq %r12,%rax
+ sbbq %r13,%rbp
+ movq %rax,%r12
+ sbbq %r8,%rcx
+ sbbq %r9,%r10
+ movq %rbp,%r13
+ sbbq $0,%r11
+
+ xorq %r9,%r9
+ adcq $-1,%rax
+ movq %rcx,%r8
+ adcq %r14,%rbp
+ adcq $0,%rcx
+ movq %r10,%r9
+ adcq %r15,%r10
+
+ btq $0,%r11
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ cmovcq %rcx,%r8
+ cmovcq %r10,%r9
+
+ ret
+.cfi_endproc
+.size __ecp_nistz256_subx,.-__ecp_nistz256_subx
+
+.type __ecp_nistz256_mul_by_2x,@function
+.align 32
+__ecp_nistz256_mul_by_2x:
+.cfi_startproc
+ xorq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ xorq %r10,%r10
+ sbbq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ ret
+.cfi_endproc
+.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
+.type ecp_nistz256_point_doublex,@function
+.align 32
+ecp_nistz256_point_doublex:
+.cfi_startproc
+.Lpoint_doublex:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $160+8,%rsp
+.cfi_adjust_cfa_offset 32*5+8
+.Lpoint_doublex_body:
+
+.Lpoint_double_shortcutx:
+ movdqu 0(%rsi),%xmm0
+ movq %rsi,%rbx
+ movdqu 16(%rsi),%xmm1
+ movq 32+0(%rsi),%r12
+ movq 32+8(%rsi),%r13
+ movq 32+16(%rsi),%r8
+ movq 32+24(%rsi),%r9
+ movq .Lpoly+8(%rip),%r14
+ movq .Lpoly+24(%rip),%r15
+ movdqa %xmm0,96(%rsp)
+ movdqa %xmm1,96+16(%rsp)
+ leaq 32(%rdi),%r10
+ leaq 64(%rdi),%r11
+.byte 102,72,15,110,199
+.byte 102,73,15,110,202
+.byte 102,73,15,110,211
+
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2x
+
+ movq 64+0(%rsi),%rdx
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ leaq 64-128(%rsi),%rsi
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 0+0(%rsp),%rdx
+ movq 8+0(%rsp),%r14
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 32(%rbx),%rdx
+ movq 64+0(%rbx),%r9
+ movq 64+8(%rbx),%r10
+ movq 64+16(%rbx),%r11
+ movq 64+24(%rbx),%r12
+ leaq 64-128(%rbx),%rsi
+ leaq 32(%rbx),%rbx
+.byte 102,72,15,126,215
+ call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_by_2x
+
+ movq 96+0(%rsp),%r12
+ movq 96+8(%rsp),%r13
+ leaq 64(%rsp),%rbx
+ movq 96+16(%rsp),%r8
+ movq 96+24(%rsp),%r9
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_add_tox
+
+ movq 96+0(%rsp),%r12
+ movq 96+8(%rsp),%r13
+ leaq 64(%rsp),%rbx
+ movq 96+16(%rsp),%r8
+ movq 96+24(%rsp),%r9
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+0(%rsp),%rdx
+ movq 8+0(%rsp),%r14
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+.byte 102,72,15,126,207
+ call __ecp_nistz256_sqr_montx
+ xorq %r9,%r9
+ movq %r12,%rax
+ addq $-1,%r12
+ movq %r13,%r10
+ adcq %rsi,%r13
+ movq %r14,%rcx
+ adcq $0,%r14
+ movq %r15,%r8
+ adcq %rbp,%r15
+ adcq $0,%r9
+ xorq %rsi,%rsi
+ testq $1,%rax
+
+ cmovzq %rax,%r12
+ cmovzq %r10,%r13
+ cmovzq %rcx,%r14
+ cmovzq %r8,%r15
+ cmovzq %rsi,%r9
+
+ movq %r13,%rax
+ shrq $1,%r12
+ shlq $63,%rax
+ movq %r14,%r10
+ shrq $1,%r13
+ orq %rax,%r12
+ shlq $63,%r10
+ movq %r15,%rcx
+ shrq $1,%r14
+ orq %r10,%r13
+ shlq $63,%rcx
+ movq %r12,0(%rdi)
+ shrq $1,%r15
+ movq %r13,8(%rdi)
+ shlq $63,%r9
+ orq %rcx,%r14
+ orq %r9,%r15
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ movq 64(%rsp),%rdx
+ leaq 64(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2x
+
+ leaq 32(%rsp),%rbx
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_add_tox
+
+ movq 96(%rsp),%rdx
+ leaq 96(%rsp),%rbx
+ movq 0+0(%rsp),%r9
+ movq 8+0(%rsp),%r10
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r11
+ movq 24+0(%rsp),%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2x
+
+ movq 0+32(%rsp),%rdx
+ movq 8+32(%rsp),%r14
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r15
+ movq 24+32(%rsp),%r8
+.byte 102,72,15,126,199
+ call __ecp_nistz256_sqr_montx
+
+ leaq 128(%rsp),%rbx
+ movq %r14,%r8
+ movq %r15,%r9
+ movq %rsi,%r14
+ movq %rbp,%r15
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+0(%rsp),%rax
+ movq 0+8(%rsp),%rbp
+ movq 0+16(%rsp),%rcx
+ movq 0+24(%rsp),%r10
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_subx
+
+ movq 32(%rsp),%rdx
+ leaq 32(%rsp),%rbx
+ movq %r12,%r14
+ xorl %ecx,%ecx
+ movq %r12,0+0(%rsp)
+ movq %r13,%r10
+ movq %r13,0+8(%rsp)
+ cmovzq %r8,%r11
+ movq %r8,0+16(%rsp)
+ leaq 0-128(%rsp),%rsi
+ cmovzq %r9,%r12
+ movq %r9,0+24(%rsp)
+ movq %r14,%r9
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+.byte 102,72,15,126,203
+.byte 102,72,15,126,207
+ call __ecp_nistz256_sub_fromx
+
+ leaq 160+56(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpoint_doublex_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex
+.type ecp_nistz256_point_addx,@function
+.align 32
+ecp_nistz256_point_addx:
+.cfi_startproc
+.Lpoint_addx:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $576+8,%rsp
+.cfi_adjust_cfa_offset 32*18+8
+.Lpoint_addx_body:
+
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqu 64(%rsi),%xmm4
+ movdqu 80(%rsi),%xmm5
+ movq %rsi,%rbx
+ movq %rdx,%rsi
+ movdqa %xmm0,384(%rsp)
+ movdqa %xmm1,384+16(%rsp)
+ movdqa %xmm2,416(%rsp)
+ movdqa %xmm3,416+16(%rsp)
+ movdqa %xmm4,448(%rsp)
+ movdqa %xmm5,448+16(%rsp)
+ por %xmm4,%xmm5
+
+ movdqu 0(%rsi),%xmm0
+ pshufd $0xb1,%xmm5,%xmm3
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ por %xmm3,%xmm5
+ movdqu 48(%rsi),%xmm3
+ movq 64+0(%rsi),%rdx
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ movdqa %xmm0,480(%rsp)
+ pshufd $0x1e,%xmm5,%xmm4
+ movdqa %xmm1,480+16(%rsp)
+ movdqu 64(%rsi),%xmm0
+ movdqu 80(%rsi),%xmm1
+ movdqa %xmm2,512(%rsp)
+ movdqa %xmm3,512+16(%rsp)
+ por %xmm4,%xmm5
+ pxor %xmm4,%xmm4
+ por %xmm0,%xmm1
+.byte 102,72,15,110,199
+
+ leaq 64-128(%rsi),%rsi
+ movq %rdx,544+0(%rsp)
+ movq %r14,544+8(%rsp)
+ movq %r15,544+16(%rsp)
+ movq %r8,544+24(%rsp)
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ pcmpeqd %xmm4,%xmm5
+ pshufd $0xb1,%xmm1,%xmm4
+ por %xmm1,%xmm4
+ pshufd $0,%xmm5,%xmm5
+ pshufd $0x1e,%xmm4,%xmm3
+ por %xmm3,%xmm4
+ pxor %xmm3,%xmm3
+ pcmpeqd %xmm3,%xmm4
+ pshufd $0,%xmm4,%xmm4
+ movq 64+0(%rbx),%rdx
+ movq 64+8(%rbx),%r14
+ movq 64+16(%rbx),%r15
+ movq 64+24(%rbx),%r8
+.byte 102,72,15,110,203
+
+ leaq 64-128(%rbx),%rsi
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 544(%rsp),%rdx
+ leaq 544(%rsp),%rbx
+ movq 0+96(%rsp),%r9
+ movq 8+96(%rsp),%r10
+ leaq -128+96(%rsp),%rsi
+ movq 16+96(%rsp),%r11
+ movq 24+96(%rsp),%r12
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 448(%rsp),%rdx
+ leaq 448(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 416(%rsp),%rdx
+ leaq 416(%rsp),%rbx
+ movq 0+224(%rsp),%r9
+ movq 8+224(%rsp),%r10
+ leaq -128+224(%rsp),%rsi
+ movq 16+224(%rsp),%r11
+ movq 24+224(%rsp),%r12
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 512(%rsp),%rdx
+ leaq 512(%rsp),%rbx
+ movq 0+256(%rsp),%r9
+ movq 8+256(%rsp),%r10
+ leaq -128+256(%rsp),%rsi
+ movq 16+256(%rsp),%r11
+ movq 24+256(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 224(%rsp),%rbx
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ orq %r13,%r12
+ movdqa %xmm4,%xmm2
+ orq %r8,%r12
+ orq %r9,%r12
+ por %xmm5,%xmm2
+.byte 102,73,15,110,220
+
+ movq 384(%rsp),%rdx
+ leaq 384(%rsp),%rbx
+ movq 0+96(%rsp),%r9
+ movq 8+96(%rsp),%r10
+ leaq -128+96(%rsp),%rsi
+ movq 16+96(%rsp),%r11
+ movq 24+96(%rsp),%r12
+ leaq 160(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 480(%rsp),%rdx
+ leaq 480(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 160(%rsp),%rbx
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ orq %r13,%r12
+ orq %r8,%r12
+ orq %r9,%r12
+
+.byte 102,73,15,126,208
+.byte 102,73,15,126,217
+ orq %r8,%r12
+.byte 0x3e
+ jnz .Ladd_proceedx
+
+
+
+ testq %r9,%r9
+ jz .Ladd_doublex
+
+
+
+
+
+
+.byte 102,72,15,126,199
+ pxor %xmm0,%xmm0
+ movdqu %xmm0,0(%rdi)
+ movdqu %xmm0,16(%rdi)
+ movdqu %xmm0,32(%rdi)
+ movdqu %xmm0,48(%rdi)
+ movdqu %xmm0,64(%rdi)
+ movdqu %xmm0,80(%rdi)
+ jmp .Ladd_donex
+
+.align 32
+.Ladd_doublex:
+.byte 102,72,15,126,206
+.byte 102,72,15,126,199
+ addq $416,%rsp
+.cfi_adjust_cfa_offset -416
+ jmp .Lpoint_double_shortcutx
+.cfi_adjust_cfa_offset 416
+
+.align 32
+.Ladd_proceedx:
+ movq 0+64(%rsp),%rdx
+ movq 8+64(%rsp),%r14
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r15
+ movq 24+64(%rsp),%r8
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 448(%rsp),%rdx
+ leaq 448(%rsp),%rbx
+ movq 0+0(%rsp),%r9
+ movq 8+0(%rsp),%r10
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r11
+ movq 24+0(%rsp),%r12
+ leaq 352(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 0+0(%rsp),%rdx
+ movq 8+0(%rsp),%r14
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 544(%rsp),%rdx
+ leaq 544(%rsp),%rbx
+ movq 0+352(%rsp),%r9
+ movq 8+352(%rsp),%r10
+ leaq -128+352(%rsp),%rsi
+ movq 16+352(%rsp),%r11
+ movq 24+352(%rsp),%r12
+ leaq 352(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 0(%rsp),%rdx
+ leaq 0(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 160(%rsp),%rdx
+ leaq 160(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+
+
+
+ xorq %r11,%r11
+ addq %r12,%r12
+ leaq 96(%rsp),%rsi
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ movq 0(%rsi),%rax
+ cmovcq %rbp,%r13
+ movq 8(%rsi),%rbp
+ cmovcq %rcx,%r8
+ movq 16(%rsi),%rcx
+ cmovcq %r10,%r9
+ movq 24(%rsi),%r10
+
+ call __ecp_nistz256_subx
+
+ leaq 128(%rsp),%rbx
+ leaq 288(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 192+0(%rsp),%rax
+ movq 192+8(%rsp),%rbp
+ movq 192+16(%rsp),%rcx
+ movq 192+24(%rsp),%r10
+ leaq 320(%rsp),%rdi
+
+ call __ecp_nistz256_subx
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+ movq 128(%rsp),%rdx
+ leaq 128(%rsp),%rbx
+ movq 0+224(%rsp),%r9
+ movq 8+224(%rsp),%r10
+ leaq -128+224(%rsp),%rsi
+ movq 16+224(%rsp),%r11
+ movq 24+224(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 320(%rsp),%rdx
+ leaq 320(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 320(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 256(%rsp),%rbx
+ leaq 320(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+.byte 102,72,15,126,199
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 352(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 352+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 544(%rsp),%xmm2
+ pand 544+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 448(%rsp),%xmm2
+ pand 448+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,64(%rdi)
+ movdqu %xmm3,80(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 288(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 288+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 480(%rsp),%xmm2
+ pand 480+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 384(%rsp),%xmm2
+ pand 384+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 320(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 320+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 512(%rsp),%xmm2
+ pand 512+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 416(%rsp),%xmm2
+ pand 416+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+
+.Ladd_donex:
+ leaq 576+56(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpoint_addx_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_point_addx,.-ecp_nistz256_point_addx
+.type ecp_nistz256_point_add_affinex,@function
+.align 32
+ecp_nistz256_point_add_affinex:
+.cfi_startproc
+.Lpoint_add_affinex:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $480+8,%rsp
+.cfi_adjust_cfa_offset 32*15+8
+.Ladd_affinex_body:
+
+ movdqu 0(%rsi),%xmm0
+ movq %rdx,%rbx
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqu 64(%rsi),%xmm4
+ movdqu 80(%rsi),%xmm5
+ movq 64+0(%rsi),%rdx
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ movdqa %xmm0,320(%rsp)
+ movdqa %xmm1,320+16(%rsp)
+ movdqa %xmm2,352(%rsp)
+ movdqa %xmm3,352+16(%rsp)
+ movdqa %xmm4,384(%rsp)
+ movdqa %xmm5,384+16(%rsp)
+ por %xmm4,%xmm5
+
+ movdqu 0(%rbx),%xmm0
+ pshufd $0xb1,%xmm5,%xmm3
+ movdqu 16(%rbx),%xmm1
+ movdqu 32(%rbx),%xmm2
+ por %xmm3,%xmm5
+ movdqu 48(%rbx),%xmm3
+ movdqa %xmm0,416(%rsp)
+ pshufd $0x1e,%xmm5,%xmm4
+ movdqa %xmm1,416+16(%rsp)
+ por %xmm0,%xmm1
+.byte 102,72,15,110,199
+ movdqa %xmm2,448(%rsp)
+ movdqa %xmm3,448+16(%rsp)
+ por %xmm2,%xmm3
+ por %xmm4,%xmm5
+ pxor %xmm4,%xmm4
+ por %xmm1,%xmm3
+
+ leaq 64-128(%rsi),%rsi
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ pcmpeqd %xmm4,%xmm5
+ pshufd $0xb1,%xmm3,%xmm4
+ movq 0(%rbx),%rdx
+
+ movq %r12,%r9
+ por %xmm3,%xmm4
+ pshufd $0,%xmm5,%xmm5
+ pshufd $0x1e,%xmm4,%xmm3
+ movq %r13,%r10
+ por %xmm3,%xmm4
+ pxor %xmm3,%xmm3
+ movq %r14,%r11
+ pcmpeqd %xmm3,%xmm4
+ pshufd $0,%xmm4,%xmm4
+
+ leaq 32-128(%rsp),%rsi
+ movq %r15,%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 320(%rsp),%rbx
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 384(%rsp),%rdx
+ leaq 384(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 384(%rsp),%rdx
+ leaq 384(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 288(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 448(%rsp),%rdx
+ leaq 448(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 352(%rsp),%rbx
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+64(%rsp),%rdx
+ movq 8+64(%rsp),%r14
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r15
+ movq 24+64(%rsp),%r8
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 0+96(%rsp),%rdx
+ movq 8+96(%rsp),%r14
+ leaq -128+96(%rsp),%rsi
+ movq 16+96(%rsp),%r15
+ movq 24+96(%rsp),%r8
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 128(%rsp),%rdx
+ leaq 128(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 160(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 320(%rsp),%rdx
+ leaq 320(%rsp),%rbx
+ movq 0+128(%rsp),%r9
+ movq 8+128(%rsp),%r10
+ leaq -128+128(%rsp),%rsi
+ movq 16+128(%rsp),%r11
+ movq 24+128(%rsp),%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+
+
+
+ xorq %r11,%r11
+ addq %r12,%r12
+ leaq 192(%rsp),%rsi
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ movq 0(%rsi),%rax
+ cmovcq %rbp,%r13
+ movq 8(%rsi),%rbp
+ cmovcq %rcx,%r8
+ movq 16(%rsi),%rcx
+ cmovcq %r10,%r9
+ movq 24(%rsi),%r10
+
+ call __ecp_nistz256_subx
+
+ leaq 160(%rsp),%rbx
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+0(%rsp),%rax
+ movq 0+8(%rsp),%rbp
+ movq 0+16(%rsp),%rcx
+ movq 0+24(%rsp),%r10
+ leaq 64(%rsp),%rdi
+
+ call __ecp_nistz256_subx
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+ movq 352(%rsp),%rdx
+ leaq 352(%rsp),%rbx
+ movq 0+160(%rsp),%r9
+ movq 8+160(%rsp),%r10
+ leaq -128+160(%rsp),%rsi
+ movq 16+160(%rsp),%r11
+ movq 24+160(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 96(%rsp),%rdx
+ leaq 96(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 32(%rsp),%rbx
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+.byte 102,72,15,126,199
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 288(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 288+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand .LONE_mont(%rip),%xmm2
+ pand .LONE_mont+16(%rip),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 384(%rsp),%xmm2
+ pand 384+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,64(%rdi)
+ movdqu %xmm3,80(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 224(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 224+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 416(%rsp),%xmm2
+ pand 416+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 320(%rsp),%xmm2
+ pand 320+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 256(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 256+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 448(%rsp),%xmm2
+ pand 448+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 352(%rsp),%xmm2
+ pand 352+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+
+ leaq 480+56(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Ladd_affinex_epilogue:
+ ret
+.cfi_endproc
+.size ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex
+#endif
diff --git a/gen/bcm/p256-x86_64-asm-win.asm b/gen/bcm/p256-x86_64-asm-win.asm
new file mode 100644
index 0000000..c25cac3
--- /dev/null
+++ b/gen/bcm/p256-x86_64-asm-win.asm
@@ -0,0 +1,5004 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .text code align=64
+
+EXTERN OPENSSL_ia32cap_P
+
+
+section .rdata rdata align=8
+ALIGN 64
+$L$poly:
+ DQ 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
+
+$L$One:
+ DD 1,1,1,1,1,1,1,1
+$L$Two:
+ DD 2,2,2,2,2,2,2,2
+$L$Three:
+ DD 3,3,3,3,3,3,3,3
+$L$ONE_mont:
+ DQ 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
+
+
+$L$ord:
+ DQ 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
+$L$ordK:
+ DQ 0xccd1c8aaee00bc4f
+section .text
+
+
+
+
+global ecp_nistz256_neg
+
+ALIGN 32
+ecp_nistz256_neg:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_ecp_nistz256_neg:
+ mov rdi,rcx
+ mov rsi,rdx
+
+
+
+_CET_ENDBR
+ push r12
+
+ push r13
+
+$L$neg_body:
+
+ xor r8,r8
+ xor r9,r9
+ xor r10,r10
+ xor r11,r11
+ xor r13,r13
+
+ sub r8,QWORD[rsi]
+ sbb r9,QWORD[8+rsi]
+ sbb r10,QWORD[16+rsi]
+ mov rax,r8
+ sbb r11,QWORD[24+rsi]
+ lea rsi,[$L$poly]
+ mov rdx,r9
+ sbb r13,0
+
+ add r8,QWORD[rsi]
+ mov rcx,r10
+ adc r9,QWORD[8+rsi]
+ adc r10,QWORD[16+rsi]
+ mov r12,r11
+ adc r11,QWORD[24+rsi]
+ test r13,r13
+
+ cmovz r8,rax
+ cmovz r9,rdx
+ mov QWORD[rdi],r8
+ cmovz r10,rcx
+ mov QWORD[8+rdi],r9
+ cmovz r11,r12
+ mov QWORD[16+rdi],r10
+ mov QWORD[24+rdi],r11
+
+ mov r13,QWORD[rsp]
+
+ mov r12,QWORD[8+rsp]
+
+ lea rsp,[16+rsp]
+
+$L$neg_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_ecp_nistz256_neg:
+
+
+
+
+
+
+global ecp_nistz256_ord_mul_mont
+
+ALIGN 32
+ecp_nistz256_ord_mul_mont:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_ecp_nistz256_ord_mul_mont:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ lea rcx,[OPENSSL_ia32cap_P]
+ mov rcx,QWORD[8+rcx]
+ and ecx,0x80100
+ cmp ecx,0x80100
+ je NEAR $L$ecp_nistz256_ord_mul_montx
+ push rbp
+
+ push rbx
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+$L$ord_mul_body:
+
+ mov rax,QWORD[rdx]
+ mov rbx,rdx
+ lea r14,[$L$ord]
+ mov r15,QWORD[$L$ordK]
+
+
+ mov rcx,rax
+ mul QWORD[rsi]
+ mov r8,rax
+ mov rax,rcx
+ mov r9,rdx
+
+ mul QWORD[8+rsi]
+ add r9,rax
+ mov rax,rcx
+ adc rdx,0
+ mov r10,rdx
+
+ mul QWORD[16+rsi]
+ add r10,rax
+ mov rax,rcx
+ adc rdx,0
+
+ mov r13,r8
+ imul r8,r15
+
+ mov r11,rdx
+ mul QWORD[24+rsi]
+ add r11,rax
+ mov rax,r8
+ adc rdx,0
+ mov r12,rdx
+
+
+ mul QWORD[r14]
+ mov rbp,r8
+ add r13,rax
+ mov rax,r8
+ adc rdx,0
+ mov rcx,rdx
+
+ sub r10,r8
+ sbb r8,0
+
+ mul QWORD[8+r14]
+ add r9,rcx
+ adc rdx,0
+ add r9,rax
+ mov rax,rbp
+ adc r10,rdx
+ mov rdx,rbp
+ adc r8,0
+
+ shl rax,32
+ shr rdx,32
+ sub r11,rax
+ mov rax,QWORD[8+rbx]
+ sbb rbp,rdx
+
+ add r11,r8
+ adc r12,rbp
+ adc r13,0
+
+
+ mov rcx,rax
+ mul QWORD[rsi]
+ add r9,rax
+ mov rax,rcx
+ adc rdx,0
+ mov rbp,rdx
+
+ mul QWORD[8+rsi]
+ add r10,rbp
+ adc rdx,0
+ add r10,rax
+ mov rax,rcx
+ adc rdx,0
+ mov rbp,rdx
+
+ mul QWORD[16+rsi]
+ add r11,rbp
+ adc rdx,0
+ add r11,rax
+ mov rax,rcx
+ adc rdx,0
+
+ mov rcx,r9
+ imul r9,r15
+
+ mov rbp,rdx
+ mul QWORD[24+rsi]
+ add r12,rbp
+ adc rdx,0
+ xor r8,r8
+ add r12,rax
+ mov rax,r9
+ adc r13,rdx
+ adc r8,0
+
+
+ mul QWORD[r14]
+ mov rbp,r9
+ add rcx,rax
+ mov rax,r9
+ adc rcx,rdx
+
+ sub r11,r9
+ sbb r9,0
+
+ mul QWORD[8+r14]
+ add r10,rcx
+ adc rdx,0
+ add r10,rax
+ mov rax,rbp
+ adc r11,rdx
+ mov rdx,rbp
+ adc r9,0
+
+ shl rax,32
+ shr rdx,32
+ sub r12,rax
+ mov rax,QWORD[16+rbx]
+ sbb rbp,rdx
+
+ add r12,r9
+ adc r13,rbp
+ adc r8,0
+
+
+ mov rcx,rax
+ mul QWORD[rsi]
+ add r10,rax
+ mov rax,rcx
+ adc rdx,0
+ mov rbp,rdx
+
+ mul QWORD[8+rsi]
+ add r11,rbp
+ adc rdx,0
+ add r11,rax
+ mov rax,rcx
+ adc rdx,0
+ mov rbp,rdx
+
+ mul QWORD[16+rsi]
+ add r12,rbp
+ adc rdx,0
+ add r12,rax
+ mov rax,rcx
+ adc rdx,0
+
+ mov rcx,r10
+ imul r10,r15
+
+ mov rbp,rdx
+ mul QWORD[24+rsi]
+ add r13,rbp
+ adc rdx,0
+ xor r9,r9
+ add r13,rax
+ mov rax,r10
+ adc r8,rdx
+ adc r9,0
+
+
+ mul QWORD[r14]
+ mov rbp,r10
+ add rcx,rax
+ mov rax,r10
+ adc rcx,rdx
+
+ sub r12,r10
+ sbb r10,0
+
+ mul QWORD[8+r14]
+ add r11,rcx
+ adc rdx,0
+ add r11,rax
+ mov rax,rbp
+ adc r12,rdx
+ mov rdx,rbp
+ adc r10,0
+
+ shl rax,32
+ shr rdx,32
+ sub r13,rax
+ mov rax,QWORD[24+rbx]
+ sbb rbp,rdx
+
+ add r13,r10
+ adc r8,rbp
+ adc r9,0
+
+
+ mov rcx,rax
+ mul QWORD[rsi]
+ add r11,rax
+ mov rax,rcx
+ adc rdx,0
+ mov rbp,rdx
+
+ mul QWORD[8+rsi]
+ add r12,rbp
+ adc rdx,0
+ add r12,rax
+ mov rax,rcx
+ adc rdx,0
+ mov rbp,rdx
+
+ mul QWORD[16+rsi]
+ add r13,rbp
+ adc rdx,0
+ add r13,rax
+ mov rax,rcx
+ adc rdx,0
+
+ mov rcx,r11
+ imul r11,r15
+
+ mov rbp,rdx
+ mul QWORD[24+rsi]
+ add r8,rbp
+ adc rdx,0
+ xor r10,r10
+ add r8,rax
+ mov rax,r11
+ adc r9,rdx
+ adc r10,0
+
+
+ mul QWORD[r14]
+ mov rbp,r11
+ add rcx,rax
+ mov rax,r11
+ adc rcx,rdx
+
+ sub r13,r11
+ sbb r11,0
+
+ mul QWORD[8+r14]
+ add r12,rcx
+ adc rdx,0
+ add r12,rax
+ mov rax,rbp
+ adc r13,rdx
+ mov rdx,rbp
+ adc r11,0
+
+ shl rax,32
+ shr rdx,32
+ sub r8,rax
+ sbb rbp,rdx
+
+ add r8,r11
+ adc r9,rbp
+ adc r10,0
+
+
+ mov rsi,r12
+ sub r12,QWORD[r14]
+ mov r11,r13
+ sbb r13,QWORD[8+r14]
+ mov rcx,r8
+ sbb r8,QWORD[16+r14]
+ mov rbp,r9
+ sbb r9,QWORD[24+r14]
+ sbb r10,0
+
+ cmovc r12,rsi
+ cmovc r13,r11
+ cmovc r8,rcx
+ cmovc r9,rbp
+
+ mov QWORD[rdi],r12
+ mov QWORD[8+rdi],r13
+ mov QWORD[16+rdi],r8
+ mov QWORD[24+rdi],r9
+
+ mov r15,QWORD[rsp]
+
+ mov r14,QWORD[8+rsp]
+
+ mov r13,QWORD[16+rsp]
+
+ mov r12,QWORD[24+rsp]
+
+ mov rbx,QWORD[32+rsp]
+
+ mov rbp,QWORD[40+rsp]
+
+ lea rsp,[48+rsp]
+
+$L$ord_mul_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_ecp_nistz256_ord_mul_mont:
+
+
+
+
+
+
+
+global ecp_nistz256_ord_sqr_mont
+
+ALIGN 32
+ecp_nistz256_ord_sqr_mont:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_ecp_nistz256_ord_sqr_mont:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ lea rcx,[OPENSSL_ia32cap_P]
+ mov rcx,QWORD[8+rcx]
+ and ecx,0x80100
+ cmp ecx,0x80100
+ je NEAR $L$ecp_nistz256_ord_sqr_montx
+ push rbp
+
+ push rbx
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+$L$ord_sqr_body:
+
+ mov r8,QWORD[rsi]
+ mov rax,QWORD[8+rsi]
+ mov r14,QWORD[16+rsi]
+ mov r15,QWORD[24+rsi]
+ lea rsi,[$L$ord]
+ mov rbx,rdx
+ jmp NEAR $L$oop_ord_sqr
+
+ALIGN 32
+$L$oop_ord_sqr:
+
+ mov rbp,rax
+ mul r8
+ mov r9,rax
+DB 102,72,15,110,205
+ mov rax,r14
+ mov r10,rdx
+
+ mul r8
+ add r10,rax
+ mov rax,r15
+DB 102,73,15,110,214
+ adc rdx,0
+ mov r11,rdx
+
+ mul r8
+ add r11,rax
+ mov rax,r15
+DB 102,73,15,110,223
+ adc rdx,0
+ mov r12,rdx
+
+
+ mul r14
+ mov r13,rax
+ mov rax,r14
+ mov r14,rdx
+
+
+ mul rbp
+ add r11,rax
+ mov rax,r15
+ adc rdx,0
+ mov r15,rdx
+
+ mul rbp
+ add r12,rax
+ adc rdx,0
+
+ add r12,r15
+ adc r13,rdx
+ adc r14,0
+
+
+ xor r15,r15
+ mov rax,r8
+ add r9,r9
+ adc r10,r10
+ adc r11,r11
+ adc r12,r12
+ adc r13,r13
+ adc r14,r14
+ adc r15,0
+
+
+ mul rax
+ mov r8,rax
+DB 102,72,15,126,200
+ mov rbp,rdx
+
+ mul rax
+ add r9,rbp
+ adc r10,rax
+DB 102,72,15,126,208
+ adc rdx,0
+ mov rbp,rdx
+
+ mul rax
+ add r11,rbp
+ adc r12,rax
+DB 102,72,15,126,216
+ adc rdx,0
+ mov rbp,rdx
+
+ mov rcx,r8
+ imul r8,QWORD[32+rsi]
+
+ mul rax
+ add r13,rbp
+ adc r14,rax
+ mov rax,QWORD[rsi]
+ adc r15,rdx
+
+
+ mul r8
+ mov rbp,r8
+ add rcx,rax
+ mov rax,QWORD[8+rsi]
+ adc rcx,rdx
+
+ sub r10,r8
+ sbb rbp,0
+
+ mul r8
+ add r9,rcx
+ adc rdx,0
+ add r9,rax
+ mov rax,r8
+ adc r10,rdx
+ mov rdx,r8
+ adc rbp,0
+
+ mov rcx,r9
+ imul r9,QWORD[32+rsi]
+
+ shl rax,32
+ shr rdx,32
+ sub r11,rax
+ mov rax,QWORD[rsi]
+ sbb r8,rdx
+
+ add r11,rbp
+ adc r8,0
+
+
+ mul r9
+ mov rbp,r9
+ add rcx,rax
+ mov rax,QWORD[8+rsi]
+ adc rcx,rdx
+
+ sub r11,r9
+ sbb rbp,0
+
+ mul r9
+ add r10,rcx
+ adc rdx,0
+ add r10,rax
+ mov rax,r9
+ adc r11,rdx
+ mov rdx,r9
+ adc rbp,0
+
+ mov rcx,r10
+ imul r10,QWORD[32+rsi]
+
+ shl rax,32
+ shr rdx,32
+ sub r8,rax
+ mov rax,QWORD[rsi]
+ sbb r9,rdx
+
+ add r8,rbp
+ adc r9,0
+
+
+ mul r10
+ mov rbp,r10
+ add rcx,rax
+ mov rax,QWORD[8+rsi]
+ adc rcx,rdx
+
+ sub r8,r10
+ sbb rbp,0
+
+ mul r10
+ add r11,rcx
+ adc rdx,0
+ add r11,rax
+ mov rax,r10
+ adc r8,rdx
+ mov rdx,r10
+ adc rbp,0
+
+ mov rcx,r11
+ imul r11,QWORD[32+rsi]
+
+ shl rax,32
+ shr rdx,32
+ sub r9,rax
+ mov rax,QWORD[rsi]
+ sbb r10,rdx
+
+ add r9,rbp
+ adc r10,0
+
+
+ mul r11
+ mov rbp,r11
+ add rcx,rax
+ mov rax,QWORD[8+rsi]
+ adc rcx,rdx
+
+ sub r9,r11
+ sbb rbp,0
+
+ mul r11
+ add r8,rcx
+ adc rdx,0
+ add r8,rax
+ mov rax,r11
+ adc r9,rdx
+ mov rdx,r11
+ adc rbp,0
+
+ shl rax,32
+ shr rdx,32
+ sub r10,rax
+ sbb r11,rdx
+
+ add r10,rbp
+ adc r11,0
+
+
+ xor rdx,rdx
+ add r8,r12
+ adc r9,r13
+ mov r12,r8
+ adc r10,r14
+ adc r11,r15
+ mov rax,r9
+ adc rdx,0
+
+
+ sub r8,QWORD[rsi]
+ mov r14,r10
+ sbb r9,QWORD[8+rsi]
+ sbb r10,QWORD[16+rsi]
+ mov r15,r11
+ sbb r11,QWORD[24+rsi]
+ sbb rdx,0
+
+ cmovc r8,r12
+ cmovnc rax,r9
+ cmovnc r14,r10
+ cmovnc r15,r11
+
+ dec rbx
+ jnz NEAR $L$oop_ord_sqr
+
+ mov QWORD[rdi],r8
+ mov QWORD[8+rdi],rax
+ pxor xmm1,xmm1
+ mov QWORD[16+rdi],r14
+ pxor xmm2,xmm2
+ mov QWORD[24+rdi],r15
+ pxor xmm3,xmm3
+
+ mov r15,QWORD[rsp]
+
+ mov r14,QWORD[8+rsp]
+
+ mov r13,QWORD[16+rsp]
+
+ mov r12,QWORD[24+rsp]
+
+ mov rbx,QWORD[32+rsp]
+
+ mov rbp,QWORD[40+rsp]
+
+ lea rsp,[48+rsp]
+
+$L$ord_sqr_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_ecp_nistz256_ord_sqr_mont:
+
+
+ALIGN 32
+ecp_nistz256_ord_mul_montx:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_ecp_nistz256_ord_mul_montx:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+$L$ecp_nistz256_ord_mul_montx:
+ push rbp
+
+ push rbx
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+$L$ord_mulx_body:
+
+ mov rbx,rdx
+ mov rdx,QWORD[rdx]
+ mov r9,QWORD[rsi]
+ mov r10,QWORD[8+rsi]
+ mov r11,QWORD[16+rsi]
+ mov r12,QWORD[24+rsi]
+ lea rsi,[((-128))+rsi]
+ lea r14,[(($L$ord-128))]
+ mov r15,QWORD[$L$ordK]
+
+
+ mulx r9,r8,r9
+ mulx r10,rcx,r10
+ mulx r11,rbp,r11
+ add r9,rcx
+ mulx r12,rcx,r12
+ mov rdx,r8
+ mulx rax,rdx,r15
+ adc r10,rbp
+ adc r11,rcx
+ adc r12,0
+
+
+ xor r13,r13
+ mulx rbp,rcx,QWORD[((0+128))+r14]
+ adcx r8,rcx
+ adox r9,rbp
+
+ mulx rbp,rcx,QWORD[((8+128))+r14]
+ adcx r9,rcx
+ adox r10,rbp
+
+ mulx rbp,rcx,QWORD[((16+128))+r14]
+ adcx r10,rcx
+ adox r11,rbp
+
+ mulx rbp,rcx,QWORD[((24+128))+r14]
+ mov rdx,QWORD[8+rbx]
+ adcx r11,rcx
+ adox r12,rbp
+ adcx r12,r8
+ adox r13,r8
+ adc r13,0
+
+
+ mulx rbp,rcx,QWORD[((0+128))+rsi]
+ adcx r9,rcx
+ adox r10,rbp
+
+ mulx rbp,rcx,QWORD[((8+128))+rsi]
+ adcx r10,rcx
+ adox r11,rbp
+
+ mulx rbp,rcx,QWORD[((16+128))+rsi]
+ adcx r11,rcx
+ adox r12,rbp
+
+ mulx rbp,rcx,QWORD[((24+128))+rsi]
+ mov rdx,r9
+ mulx rax,rdx,r15
+ adcx r12,rcx
+ adox r13,rbp
+
+ adcx r13,r8
+ adox r8,r8
+ adc r8,0
+
+
+ mulx rbp,rcx,QWORD[((0+128))+r14]
+ adcx r9,rcx
+ adox r10,rbp
+
+ mulx rbp,rcx,QWORD[((8+128))+r14]
+ adcx r10,rcx
+ adox r11,rbp
+
+ mulx rbp,rcx,QWORD[((16+128))+r14]
+ adcx r11,rcx
+ adox r12,rbp
+
+ mulx rbp,rcx,QWORD[((24+128))+r14]
+ mov rdx,QWORD[16+rbx]
+ adcx r12,rcx
+ adox r13,rbp
+ adcx r13,r9
+ adox r8,r9
+ adc r8,0
+
+
+ mulx rbp,rcx,QWORD[((0+128))+rsi]
+ adcx r10,rcx
+ adox r11,rbp
+
+ mulx rbp,rcx,QWORD[((8+128))+rsi]
+ adcx r11,rcx
+ adox r12,rbp
+
+ mulx rbp,rcx,QWORD[((16+128))+rsi]
+ adcx r12,rcx
+ adox r13,rbp
+
+ mulx rbp,rcx,QWORD[((24+128))+rsi]
+ mov rdx,r10
+ mulx rax,rdx,r15
+ adcx r13,rcx
+ adox r8,rbp
+
+ adcx r8,r9
+ adox r9,r9
+ adc r9,0
+
+
+ mulx rbp,rcx,QWORD[((0+128))+r14]
+ adcx r10,rcx
+ adox r11,rbp
+
+ mulx rbp,rcx,QWORD[((8+128))+r14]
+ adcx r11,rcx
+ adox r12,rbp
+
+ mulx rbp,rcx,QWORD[((16+128))+r14]
+ adcx r12,rcx
+ adox r13,rbp
+
+ mulx rbp,rcx,QWORD[((24+128))+r14]
+ mov rdx,QWORD[24+rbx]
+ adcx r13,rcx
+ adox r8,rbp
+ adcx r8,r10
+ adox r9,r10
+ adc r9,0
+
+
+ mulx rbp,rcx,QWORD[((0+128))+rsi]
+ adcx r11,rcx
+ adox r12,rbp
+
+ mulx rbp,rcx,QWORD[((8+128))+rsi]
+ adcx r12,rcx
+ adox r13,rbp
+
+ mulx rbp,rcx,QWORD[((16+128))+rsi]
+ adcx r13,rcx
+ adox r8,rbp
+
+ mulx rbp,rcx,QWORD[((24+128))+rsi]
+ mov rdx,r11
+ mulx rax,rdx,r15
+ adcx r8,rcx
+ adox r9,rbp
+
+ adcx r9,r10
+ adox r10,r10
+ adc r10,0
+
+
+ mulx rbp,rcx,QWORD[((0+128))+r14]
+ adcx r11,rcx
+ adox r12,rbp
+
+ mulx rbp,rcx,QWORD[((8+128))+r14]
+ adcx r12,rcx
+ adox r13,rbp
+
+ mulx rbp,rcx,QWORD[((16+128))+r14]
+ adcx r13,rcx
+ adox r8,rbp
+
+ mulx rbp,rcx,QWORD[((24+128))+r14]
+ lea r14,[128+r14]
+ mov rbx,r12
+ adcx r8,rcx
+ adox r9,rbp
+ mov rdx,r13
+ adcx r9,r11
+ adox r10,r11
+ adc r10,0
+
+
+
+ mov rcx,r8
+ sub r12,QWORD[r14]
+ sbb r13,QWORD[8+r14]
+ sbb r8,QWORD[16+r14]
+ mov rbp,r9
+ sbb r9,QWORD[24+r14]
+ sbb r10,0
+
+ cmovc r12,rbx
+ cmovc r13,rdx
+ cmovc r8,rcx
+ cmovc r9,rbp
+
+ mov QWORD[rdi],r12
+ mov QWORD[8+rdi],r13
+ mov QWORD[16+rdi],r8
+ mov QWORD[24+rdi],r9
+
+ mov r15,QWORD[rsp]
+
+ mov r14,QWORD[8+rsp]
+
+ mov r13,QWORD[16+rsp]
+
+ mov r12,QWORD[24+rsp]
+
+ mov rbx,QWORD[32+rsp]
+
+ mov rbp,QWORD[40+rsp]
+
+ lea rsp,[48+rsp]
+
+$L$ord_mulx_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_ecp_nistz256_ord_mul_montx:
+
+
+ALIGN 32
+ecp_nistz256_ord_sqr_montx:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_ecp_nistz256_ord_sqr_montx:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+$L$ecp_nistz256_ord_sqr_montx:
+ push rbp
+
+ push rbx
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+$L$ord_sqrx_body:
+
+ mov rbx,rdx
+ mov rdx,QWORD[rsi]
+ mov r14,QWORD[8+rsi]
+ mov r15,QWORD[16+rsi]
+ mov r8,QWORD[24+rsi]
+ lea rsi,[$L$ord]
+ jmp NEAR $L$oop_ord_sqrx
+
+ALIGN 32
+$L$oop_ord_sqrx:
+ mulx r10,r9,r14
+ mulx r11,rcx,r15
+ mov rax,rdx
+DB 102,73,15,110,206
+ mulx r12,rbp,r8
+ mov rdx,r14
+ add r10,rcx
+DB 102,73,15,110,215
+ adc r11,rbp
+ adc r12,0
+ xor r13,r13
+
+ mulx rbp,rcx,r15
+ adcx r11,rcx
+ adox r12,rbp
+
+ mulx rbp,rcx,r8
+ mov rdx,r15
+ adcx r12,rcx
+ adox r13,rbp
+ adc r13,0
+
+ mulx r14,rcx,r8
+ mov rdx,rax
+DB 102,73,15,110,216
+ xor r15,r15
+ adcx r9,r9
+ adox r13,rcx
+ adcx r10,r10
+ adox r14,r15
+
+
+ mulx rbp,r8,rdx
+DB 102,72,15,126,202
+ adcx r11,r11
+ adox r9,rbp
+ adcx r12,r12
+ mulx rax,rcx,rdx
+DB 102,72,15,126,210
+ adcx r13,r13
+ adox r10,rcx
+ adcx r14,r14
+ mulx rbp,rcx,rdx
+ DB 0x67
+DB 102,72,15,126,218
+ adox r11,rax
+ adcx r15,r15
+ adox r12,rcx
+ adox r13,rbp
+ mulx rax,rcx,rdx
+ adox r14,rcx
+ adox r15,rax
+
+
+ mov rdx,r8
+ mulx rcx,rdx,QWORD[32+rsi]
+
+ xor rax,rax
+ mulx rbp,rcx,QWORD[rsi]
+ adcx r8,rcx
+ adox r9,rbp
+ mulx rbp,rcx,QWORD[8+rsi]
+ adcx r9,rcx
+ adox r10,rbp
+ mulx rbp,rcx,QWORD[16+rsi]
+ adcx r10,rcx
+ adox r11,rbp
+ mulx rbp,rcx,QWORD[24+rsi]
+ adcx r11,rcx
+ adox r8,rbp
+ adcx r8,rax
+
+
+ mov rdx,r9
+ mulx rcx,rdx,QWORD[32+rsi]
+
+ mulx rbp,rcx,QWORD[rsi]
+ adox r9,rcx
+ adcx r10,rbp
+ mulx rbp,rcx,QWORD[8+rsi]
+ adox r10,rcx
+ adcx r11,rbp
+ mulx rbp,rcx,QWORD[16+rsi]
+ adox r11,rcx
+ adcx r8,rbp
+ mulx rbp,rcx,QWORD[24+rsi]
+ adox r8,rcx
+ adcx r9,rbp
+ adox r9,rax
+
+
+ mov rdx,r10
+ mulx rcx,rdx,QWORD[32+rsi]
+
+ mulx rbp,rcx,QWORD[rsi]
+ adcx r10,rcx
+ adox r11,rbp
+ mulx rbp,rcx,QWORD[8+rsi]
+ adcx r11,rcx
+ adox r8,rbp
+ mulx rbp,rcx,QWORD[16+rsi]
+ adcx r8,rcx
+ adox r9,rbp
+ mulx rbp,rcx,QWORD[24+rsi]
+ adcx r9,rcx
+ adox r10,rbp
+ adcx r10,rax
+
+
+ mov rdx,r11
+ mulx rcx,rdx,QWORD[32+rsi]
+
+ mulx rbp,rcx,QWORD[rsi]
+ adox r11,rcx
+ adcx r8,rbp
+ mulx rbp,rcx,QWORD[8+rsi]
+ adox r8,rcx
+ adcx r9,rbp
+ mulx rbp,rcx,QWORD[16+rsi]
+ adox r9,rcx
+ adcx r10,rbp
+ mulx rbp,rcx,QWORD[24+rsi]
+ adox r10,rcx
+ adcx r11,rbp
+ adox r11,rax
+
+
+ add r12,r8
+ adc r9,r13
+ mov rdx,r12
+ adc r10,r14
+ adc r11,r15
+ mov r14,r9
+ adc rax,0
+
+
+ sub r12,QWORD[rsi]
+ mov r15,r10
+ sbb r9,QWORD[8+rsi]
+ sbb r10,QWORD[16+rsi]
+ mov r8,r11
+ sbb r11,QWORD[24+rsi]
+ sbb rax,0
+
+ cmovnc rdx,r12
+ cmovnc r14,r9
+ cmovnc r15,r10
+ cmovnc r8,r11
+
+ dec rbx
+ jnz NEAR $L$oop_ord_sqrx
+
+ mov QWORD[rdi],rdx
+ mov QWORD[8+rdi],r14
+ pxor xmm1,xmm1
+ mov QWORD[16+rdi],r15
+ pxor xmm2,xmm2
+ mov QWORD[24+rdi],r8
+ pxor xmm3,xmm3
+
+ mov r15,QWORD[rsp]
+
+ mov r14,QWORD[8+rsp]
+
+ mov r13,QWORD[16+rsp]
+
+ mov r12,QWORD[24+rsp]
+
+ mov rbx,QWORD[32+rsp]
+
+ mov rbp,QWORD[40+rsp]
+
+ lea rsp,[48+rsp]
+
+$L$ord_sqrx_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_ecp_nistz256_ord_sqr_montx:
+
+
+
+
+
+
+global ecp_nistz256_mul_mont
+
+ALIGN 32
+ecp_nistz256_mul_mont:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_ecp_nistz256_mul_mont:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ lea rcx,[OPENSSL_ia32cap_P]
+ mov rcx,QWORD[8+rcx]
+ and ecx,0x80100
+$L$mul_mont:
+ push rbp
+
+ push rbx
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+$L$mul_body:
+ cmp ecx,0x80100
+ je NEAR $L$mul_montx
+ mov rbx,rdx
+ mov rax,QWORD[rdx]
+ mov r9,QWORD[rsi]
+ mov r10,QWORD[8+rsi]
+ mov r11,QWORD[16+rsi]
+ mov r12,QWORD[24+rsi]
+
+ call __ecp_nistz256_mul_montq
+ jmp NEAR $L$mul_mont_done
+
+ALIGN 32
+$L$mul_montx:
+ mov rbx,rdx
+ mov rdx,QWORD[rdx]
+ mov r9,QWORD[rsi]
+ mov r10,QWORD[8+rsi]
+ mov r11,QWORD[16+rsi]
+ mov r12,QWORD[24+rsi]
+ lea rsi,[((-128))+rsi]
+
+ call __ecp_nistz256_mul_montx
+$L$mul_mont_done:
+ mov r15,QWORD[rsp]
+
+ mov r14,QWORD[8+rsp]
+
+ mov r13,QWORD[16+rsp]
+
+ mov r12,QWORD[24+rsp]
+
+ mov rbx,QWORD[32+rsp]
+
+ mov rbp,QWORD[40+rsp]
+
+ lea rsp,[48+rsp]
+
+$L$mul_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_ecp_nistz256_mul_mont:
+
+
+ALIGN 32
+__ecp_nistz256_mul_montq:
+
+
+
+ mov rbp,rax
+ mul r9
+ mov r14,QWORD[(($L$poly+8))]
+ mov r8,rax
+ mov rax,rbp
+ mov r9,rdx
+
+ mul r10
+ mov r15,QWORD[(($L$poly+24))]
+ add r9,rax
+ mov rax,rbp
+ adc rdx,0
+ mov r10,rdx
+
+ mul r11
+ add r10,rax
+ mov rax,rbp
+ adc rdx,0
+ mov r11,rdx
+
+ mul r12
+ add r11,rax
+ mov rax,r8
+ adc rdx,0
+ xor r13,r13
+ mov r12,rdx
+
+
+
+
+
+
+
+
+
+
+ mov rbp,r8
+ shl r8,32
+ mul r15
+ shr rbp,32
+ add r9,r8
+ adc r10,rbp
+ adc r11,rax
+ mov rax,QWORD[8+rbx]
+ adc r12,rdx
+ adc r13,0
+ xor r8,r8
+
+
+
+ mov rbp,rax
+ mul QWORD[rsi]
+ add r9,rax
+ mov rax,rbp
+ adc rdx,0
+ mov rcx,rdx
+
+ mul QWORD[8+rsi]
+ add r10,rcx
+ adc rdx,0
+ add r10,rax
+ mov rax,rbp
+ adc rdx,0
+ mov rcx,rdx
+
+ mul QWORD[16+rsi]
+ add r11,rcx
+ adc rdx,0
+ add r11,rax
+ mov rax,rbp
+ adc rdx,0
+ mov rcx,rdx
+
+ mul QWORD[24+rsi]
+ add r12,rcx
+ adc rdx,0
+ add r12,rax
+ mov rax,r9
+ adc r13,rdx
+ adc r8,0
+
+
+
+ mov rbp,r9
+ shl r9,32
+ mul r15
+ shr rbp,32
+ add r10,r9
+ adc r11,rbp
+ adc r12,rax
+ mov rax,QWORD[16+rbx]
+ adc r13,rdx
+ adc r8,0
+ xor r9,r9
+
+
+
+ mov rbp,rax
+ mul QWORD[rsi]
+ add r10,rax
+ mov rax,rbp
+ adc rdx,0
+ mov rcx,rdx
+
+ mul QWORD[8+rsi]
+ add r11,rcx
+ adc rdx,0
+ add r11,rax
+ mov rax,rbp
+ adc rdx,0
+ mov rcx,rdx
+
+ mul QWORD[16+rsi]
+ add r12,rcx
+ adc rdx,0
+ add r12,rax
+ mov rax,rbp
+ adc rdx,0
+ mov rcx,rdx
+
+ mul QWORD[24+rsi]
+ add r13,rcx
+ adc rdx,0
+ add r13,rax
+ mov rax,r10
+ adc r8,rdx
+ adc r9,0
+
+
+
+ mov rbp,r10
+ shl r10,32
+ mul r15
+ shr rbp,32
+ add r11,r10
+ adc r12,rbp
+ adc r13,rax
+ mov rax,QWORD[24+rbx]
+ adc r8,rdx
+ adc r9,0
+ xor r10,r10
+
+
+
+ mov rbp,rax
+ mul QWORD[rsi]
+ add r11,rax
+ mov rax,rbp
+ adc rdx,0
+ mov rcx,rdx
+
+ mul QWORD[8+rsi]
+ add r12,rcx
+ adc rdx,0
+ add r12,rax
+ mov rax,rbp
+ adc rdx,0
+ mov rcx,rdx
+
+ mul QWORD[16+rsi]
+ add r13,rcx
+ adc rdx,0
+ add r13,rax
+ mov rax,rbp
+ adc rdx,0
+ mov rcx,rdx
+
+ mul QWORD[24+rsi]
+ add r8,rcx
+ adc rdx,0
+ add r8,rax
+ mov rax,r11
+ adc r9,rdx
+ adc r10,0
+
+
+
+ mov rbp,r11
+ shl r11,32
+ mul r15
+ shr rbp,32
+ add r12,r11
+ adc r13,rbp
+ mov rcx,r12
+ adc r8,rax
+ adc r9,rdx
+ mov rbp,r13
+ adc r10,0
+
+
+
+ sub r12,-1
+ mov rbx,r8
+ sbb r13,r14
+ sbb r8,0
+ mov rdx,r9
+ sbb r9,r15
+ sbb r10,0
+
+ cmovc r12,rcx
+ cmovc r13,rbp
+ mov QWORD[rdi],r12
+ cmovc r8,rbx
+ mov QWORD[8+rdi],r13
+ cmovc r9,rdx
+ mov QWORD[16+rdi],r8
+ mov QWORD[24+rdi],r9
+
+ ret
+
+
+
+
+
+
+
+
+
+
+global ecp_nistz256_sqr_mont
+
+ALIGN 32
+ecp_nistz256_sqr_mont:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_ecp_nistz256_sqr_mont:
+ mov rdi,rcx
+ mov rsi,rdx
+
+
+
+_CET_ENDBR
+ lea rcx,[OPENSSL_ia32cap_P]
+ mov rcx,QWORD[8+rcx]
+ and ecx,0x80100
+ push rbp
+
+ push rbx
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+$L$sqr_body:
+ cmp ecx,0x80100
+ je NEAR $L$sqr_montx
+ mov rax,QWORD[rsi]
+ mov r14,QWORD[8+rsi]
+ mov r15,QWORD[16+rsi]
+ mov r8,QWORD[24+rsi]
+
+ call __ecp_nistz256_sqr_montq
+ jmp NEAR $L$sqr_mont_done
+
+ALIGN 32
+$L$sqr_montx:
+ mov rdx,QWORD[rsi]
+ mov r14,QWORD[8+rsi]
+ mov r15,QWORD[16+rsi]
+ mov r8,QWORD[24+rsi]
+ lea rsi,[((-128))+rsi]
+
+ call __ecp_nistz256_sqr_montx
+$L$sqr_mont_done:
+ mov r15,QWORD[rsp]
+
+ mov r14,QWORD[8+rsp]
+
+ mov r13,QWORD[16+rsp]
+
+ mov r12,QWORD[24+rsp]
+
+ mov rbx,QWORD[32+rsp]
+
+ mov rbp,QWORD[40+rsp]
+
+ lea rsp,[48+rsp]
+
+$L$sqr_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_ecp_nistz256_sqr_mont:
+
+
+ALIGN 32
+__ecp_nistz256_sqr_montq:
+
+ mov r13,rax
+ mul r14
+ mov r9,rax
+ mov rax,r15
+ mov r10,rdx
+
+ mul r13
+ add r10,rax
+ mov rax,r8
+ adc rdx,0
+ mov r11,rdx
+
+ mul r13
+ add r11,rax
+ mov rax,r15
+ adc rdx,0
+ mov r12,rdx
+
+
+ mul r14
+ add r11,rax
+ mov rax,r8
+ adc rdx,0
+ mov rbp,rdx
+
+ mul r14
+ add r12,rax
+ mov rax,r8
+ adc rdx,0
+ add r12,rbp
+ mov r13,rdx
+ adc r13,0
+
+
+ mul r15
+ xor r15,r15
+ add r13,rax
+ mov rax,QWORD[rsi]
+ mov r14,rdx
+ adc r14,0
+
+ add r9,r9
+ adc r10,r10
+ adc r11,r11
+ adc r12,r12
+ adc r13,r13
+ adc r14,r14
+ adc r15,0
+
+ mul rax
+ mov r8,rax
+ mov rax,QWORD[8+rsi]
+ mov rcx,rdx
+
+ mul rax
+ add r9,rcx
+ adc r10,rax
+ mov rax,QWORD[16+rsi]
+ adc rdx,0
+ mov rcx,rdx
+
+ mul rax
+ add r11,rcx
+ adc r12,rax
+ mov rax,QWORD[24+rsi]
+ adc rdx,0
+ mov rcx,rdx
+
+ mul rax
+ add r13,rcx
+ adc r14,rax
+ mov rax,r8
+ adc r15,rdx
+
+ mov rsi,QWORD[(($L$poly+8))]
+ mov rbp,QWORD[(($L$poly+24))]
+
+
+
+
+ mov rcx,r8
+ shl r8,32
+ mul rbp
+ shr rcx,32
+ add r9,r8
+ adc r10,rcx
+ adc r11,rax
+ mov rax,r9
+ adc rdx,0
+
+
+
+ mov rcx,r9
+ shl r9,32
+ mov r8,rdx
+ mul rbp
+ shr rcx,32
+ add r10,r9
+ adc r11,rcx
+ adc r8,rax
+ mov rax,r10
+ adc rdx,0
+
+
+
+ mov rcx,r10
+ shl r10,32
+ mov r9,rdx
+ mul rbp
+ shr rcx,32
+ add r11,r10
+ adc r8,rcx
+ adc r9,rax
+ mov rax,r11
+ adc rdx,0
+
+
+
+ mov rcx,r11
+ shl r11,32
+ mov r10,rdx
+ mul rbp
+ shr rcx,32
+ add r8,r11
+ adc r9,rcx
+ adc r10,rax
+ adc rdx,0
+ xor r11,r11
+
+
+
+ add r12,r8
+ adc r13,r9
+ mov r8,r12
+ adc r14,r10
+ adc r15,rdx
+ mov r9,r13
+ adc r11,0
+
+ sub r12,-1
+ mov r10,r14
+ sbb r13,rsi
+ sbb r14,0
+ mov rcx,r15
+ sbb r15,rbp
+ sbb r11,0
+
+ cmovc r12,r8
+ cmovc r13,r9
+ mov QWORD[rdi],r12
+ cmovc r14,r10
+ mov QWORD[8+rdi],r13
+ cmovc r15,rcx
+ mov QWORD[16+rdi],r14
+ mov QWORD[24+rdi],r15
+
+ ret
+
+
+
+ALIGN 32
+__ecp_nistz256_mul_montx:
+
+
+
+ mulx r9,r8,r9
+ mulx r10,rcx,r10
+ mov r14,32
+ xor r13,r13
+ mulx r11,rbp,r11
+ mov r15,QWORD[(($L$poly+24))]
+ adc r9,rcx
+ mulx r12,rcx,r12
+ mov rdx,r8
+ adc r10,rbp
+ shlx rbp,r8,r14
+ adc r11,rcx
+ shrx rcx,r8,r14
+ adc r12,0
+
+
+
+ add r9,rbp
+ adc r10,rcx
+
+ mulx rbp,rcx,r15
+ mov rdx,QWORD[8+rbx]
+ adc r11,rcx
+ adc r12,rbp
+ adc r13,0
+ xor r8,r8
+
+
+
+ mulx rbp,rcx,QWORD[((0+128))+rsi]
+ adcx r9,rcx
+ adox r10,rbp
+
+ mulx rbp,rcx,QWORD[((8+128))+rsi]
+ adcx r10,rcx
+ adox r11,rbp
+
+ mulx rbp,rcx,QWORD[((16+128))+rsi]
+ adcx r11,rcx
+ adox r12,rbp
+
+ mulx rbp,rcx,QWORD[((24+128))+rsi]
+ mov rdx,r9
+ adcx r12,rcx
+ shlx rcx,r9,r14
+ adox r13,rbp
+ shrx rbp,r9,r14
+
+ adcx r13,r8
+ adox r8,r8
+ adc r8,0
+
+
+
+ add r10,rcx
+ adc r11,rbp
+
+ mulx rbp,rcx,r15
+ mov rdx,QWORD[16+rbx]
+ adc r12,rcx
+ adc r13,rbp
+ adc r8,0
+ xor r9,r9
+
+
+
+ mulx rbp,rcx,QWORD[((0+128))+rsi]
+ adcx r10,rcx
+ adox r11,rbp
+
+ mulx rbp,rcx,QWORD[((8+128))+rsi]
+ adcx r11,rcx
+ adox r12,rbp
+
+ mulx rbp,rcx,QWORD[((16+128))+rsi]
+ adcx r12,rcx
+ adox r13,rbp
+
+ mulx rbp,rcx,QWORD[((24+128))+rsi]
+ mov rdx,r10
+ adcx r13,rcx
+ shlx rcx,r10,r14
+ adox r8,rbp
+ shrx rbp,r10,r14
+
+ adcx r8,r9
+ adox r9,r9
+ adc r9,0
+
+
+
+ add r11,rcx
+ adc r12,rbp
+
+ mulx rbp,rcx,r15
+ mov rdx,QWORD[24+rbx]
+ adc r13,rcx
+ adc r8,rbp
+ adc r9,0
+ xor r10,r10
+
+
+
+ mulx rbp,rcx,QWORD[((0+128))+rsi]
+ adcx r11,rcx
+ adox r12,rbp
+
+ mulx rbp,rcx,QWORD[((8+128))+rsi]
+ adcx r12,rcx
+ adox r13,rbp
+
+ mulx rbp,rcx,QWORD[((16+128))+rsi]
+ adcx r13,rcx
+ adox r8,rbp
+
+ mulx rbp,rcx,QWORD[((24+128))+rsi]
+ mov rdx,r11
+ adcx r8,rcx
+ shlx rcx,r11,r14
+ adox r9,rbp
+ shrx rbp,r11,r14
+
+ adcx r9,r10
+ adox r10,r10
+ adc r10,0
+
+
+
+ add r12,rcx
+ adc r13,rbp
+
+ mulx rbp,rcx,r15
+ mov rbx,r12
+ mov r14,QWORD[(($L$poly+8))]
+ adc r8,rcx
+ mov rdx,r13
+ adc r9,rbp
+ adc r10,0
+
+
+
+ xor eax,eax
+ mov rcx,r8
+ sbb r12,-1
+ sbb r13,r14
+ sbb r8,0
+ mov rbp,r9
+ sbb r9,r15
+ sbb r10,0
+
+ cmovc r12,rbx
+ cmovc r13,rdx
+ mov QWORD[rdi],r12
+ cmovc r8,rcx
+ mov QWORD[8+rdi],r13
+ cmovc r9,rbp
+ mov QWORD[16+rdi],r8
+ mov QWORD[24+rdi],r9
+
+ ret
+
+
+
+
+ALIGN 32
+__ecp_nistz256_sqr_montx:
+
+ mulx r10,r9,r14
+ mulx r11,rcx,r15
+ xor eax,eax
+ adc r10,rcx
+ mulx r12,rbp,r8
+ mov rdx,r14
+ adc r11,rbp
+ adc r12,0
+ xor r13,r13
+
+
+ mulx rbp,rcx,r15
+ adcx r11,rcx
+ adox r12,rbp
+
+ mulx rbp,rcx,r8
+ mov rdx,r15
+ adcx r12,rcx
+ adox r13,rbp
+ adc r13,0
+
+
+ mulx r14,rcx,r8
+ mov rdx,QWORD[((0+128))+rsi]
+ xor r15,r15
+ adcx r9,r9
+ adox r13,rcx
+ adcx r10,r10
+ adox r14,r15
+
+ mulx rbp,r8,rdx
+ mov rdx,QWORD[((8+128))+rsi]
+ adcx r11,r11
+ adox r9,rbp
+ adcx r12,r12
+ mulx rax,rcx,rdx
+ mov rdx,QWORD[((16+128))+rsi]
+ adcx r13,r13
+ adox r10,rcx
+ adcx r14,r14
+ DB 0x67
+ mulx rbp,rcx,rdx
+ mov rdx,QWORD[((24+128))+rsi]
+ adox r11,rax
+ adcx r15,r15
+ adox r12,rcx
+ mov rsi,32
+ adox r13,rbp
+ DB 0x67,0x67
+ mulx rax,rcx,rdx
+ mov rdx,QWORD[(($L$poly+24))]
+ adox r14,rcx
+ shlx rcx,r8,rsi
+ adox r15,rax
+ shrx rax,r8,rsi
+ mov rbp,rdx
+
+
+ add r9,rcx
+ adc r10,rax
+
+ mulx r8,rcx,r8
+ adc r11,rcx
+ shlx rcx,r9,rsi
+ adc r8,0
+ shrx rax,r9,rsi
+
+
+ add r10,rcx
+ adc r11,rax
+
+ mulx r9,rcx,r9
+ adc r8,rcx
+ shlx rcx,r10,rsi
+ adc r9,0
+ shrx rax,r10,rsi
+
+
+ add r11,rcx
+ adc r8,rax
+
+ mulx r10,rcx,r10
+ adc r9,rcx
+ shlx rcx,r11,rsi
+ adc r10,0
+ shrx rax,r11,rsi
+
+
+ add r8,rcx
+ adc r9,rax
+
+ mulx r11,rcx,r11
+ adc r10,rcx
+ adc r11,0
+
+ xor rdx,rdx
+ add r12,r8
+ mov rsi,QWORD[(($L$poly+8))]
+ adc r13,r9
+ mov r8,r12
+ adc r14,r10
+ adc r15,r11
+ mov r9,r13
+ adc rdx,0
+
+ sub r12,-1
+ mov r10,r14
+ sbb r13,rsi
+ sbb r14,0
+ mov r11,r15
+ sbb r15,rbp
+ sbb rdx,0
+
+ cmovc r12,r8
+ cmovc r13,r9
+ mov QWORD[rdi],r12
+ cmovc r14,r10
+ mov QWORD[8+rdi],r13
+ cmovc r15,r11
+ mov QWORD[16+rdi],r14
+ mov QWORD[24+rdi],r15
+
+ ret
+
+
+
+
+global ecp_nistz256_select_w5
+
+ALIGN 32
+ecp_nistz256_select_w5:
+
+_CET_ENDBR
+ lea rax,[OPENSSL_ia32cap_P]
+ mov rax,QWORD[8+rax]
+ test eax,32
+ jnz NEAR $L$avx2_select_w5
+ lea rax,[((-136))+rsp]
+$L$SEH_begin_ecp_nistz256_select_w5:
+ DB 0x48,0x8d,0x60,0xe0
+ DB 0x0f,0x29,0x70,0xe0
+ DB 0x0f,0x29,0x78,0xf0
+ DB 0x44,0x0f,0x29,0x00
+ DB 0x44,0x0f,0x29,0x48,0x10
+ DB 0x44,0x0f,0x29,0x50,0x20
+ DB 0x44,0x0f,0x29,0x58,0x30
+ DB 0x44,0x0f,0x29,0x60,0x40
+ DB 0x44,0x0f,0x29,0x68,0x50
+ DB 0x44,0x0f,0x29,0x70,0x60
+ DB 0x44,0x0f,0x29,0x78,0x70
+ movdqa xmm0,XMMWORD[$L$One]
+ movd xmm1,r8d
+
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ pxor xmm6,xmm6
+ pxor xmm7,xmm7
+
+ movdqa xmm8,xmm0
+ pshufd xmm1,xmm1,0
+
+ mov rax,16
+$L$select_loop_sse_w5:
+
+ movdqa xmm15,xmm8
+ paddd xmm8,xmm0
+ pcmpeqd xmm15,xmm1
+
+ movdqa xmm9,XMMWORD[rdx]
+ movdqa xmm10,XMMWORD[16+rdx]
+ movdqa xmm11,XMMWORD[32+rdx]
+ movdqa xmm12,XMMWORD[48+rdx]
+ movdqa xmm13,XMMWORD[64+rdx]
+ movdqa xmm14,XMMWORD[80+rdx]
+ lea rdx,[96+rdx]
+
+ pand xmm9,xmm15
+ pand xmm10,xmm15
+ por xmm2,xmm9
+ pand xmm11,xmm15
+ por xmm3,xmm10
+ pand xmm12,xmm15
+ por xmm4,xmm11
+ pand xmm13,xmm15
+ por xmm5,xmm12
+ pand xmm14,xmm15
+ por xmm6,xmm13
+ por xmm7,xmm14
+
+ dec rax
+ jnz NEAR $L$select_loop_sse_w5
+
+ movdqu XMMWORD[rcx],xmm2
+ movdqu XMMWORD[16+rcx],xmm3
+ movdqu XMMWORD[32+rcx],xmm4
+ movdqu XMMWORD[48+rcx],xmm5
+ movdqu XMMWORD[64+rcx],xmm6
+ movdqu XMMWORD[80+rcx],xmm7
+ movaps xmm6,XMMWORD[rsp]
+ movaps xmm7,XMMWORD[16+rsp]
+ movaps xmm8,XMMWORD[32+rsp]
+ movaps xmm9,XMMWORD[48+rsp]
+ movaps xmm10,XMMWORD[64+rsp]
+ movaps xmm11,XMMWORD[80+rsp]
+ movaps xmm12,XMMWORD[96+rsp]
+ movaps xmm13,XMMWORD[112+rsp]
+ movaps xmm14,XMMWORD[128+rsp]
+ movaps xmm15,XMMWORD[144+rsp]
+ lea rsp,[168+rsp]
+ ret
+
+$L$SEH_end_ecp_nistz256_select_w5:
+
+
+
+
+global ecp_nistz256_select_w7
+
+ALIGN 32
+ecp_nistz256_select_w7:
+
+_CET_ENDBR
+ lea rax,[OPENSSL_ia32cap_P]
+ mov rax,QWORD[8+rax]
+ test eax,32
+ jnz NEAR $L$avx2_select_w7
+ lea rax,[((-136))+rsp]
+$L$SEH_begin_ecp_nistz256_select_w7:
+ DB 0x48,0x8d,0x60,0xe0
+ DB 0x0f,0x29,0x70,0xe0
+ DB 0x0f,0x29,0x78,0xf0
+ DB 0x44,0x0f,0x29,0x00
+ DB 0x44,0x0f,0x29,0x48,0x10
+ DB 0x44,0x0f,0x29,0x50,0x20
+ DB 0x44,0x0f,0x29,0x58,0x30
+ DB 0x44,0x0f,0x29,0x60,0x40
+ DB 0x44,0x0f,0x29,0x68,0x50
+ DB 0x44,0x0f,0x29,0x70,0x60
+ DB 0x44,0x0f,0x29,0x78,0x70
+ movdqa xmm8,XMMWORD[$L$One]
+ movd xmm1,r8d
+
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+
+ movdqa xmm0,xmm8
+ pshufd xmm1,xmm1,0
+ mov rax,64
+
+$L$select_loop_sse_w7:
+ movdqa xmm15,xmm8
+ paddd xmm8,xmm0
+ movdqa xmm9,XMMWORD[rdx]
+ movdqa xmm10,XMMWORD[16+rdx]
+ pcmpeqd xmm15,xmm1
+ movdqa xmm11,XMMWORD[32+rdx]
+ movdqa xmm12,XMMWORD[48+rdx]
+ lea rdx,[64+rdx]
+
+ pand xmm9,xmm15
+ pand xmm10,xmm15
+ por xmm2,xmm9
+ pand xmm11,xmm15
+ por xmm3,xmm10
+ pand xmm12,xmm15
+ por xmm4,xmm11
+ prefetcht0 [255+rdx]
+ por xmm5,xmm12
+
+ dec rax
+ jnz NEAR $L$select_loop_sse_w7
+
+ movdqu XMMWORD[rcx],xmm2
+ movdqu XMMWORD[16+rcx],xmm3
+ movdqu XMMWORD[32+rcx],xmm4
+ movdqu XMMWORD[48+rcx],xmm5
+ movaps xmm6,XMMWORD[rsp]
+ movaps xmm7,XMMWORD[16+rsp]
+ movaps xmm8,XMMWORD[32+rsp]
+ movaps xmm9,XMMWORD[48+rsp]
+ movaps xmm10,XMMWORD[64+rsp]
+ movaps xmm11,XMMWORD[80+rsp]
+ movaps xmm12,XMMWORD[96+rsp]
+ movaps xmm13,XMMWORD[112+rsp]
+ movaps xmm14,XMMWORD[128+rsp]
+ movaps xmm15,XMMWORD[144+rsp]
+ lea rsp,[168+rsp]
+ ret
+
+$L$SEH_end_ecp_nistz256_select_w7:
+
+
+
+
+ALIGN 32
+ecp_nistz256_avx2_select_w5:
+
+$L$avx2_select_w5:
+ vzeroupper
+ lea rax,[((-136))+rsp]
+ mov r11,rsp
+$L$SEH_begin_ecp_nistz256_avx2_select_w5:
+ DB 0x48,0x8d,0x60,0xe0
+ DB 0xc5,0xf8,0x29,0x70,0xe0
+ DB 0xc5,0xf8,0x29,0x78,0xf0
+ DB 0xc5,0x78,0x29,0x40,0x00
+ DB 0xc5,0x78,0x29,0x48,0x10
+ DB 0xc5,0x78,0x29,0x50,0x20
+ DB 0xc5,0x78,0x29,0x58,0x30
+ DB 0xc5,0x78,0x29,0x60,0x40
+ DB 0xc5,0x78,0x29,0x68,0x50
+ DB 0xc5,0x78,0x29,0x70,0x60
+ DB 0xc5,0x78,0x29,0x78,0x70
+ vmovdqa ymm0,YMMWORD[$L$Two]
+
+ vpxor ymm2,ymm2,ymm2
+ vpxor ymm3,ymm3,ymm3
+ vpxor ymm4,ymm4,ymm4
+
+ vmovdqa ymm5,YMMWORD[$L$One]
+ vmovdqa ymm10,YMMWORD[$L$Two]
+
+ vmovd xmm1,r8d
+ vpermd ymm1,ymm2,ymm1
+
+ mov rax,8
+$L$select_loop_avx2_w5:
+
+ vmovdqa ymm6,YMMWORD[rdx]
+ vmovdqa ymm7,YMMWORD[32+rdx]
+ vmovdqa ymm8,YMMWORD[64+rdx]
+
+ vmovdqa ymm11,YMMWORD[96+rdx]
+ vmovdqa ymm12,YMMWORD[128+rdx]
+ vmovdqa ymm13,YMMWORD[160+rdx]
+
+ vpcmpeqd ymm9,ymm5,ymm1
+ vpcmpeqd ymm14,ymm10,ymm1
+
+ vpaddd ymm5,ymm5,ymm0
+ vpaddd ymm10,ymm10,ymm0
+ lea rdx,[192+rdx]
+
+ vpand ymm6,ymm6,ymm9
+ vpand ymm7,ymm7,ymm9
+ vpand ymm8,ymm8,ymm9
+ vpand ymm11,ymm11,ymm14
+ vpand ymm12,ymm12,ymm14
+ vpand ymm13,ymm13,ymm14
+
+ vpxor ymm2,ymm2,ymm6
+ vpxor ymm3,ymm3,ymm7
+ vpxor ymm4,ymm4,ymm8
+ vpxor ymm2,ymm2,ymm11
+ vpxor ymm3,ymm3,ymm12
+ vpxor ymm4,ymm4,ymm13
+
+ dec rax
+ jnz NEAR $L$select_loop_avx2_w5
+
+ vmovdqu YMMWORD[rcx],ymm2
+ vmovdqu YMMWORD[32+rcx],ymm3
+ vmovdqu YMMWORD[64+rcx],ymm4
+ vzeroupper
+ movaps xmm6,XMMWORD[rsp]
+ movaps xmm7,XMMWORD[16+rsp]
+ movaps xmm8,XMMWORD[32+rsp]
+ movaps xmm9,XMMWORD[48+rsp]
+ movaps xmm10,XMMWORD[64+rsp]
+ movaps xmm11,XMMWORD[80+rsp]
+ movaps xmm12,XMMWORD[96+rsp]
+ movaps xmm13,XMMWORD[112+rsp]
+ movaps xmm14,XMMWORD[128+rsp]
+ movaps xmm15,XMMWORD[144+rsp]
+ lea rsp,[r11]
+ ret
+
+$L$SEH_end_ecp_nistz256_avx2_select_w5:
+
+
+
+
+global ecp_nistz256_avx2_select_w7
+
+ALIGN 32
+ecp_nistz256_avx2_select_w7:
+
+$L$avx2_select_w7:
+_CET_ENDBR
+ vzeroupper
+ mov r11,rsp
+ lea rax,[((-136))+rsp]
+$L$SEH_begin_ecp_nistz256_avx2_select_w7:
+ DB 0x48,0x8d,0x60,0xe0
+ DB 0xc5,0xf8,0x29,0x70,0xe0
+ DB 0xc5,0xf8,0x29,0x78,0xf0
+ DB 0xc5,0x78,0x29,0x40,0x00
+ DB 0xc5,0x78,0x29,0x48,0x10
+ DB 0xc5,0x78,0x29,0x50,0x20
+ DB 0xc5,0x78,0x29,0x58,0x30
+ DB 0xc5,0x78,0x29,0x60,0x40
+ DB 0xc5,0x78,0x29,0x68,0x50
+ DB 0xc5,0x78,0x29,0x70,0x60
+ DB 0xc5,0x78,0x29,0x78,0x70
+ vmovdqa ymm0,YMMWORD[$L$Three]
+
+ vpxor ymm2,ymm2,ymm2
+ vpxor ymm3,ymm3,ymm3
+
+ vmovdqa ymm4,YMMWORD[$L$One]
+ vmovdqa ymm8,YMMWORD[$L$Two]
+ vmovdqa ymm12,YMMWORD[$L$Three]
+
+ vmovd xmm1,r8d
+ vpermd ymm1,ymm2,ymm1
+
+
+ mov rax,21
+$L$select_loop_avx2_w7:
+
+ vmovdqa ymm5,YMMWORD[rdx]
+ vmovdqa ymm6,YMMWORD[32+rdx]
+
+ vmovdqa ymm9,YMMWORD[64+rdx]
+ vmovdqa ymm10,YMMWORD[96+rdx]
+
+ vmovdqa ymm13,YMMWORD[128+rdx]
+ vmovdqa ymm14,YMMWORD[160+rdx]
+
+ vpcmpeqd ymm7,ymm4,ymm1
+ vpcmpeqd ymm11,ymm8,ymm1
+ vpcmpeqd ymm15,ymm12,ymm1
+
+ vpaddd ymm4,ymm4,ymm0
+ vpaddd ymm8,ymm8,ymm0
+ vpaddd ymm12,ymm12,ymm0
+ lea rdx,[192+rdx]
+
+ vpand ymm5,ymm5,ymm7
+ vpand ymm6,ymm6,ymm7
+ vpand ymm9,ymm9,ymm11
+ vpand ymm10,ymm10,ymm11
+ vpand ymm13,ymm13,ymm15
+ vpand ymm14,ymm14,ymm15
+
+ vpxor ymm2,ymm2,ymm5
+ vpxor ymm3,ymm3,ymm6
+ vpxor ymm2,ymm2,ymm9
+ vpxor ymm3,ymm3,ymm10
+ vpxor ymm2,ymm2,ymm13
+ vpxor ymm3,ymm3,ymm14
+
+ dec rax
+ jnz NEAR $L$select_loop_avx2_w7
+
+
+ vmovdqa ymm5,YMMWORD[rdx]
+ vmovdqa ymm6,YMMWORD[32+rdx]
+
+ vpcmpeqd ymm7,ymm4,ymm1
+
+ vpand ymm5,ymm5,ymm7
+ vpand ymm6,ymm6,ymm7
+
+ vpxor ymm2,ymm2,ymm5
+ vpxor ymm3,ymm3,ymm6
+
+ vmovdqu YMMWORD[rcx],ymm2
+ vmovdqu YMMWORD[32+rcx],ymm3
+ vzeroupper
+ movaps xmm6,XMMWORD[rsp]
+ movaps xmm7,XMMWORD[16+rsp]
+ movaps xmm8,XMMWORD[32+rsp]
+ movaps xmm9,XMMWORD[48+rsp]
+ movaps xmm10,XMMWORD[64+rsp]
+ movaps xmm11,XMMWORD[80+rsp]
+ movaps xmm12,XMMWORD[96+rsp]
+ movaps xmm13,XMMWORD[112+rsp]
+ movaps xmm14,XMMWORD[128+rsp]
+ movaps xmm15,XMMWORD[144+rsp]
+ lea rsp,[r11]
+ ret
+
+$L$SEH_end_ecp_nistz256_avx2_select_w7:
+
+
+ALIGN 32
+__ecp_nistz256_add_toq:
+
+ xor r11,r11
+ add r12,QWORD[rbx]
+ adc r13,QWORD[8+rbx]
+ mov rax,r12
+ adc r8,QWORD[16+rbx]
+ adc r9,QWORD[24+rbx]
+ mov rbp,r13
+ adc r11,0
+
+ sub r12,-1
+ mov rcx,r8
+ sbb r13,r14
+ sbb r8,0
+ mov r10,r9
+ sbb r9,r15
+ sbb r11,0
+
+ cmovc r12,rax
+ cmovc r13,rbp
+ mov QWORD[rdi],r12
+ cmovc r8,rcx
+ mov QWORD[8+rdi],r13
+ cmovc r9,r10
+ mov QWORD[16+rdi],r8
+ mov QWORD[24+rdi],r9
+
+ ret
+
+
+
+
+ALIGN 32
+__ecp_nistz256_sub_fromq:
+
+ sub r12,QWORD[rbx]
+ sbb r13,QWORD[8+rbx]
+ mov rax,r12
+ sbb r8,QWORD[16+rbx]
+ sbb r9,QWORD[24+rbx]
+ mov rbp,r13
+ sbb r11,r11
+
+ add r12,-1
+ mov rcx,r8
+ adc r13,r14
+ adc r8,0
+ mov r10,r9
+ adc r9,r15
+ test r11,r11
+
+ cmovz r12,rax
+ cmovz r13,rbp
+ mov QWORD[rdi],r12
+ cmovz r8,rcx
+ mov QWORD[8+rdi],r13
+ cmovz r9,r10
+ mov QWORD[16+rdi],r8
+ mov QWORD[24+rdi],r9
+
+ ret
+
+
+
+
+ALIGN 32
+__ecp_nistz256_subq:
+
+ sub rax,r12
+ sbb rbp,r13
+ mov r12,rax
+ sbb rcx,r8
+ sbb r10,r9
+ mov r13,rbp
+ sbb r11,r11
+
+ add rax,-1
+ mov r8,rcx
+ adc rbp,r14
+ adc rcx,0
+ mov r9,r10
+ adc r10,r15
+ test r11,r11
+
+ cmovnz r12,rax
+ cmovnz r13,rbp
+ cmovnz r8,rcx
+ cmovnz r9,r10
+
+ ret
+
+
+
+
+ALIGN 32
+__ecp_nistz256_mul_by_2q:
+
+ xor r11,r11
+ add r12,r12
+ adc r13,r13
+ mov rax,r12
+ adc r8,r8
+ adc r9,r9
+ mov rbp,r13
+ adc r11,0
+
+ sub r12,-1
+ mov rcx,r8
+ sbb r13,r14
+ sbb r8,0
+ mov r10,r9
+ sbb r9,r15
+ sbb r11,0
+
+ cmovc r12,rax
+ cmovc r13,rbp
+ mov QWORD[rdi],r12
+ cmovc r8,rcx
+ mov QWORD[8+rdi],r13
+ cmovc r9,r10
+ mov QWORD[16+rdi],r8
+ mov QWORD[24+rdi],r9
+
+ ret
+
+
+global ecp_nistz256_point_double
+
+ALIGN 32
+ecp_nistz256_point_double:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_ecp_nistz256_point_double:
+ mov rdi,rcx
+ mov rsi,rdx
+
+
+
+_CET_ENDBR
+ lea rcx,[OPENSSL_ia32cap_P]
+ mov rcx,QWORD[8+rcx]
+ and ecx,0x80100
+ cmp ecx,0x80100
+ je NEAR $L$point_doublex
+ push rbp
+
+ push rbx
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ sub rsp,32*5+8
+
+$L$point_doubleq_body:
+
+$L$point_double_shortcutq:
+ movdqu xmm0,XMMWORD[rsi]
+ mov rbx,rsi
+ movdqu xmm1,XMMWORD[16+rsi]
+ mov r12,QWORD[((32+0))+rsi]
+ mov r13,QWORD[((32+8))+rsi]
+ mov r8,QWORD[((32+16))+rsi]
+ mov r9,QWORD[((32+24))+rsi]
+ mov r14,QWORD[(($L$poly+8))]
+ mov r15,QWORD[(($L$poly+24))]
+ movdqa XMMWORD[96+rsp],xmm0
+ movdqa XMMWORD[(96+16)+rsp],xmm1
+ lea r10,[32+rdi]
+ lea r11,[64+rdi]
+DB 102,72,15,110,199
+DB 102,73,15,110,202
+DB 102,73,15,110,211
+
+ lea rdi,[rsp]
+ call __ecp_nistz256_mul_by_2q
+
+ mov rax,QWORD[((64+0))+rsi]
+ mov r14,QWORD[((64+8))+rsi]
+ mov r15,QWORD[((64+16))+rsi]
+ mov r8,QWORD[((64+24))+rsi]
+ lea rsi,[((64-0))+rsi]
+ lea rdi,[64+rsp]
+ call __ecp_nistz256_sqr_montq
+
+ mov rax,QWORD[((0+0))+rsp]
+ mov r14,QWORD[((8+0))+rsp]
+ lea rsi,[((0+0))+rsp]
+ mov r15,QWORD[((16+0))+rsp]
+ mov r8,QWORD[((24+0))+rsp]
+ lea rdi,[rsp]
+ call __ecp_nistz256_sqr_montq
+
+ mov rax,QWORD[32+rbx]
+ mov r9,QWORD[((64+0))+rbx]
+ mov r10,QWORD[((64+8))+rbx]
+ mov r11,QWORD[((64+16))+rbx]
+ mov r12,QWORD[((64+24))+rbx]
+ lea rsi,[((64-0))+rbx]
+ lea rbx,[32+rbx]
+DB 102,72,15,126,215
+ call __ecp_nistz256_mul_montq
+ call __ecp_nistz256_mul_by_2q
+
+ mov r12,QWORD[((96+0))+rsp]
+ mov r13,QWORD[((96+8))+rsp]
+ lea rbx,[64+rsp]
+ mov r8,QWORD[((96+16))+rsp]
+ mov r9,QWORD[((96+24))+rsp]
+ lea rdi,[32+rsp]
+ call __ecp_nistz256_add_toq
+
+ mov r12,QWORD[((96+0))+rsp]
+ mov r13,QWORD[((96+8))+rsp]
+ lea rbx,[64+rsp]
+ mov r8,QWORD[((96+16))+rsp]
+ mov r9,QWORD[((96+24))+rsp]
+ lea rdi,[64+rsp]
+ call __ecp_nistz256_sub_fromq
+
+ mov rax,QWORD[((0+0))+rsp]
+ mov r14,QWORD[((8+0))+rsp]
+ lea rsi,[((0+0))+rsp]
+ mov r15,QWORD[((16+0))+rsp]
+ mov r8,QWORD[((24+0))+rsp]
+DB 102,72,15,126,207
+ call __ecp_nistz256_sqr_montq
+ xor r9,r9
+ mov rax,r12
+ add r12,-1
+ mov r10,r13
+ adc r13,rsi
+ mov rcx,r14
+ adc r14,0
+ mov r8,r15
+ adc r15,rbp
+ adc r9,0
+ xor rsi,rsi
+ test rax,1
+
+ cmovz r12,rax
+ cmovz r13,r10
+ cmovz r14,rcx
+ cmovz r15,r8
+ cmovz r9,rsi
+
+ mov rax,r13
+ shr r12,1
+ shl rax,63
+ mov r10,r14
+ shr r13,1
+ or r12,rax
+ shl r10,63
+ mov rcx,r15
+ shr r14,1
+ or r13,r10
+ shl rcx,63
+ mov QWORD[rdi],r12
+ shr r15,1
+ mov QWORD[8+rdi],r13
+ shl r9,63
+ or r14,rcx
+ or r15,r9
+ mov QWORD[16+rdi],r14
+ mov QWORD[24+rdi],r15
+ mov rax,QWORD[64+rsp]
+ lea rbx,[64+rsp]
+ mov r9,QWORD[((0+32))+rsp]
+ mov r10,QWORD[((8+32))+rsp]
+ lea rsi,[((0+32))+rsp]
+ mov r11,QWORD[((16+32))+rsp]
+ mov r12,QWORD[((24+32))+rsp]
+ lea rdi,[32+rsp]
+ call __ecp_nistz256_mul_montq
+
+ lea rdi,[128+rsp]
+ call __ecp_nistz256_mul_by_2q
+
+ lea rbx,[32+rsp]
+ lea rdi,[32+rsp]
+ call __ecp_nistz256_add_toq
+
+ mov rax,QWORD[96+rsp]
+ lea rbx,[96+rsp]
+ mov r9,QWORD[((0+0))+rsp]
+ mov r10,QWORD[((8+0))+rsp]
+ lea rsi,[((0+0))+rsp]
+ mov r11,QWORD[((16+0))+rsp]
+ mov r12,QWORD[((24+0))+rsp]
+ lea rdi,[rsp]
+ call __ecp_nistz256_mul_montq
+
+ lea rdi,[128+rsp]
+ call __ecp_nistz256_mul_by_2q
+
+ mov rax,QWORD[((0+32))+rsp]
+ mov r14,QWORD[((8+32))+rsp]
+ lea rsi,[((0+32))+rsp]
+ mov r15,QWORD[((16+32))+rsp]
+ mov r8,QWORD[((24+32))+rsp]
+DB 102,72,15,126,199
+ call __ecp_nistz256_sqr_montq
+
+ lea rbx,[128+rsp]
+ mov r8,r14
+ mov r9,r15
+ mov r14,rsi
+ mov r15,rbp
+ call __ecp_nistz256_sub_fromq
+
+ mov rax,QWORD[((0+0))+rsp]
+ mov rbp,QWORD[((0+8))+rsp]
+ mov rcx,QWORD[((0+16))+rsp]
+ mov r10,QWORD[((0+24))+rsp]
+ lea rdi,[rsp]
+ call __ecp_nistz256_subq
+
+ mov rax,QWORD[32+rsp]
+ lea rbx,[32+rsp]
+ mov r14,r12
+ xor ecx,ecx
+ mov QWORD[((0+0))+rsp],r12
+ mov r10,r13
+ mov QWORD[((0+8))+rsp],r13
+ cmovz r11,r8
+ mov QWORD[((0+16))+rsp],r8
+ lea rsi,[((0-0))+rsp]
+ cmovz r12,r9
+ mov QWORD[((0+24))+rsp],r9
+ mov r9,r14
+ lea rdi,[rsp]
+ call __ecp_nistz256_mul_montq
+
+DB 102,72,15,126,203
+DB 102,72,15,126,207
+ call __ecp_nistz256_sub_fromq
+
+ lea rsi,[((160+56))+rsp]
+
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbx,QWORD[((-16))+rsi]
+
+ mov rbp,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$point_doubleq_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_ecp_nistz256_point_double:
+global ecp_nistz256_point_add
+
+ALIGN 32
+ecp_nistz256_point_add:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_ecp_nistz256_point_add:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ lea rcx,[OPENSSL_ia32cap_P]
+ mov rcx,QWORD[8+rcx]
+ and ecx,0x80100
+ cmp ecx,0x80100
+ je NEAR $L$point_addx
+ push rbp
+
+ push rbx
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ sub rsp,32*18+8
+
+$L$point_addq_body:
+
+ movdqu xmm0,XMMWORD[rsi]
+ movdqu xmm1,XMMWORD[16+rsi]
+ movdqu xmm2,XMMWORD[32+rsi]
+ movdqu xmm3,XMMWORD[48+rsi]
+ movdqu xmm4,XMMWORD[64+rsi]
+ movdqu xmm5,XMMWORD[80+rsi]
+ mov rbx,rsi
+ mov rsi,rdx
+ movdqa XMMWORD[384+rsp],xmm0
+ movdqa XMMWORD[(384+16)+rsp],xmm1
+ movdqa XMMWORD[416+rsp],xmm2
+ movdqa XMMWORD[(416+16)+rsp],xmm3
+ movdqa XMMWORD[448+rsp],xmm4
+ movdqa XMMWORD[(448+16)+rsp],xmm5
+ por xmm5,xmm4
+
+ movdqu xmm0,XMMWORD[rsi]
+ pshufd xmm3,xmm5,0xb1
+ movdqu xmm1,XMMWORD[16+rsi]
+ movdqu xmm2,XMMWORD[32+rsi]
+ por xmm5,xmm3
+ movdqu xmm3,XMMWORD[48+rsi]
+ mov rax,QWORD[((64+0))+rsi]
+ mov r14,QWORD[((64+8))+rsi]
+ mov r15,QWORD[((64+16))+rsi]
+ mov r8,QWORD[((64+24))+rsi]
+ movdqa XMMWORD[480+rsp],xmm0
+ pshufd xmm4,xmm5,0x1e
+ movdqa XMMWORD[(480+16)+rsp],xmm1
+ movdqu xmm0,XMMWORD[64+rsi]
+ movdqu xmm1,XMMWORD[80+rsi]
+ movdqa XMMWORD[512+rsp],xmm2
+ movdqa XMMWORD[(512+16)+rsp],xmm3
+ por xmm5,xmm4
+ pxor xmm4,xmm4
+ por xmm1,xmm0
+DB 102,72,15,110,199
+
+ lea rsi,[((64-0))+rsi]
+ mov QWORD[((544+0))+rsp],rax
+ mov QWORD[((544+8))+rsp],r14
+ mov QWORD[((544+16))+rsp],r15
+ mov QWORD[((544+24))+rsp],r8
+ lea rdi,[96+rsp]
+ call __ecp_nistz256_sqr_montq
+
+ pcmpeqd xmm5,xmm4
+ pshufd xmm4,xmm1,0xb1
+ por xmm4,xmm1
+ pshufd xmm5,xmm5,0
+ pshufd xmm3,xmm4,0x1e
+ por xmm4,xmm3
+ pxor xmm3,xmm3
+ pcmpeqd xmm4,xmm3
+ pshufd xmm4,xmm4,0
+ mov rax,QWORD[((64+0))+rbx]
+ mov r14,QWORD[((64+8))+rbx]
+ mov r15,QWORD[((64+16))+rbx]
+ mov r8,QWORD[((64+24))+rbx]
+DB 102,72,15,110,203
+
+ lea rsi,[((64-0))+rbx]
+ lea rdi,[32+rsp]
+ call __ecp_nistz256_sqr_montq
+
+ mov rax,QWORD[544+rsp]
+ lea rbx,[544+rsp]
+ mov r9,QWORD[((0+96))+rsp]
+ mov r10,QWORD[((8+96))+rsp]
+ lea rsi,[((0+96))+rsp]
+ mov r11,QWORD[((16+96))+rsp]
+ mov r12,QWORD[((24+96))+rsp]
+ lea rdi,[224+rsp]
+ call __ecp_nistz256_mul_montq
+
+ mov rax,QWORD[448+rsp]
+ lea rbx,[448+rsp]
+ mov r9,QWORD[((0+32))+rsp]
+ mov r10,QWORD[((8+32))+rsp]
+ lea rsi,[((0+32))+rsp]
+ mov r11,QWORD[((16+32))+rsp]
+ mov r12,QWORD[((24+32))+rsp]
+ lea rdi,[256+rsp]
+ call __ecp_nistz256_mul_montq
+
+ mov rax,QWORD[416+rsp]
+ lea rbx,[416+rsp]
+ mov r9,QWORD[((0+224))+rsp]
+ mov r10,QWORD[((8+224))+rsp]
+ lea rsi,[((0+224))+rsp]
+ mov r11,QWORD[((16+224))+rsp]
+ mov r12,QWORD[((24+224))+rsp]
+ lea rdi,[224+rsp]
+ call __ecp_nistz256_mul_montq
+
+ mov rax,QWORD[512+rsp]
+ lea rbx,[512+rsp]
+ mov r9,QWORD[((0+256))+rsp]
+ mov r10,QWORD[((8+256))+rsp]
+ lea rsi,[((0+256))+rsp]
+ mov r11,QWORD[((16+256))+rsp]
+ mov r12,QWORD[((24+256))+rsp]
+ lea rdi,[256+rsp]
+ call __ecp_nistz256_mul_montq
+
+ lea rbx,[224+rsp]
+ lea rdi,[64+rsp]
+ call __ecp_nistz256_sub_fromq
+
+ or r12,r13
+ movdqa xmm2,xmm4
+ or r12,r8
+ or r12,r9
+ por xmm2,xmm5
+DB 102,73,15,110,220
+
+ mov rax,QWORD[384+rsp]
+ lea rbx,[384+rsp]
+ mov r9,QWORD[((0+96))+rsp]
+ mov r10,QWORD[((8+96))+rsp]
+ lea rsi,[((0+96))+rsp]
+ mov r11,QWORD[((16+96))+rsp]
+ mov r12,QWORD[((24+96))+rsp]
+ lea rdi,[160+rsp]
+ call __ecp_nistz256_mul_montq
+
+ mov rax,QWORD[480+rsp]
+ lea rbx,[480+rsp]
+ mov r9,QWORD[((0+32))+rsp]
+ mov r10,QWORD[((8+32))+rsp]
+ lea rsi,[((0+32))+rsp]
+ mov r11,QWORD[((16+32))+rsp]
+ mov r12,QWORD[((24+32))+rsp]
+ lea rdi,[192+rsp]
+ call __ecp_nistz256_mul_montq
+
+ lea rbx,[160+rsp]
+ lea rdi,[rsp]
+ call __ecp_nistz256_sub_fromq
+
+ or r12,r13
+ or r12,r8
+ or r12,r9
+
+DB 102,73,15,126,208
+DB 102,73,15,126,217
+ or r12,r8
+ DB 0x3e
+ jnz NEAR $L$add_proceedq
+
+
+
+ test r9,r9
+ jz NEAR $L$add_doubleq
+
+
+
+
+
+
+DB 102,72,15,126,199
+ pxor xmm0,xmm0
+ movdqu XMMWORD[rdi],xmm0
+ movdqu XMMWORD[16+rdi],xmm0
+ movdqu XMMWORD[32+rdi],xmm0
+ movdqu XMMWORD[48+rdi],xmm0
+ movdqu XMMWORD[64+rdi],xmm0
+ movdqu XMMWORD[80+rdi],xmm0
+ jmp NEAR $L$add_doneq
+
+ALIGN 32
+$L$add_doubleq:
+DB 102,72,15,126,206
+DB 102,72,15,126,199
+ add rsp,416
+
+ jmp NEAR $L$point_double_shortcutq
+
+
+ALIGN 32
+$L$add_proceedq:
+ mov rax,QWORD[((0+64))+rsp]
+ mov r14,QWORD[((8+64))+rsp]
+ lea rsi,[((0+64))+rsp]
+ mov r15,QWORD[((16+64))+rsp]
+ mov r8,QWORD[((24+64))+rsp]
+ lea rdi,[96+rsp]
+ call __ecp_nistz256_sqr_montq
+
+ mov rax,QWORD[448+rsp]
+ lea rbx,[448+rsp]
+ mov r9,QWORD[((0+0))+rsp]
+ mov r10,QWORD[((8+0))+rsp]
+ lea rsi,[((0+0))+rsp]
+ mov r11,QWORD[((16+0))+rsp]
+ mov r12,QWORD[((24+0))+rsp]
+ lea rdi,[352+rsp]
+ call __ecp_nistz256_mul_montq
+
+ mov rax,QWORD[((0+0))+rsp]
+ mov r14,QWORD[((8+0))+rsp]
+ lea rsi,[((0+0))+rsp]
+ mov r15,QWORD[((16+0))+rsp]
+ mov r8,QWORD[((24+0))+rsp]
+ lea rdi,[32+rsp]
+ call __ecp_nistz256_sqr_montq
+
+ mov rax,QWORD[544+rsp]
+ lea rbx,[544+rsp]
+ mov r9,QWORD[((0+352))+rsp]
+ mov r10,QWORD[((8+352))+rsp]
+ lea rsi,[((0+352))+rsp]
+ mov r11,QWORD[((16+352))+rsp]
+ mov r12,QWORD[((24+352))+rsp]
+ lea rdi,[352+rsp]
+ call __ecp_nistz256_mul_montq
+
+ mov rax,QWORD[rsp]
+ lea rbx,[rsp]
+ mov r9,QWORD[((0+32))+rsp]
+ mov r10,QWORD[((8+32))+rsp]
+ lea rsi,[((0+32))+rsp]
+ mov r11,QWORD[((16+32))+rsp]
+ mov r12,QWORD[((24+32))+rsp]
+ lea rdi,[128+rsp]
+ call __ecp_nistz256_mul_montq
+
+ mov rax,QWORD[160+rsp]
+ lea rbx,[160+rsp]
+ mov r9,QWORD[((0+32))+rsp]
+ mov r10,QWORD[((8+32))+rsp]
+ lea rsi,[((0+32))+rsp]
+ mov r11,QWORD[((16+32))+rsp]
+ mov r12,QWORD[((24+32))+rsp]
+ lea rdi,[192+rsp]
+ call __ecp_nistz256_mul_montq
+
+
+
+
+ xor r11,r11
+ add r12,r12
+ lea rsi,[96+rsp]
+ adc r13,r13
+ mov rax,r12
+ adc r8,r8
+ adc r9,r9
+ mov rbp,r13
+ adc r11,0
+
+ sub r12,-1
+ mov rcx,r8
+ sbb r13,r14
+ sbb r8,0
+ mov r10,r9
+ sbb r9,r15
+ sbb r11,0
+
+ cmovc r12,rax
+ mov rax,QWORD[rsi]
+ cmovc r13,rbp
+ mov rbp,QWORD[8+rsi]
+ cmovc r8,rcx
+ mov rcx,QWORD[16+rsi]
+ cmovc r9,r10
+ mov r10,QWORD[24+rsi]
+
+ call __ecp_nistz256_subq
+
+ lea rbx,[128+rsp]
+ lea rdi,[288+rsp]
+ call __ecp_nistz256_sub_fromq
+
+ mov rax,QWORD[((192+0))+rsp]
+ mov rbp,QWORD[((192+8))+rsp]
+ mov rcx,QWORD[((192+16))+rsp]
+ mov r10,QWORD[((192+24))+rsp]
+ lea rdi,[320+rsp]
+
+ call __ecp_nistz256_subq
+
+ mov QWORD[rdi],r12
+ mov QWORD[8+rdi],r13
+ mov QWORD[16+rdi],r8
+ mov QWORD[24+rdi],r9
+ mov rax,QWORD[128+rsp]
+ lea rbx,[128+rsp]
+ mov r9,QWORD[((0+224))+rsp]
+ mov r10,QWORD[((8+224))+rsp]
+ lea rsi,[((0+224))+rsp]
+ mov r11,QWORD[((16+224))+rsp]
+ mov r12,QWORD[((24+224))+rsp]
+ lea rdi,[256+rsp]
+ call __ecp_nistz256_mul_montq
+
+ mov rax,QWORD[320+rsp]
+ lea rbx,[320+rsp]
+ mov r9,QWORD[((0+64))+rsp]
+ mov r10,QWORD[((8+64))+rsp]
+ lea rsi,[((0+64))+rsp]
+ mov r11,QWORD[((16+64))+rsp]
+ mov r12,QWORD[((24+64))+rsp]
+ lea rdi,[320+rsp]
+ call __ecp_nistz256_mul_montq
+
+ lea rbx,[256+rsp]
+ lea rdi,[320+rsp]
+ call __ecp_nistz256_sub_fromq
+
+DB 102,72,15,126,199
+
+ movdqa xmm0,xmm5
+ movdqa xmm1,xmm5
+ pandn xmm0,XMMWORD[352+rsp]
+ movdqa xmm2,xmm5
+ pandn xmm1,XMMWORD[((352+16))+rsp]
+ movdqa xmm3,xmm5
+ pand xmm2,XMMWORD[544+rsp]
+ pand xmm3,XMMWORD[((544+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm4
+ pandn xmm0,xmm2
+ movdqa xmm2,xmm4
+ pandn xmm1,xmm3
+ movdqa xmm3,xmm4
+ pand xmm2,XMMWORD[448+rsp]
+ pand xmm3,XMMWORD[((448+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+ movdqu XMMWORD[64+rdi],xmm2
+ movdqu XMMWORD[80+rdi],xmm3
+
+ movdqa xmm0,xmm5
+ movdqa xmm1,xmm5
+ pandn xmm0,XMMWORD[288+rsp]
+ movdqa xmm2,xmm5
+ pandn xmm1,XMMWORD[((288+16))+rsp]
+ movdqa xmm3,xmm5
+ pand xmm2,XMMWORD[480+rsp]
+ pand xmm3,XMMWORD[((480+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm4
+ pandn xmm0,xmm2
+ movdqa xmm2,xmm4
+ pandn xmm1,xmm3
+ movdqa xmm3,xmm4
+ pand xmm2,XMMWORD[384+rsp]
+ pand xmm3,XMMWORD[((384+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+ movdqu XMMWORD[rdi],xmm2
+ movdqu XMMWORD[16+rdi],xmm3
+
+ movdqa xmm0,xmm5
+ movdqa xmm1,xmm5
+ pandn xmm0,XMMWORD[320+rsp]
+ movdqa xmm2,xmm5
+ pandn xmm1,XMMWORD[((320+16))+rsp]
+ movdqa xmm3,xmm5
+ pand xmm2,XMMWORD[512+rsp]
+ pand xmm3,XMMWORD[((512+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm4
+ pandn xmm0,xmm2
+ movdqa xmm2,xmm4
+ pandn xmm1,xmm3
+ movdqa xmm3,xmm4
+ pand xmm2,XMMWORD[416+rsp]
+ pand xmm3,XMMWORD[((416+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+ movdqu XMMWORD[32+rdi],xmm2
+ movdqu XMMWORD[48+rdi],xmm3
+
+$L$add_doneq:
+ lea rsi,[((576+56))+rsp]
+
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbx,QWORD[((-16))+rsi]
+
+ mov rbp,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$point_addq_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_ecp_nistz256_point_add:
+global ecp_nistz256_point_add_affine
+
+ALIGN 32
+ecp_nistz256_point_add_affine:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_ecp_nistz256_point_add_affine:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ lea rcx,[OPENSSL_ia32cap_P]
+ mov rcx,QWORD[8+rcx]
+ and ecx,0x80100
+ cmp ecx,0x80100
+ je NEAR $L$point_add_affinex
+ push rbp
+
+ push rbx
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ sub rsp,32*15+8
+
+$L$add_affineq_body:
+
+ movdqu xmm0,XMMWORD[rsi]
+ mov rbx,rdx
+ movdqu xmm1,XMMWORD[16+rsi]
+ movdqu xmm2,XMMWORD[32+rsi]
+ movdqu xmm3,XMMWORD[48+rsi]
+ movdqu xmm4,XMMWORD[64+rsi]
+ movdqu xmm5,XMMWORD[80+rsi]
+ mov rax,QWORD[((64+0))+rsi]
+ mov r14,QWORD[((64+8))+rsi]
+ mov r15,QWORD[((64+16))+rsi]
+ mov r8,QWORD[((64+24))+rsi]
+ movdqa XMMWORD[320+rsp],xmm0
+ movdqa XMMWORD[(320+16)+rsp],xmm1
+ movdqa XMMWORD[352+rsp],xmm2
+ movdqa XMMWORD[(352+16)+rsp],xmm3
+ movdqa XMMWORD[384+rsp],xmm4
+ movdqa XMMWORD[(384+16)+rsp],xmm5
+ por xmm5,xmm4
+
+ movdqu xmm0,XMMWORD[rbx]
+ pshufd xmm3,xmm5,0xb1
+ movdqu xmm1,XMMWORD[16+rbx]
+ movdqu xmm2,XMMWORD[32+rbx]
+ por xmm5,xmm3
+ movdqu xmm3,XMMWORD[48+rbx]
+ movdqa XMMWORD[416+rsp],xmm0
+ pshufd xmm4,xmm5,0x1e
+ movdqa XMMWORD[(416+16)+rsp],xmm1
+ por xmm1,xmm0
+DB 102,72,15,110,199
+ movdqa XMMWORD[448+rsp],xmm2
+ movdqa XMMWORD[(448+16)+rsp],xmm3
+ por xmm3,xmm2
+ por xmm5,xmm4
+ pxor xmm4,xmm4
+ por xmm3,xmm1
+
+ lea rsi,[((64-0))+rsi]
+ lea rdi,[32+rsp]
+ call __ecp_nistz256_sqr_montq
+
+ pcmpeqd xmm5,xmm4
+ pshufd xmm4,xmm3,0xb1
+ mov rax,QWORD[rbx]
+
+ mov r9,r12
+ por xmm4,xmm3
+ pshufd xmm5,xmm5,0
+ pshufd xmm3,xmm4,0x1e
+ mov r10,r13
+ por xmm4,xmm3
+ pxor xmm3,xmm3
+ mov r11,r14
+ pcmpeqd xmm4,xmm3
+ pshufd xmm4,xmm4,0
+
+ lea rsi,[((32-0))+rsp]
+ mov r12,r15
+ lea rdi,[rsp]
+ call __ecp_nistz256_mul_montq
+
+ lea rbx,[320+rsp]
+ lea rdi,[64+rsp]
+ call __ecp_nistz256_sub_fromq
+
+ mov rax,QWORD[384+rsp]
+ lea rbx,[384+rsp]
+ mov r9,QWORD[((0+32))+rsp]
+ mov r10,QWORD[((8+32))+rsp]
+ lea rsi,[((0+32))+rsp]
+ mov r11,QWORD[((16+32))+rsp]
+ mov r12,QWORD[((24+32))+rsp]
+ lea rdi,[32+rsp]
+ call __ecp_nistz256_mul_montq
+
+ mov rax,QWORD[384+rsp]
+ lea rbx,[384+rsp]
+ mov r9,QWORD[((0+64))+rsp]
+ mov r10,QWORD[((8+64))+rsp]
+ lea rsi,[((0+64))+rsp]
+ mov r11,QWORD[((16+64))+rsp]
+ mov r12,QWORD[((24+64))+rsp]
+ lea rdi,[288+rsp]
+ call __ecp_nistz256_mul_montq
+
+ mov rax,QWORD[448+rsp]
+ lea rbx,[448+rsp]
+ mov r9,QWORD[((0+32))+rsp]
+ mov r10,QWORD[((8+32))+rsp]
+ lea rsi,[((0+32))+rsp]
+ mov r11,QWORD[((16+32))+rsp]
+ mov r12,QWORD[((24+32))+rsp]
+ lea rdi,[32+rsp]
+ call __ecp_nistz256_mul_montq
+
+ lea rbx,[352+rsp]
+ lea rdi,[96+rsp]
+ call __ecp_nistz256_sub_fromq
+
+ mov rax,QWORD[((0+64))+rsp]
+ mov r14,QWORD[((8+64))+rsp]
+ lea rsi,[((0+64))+rsp]
+ mov r15,QWORD[((16+64))+rsp]
+ mov r8,QWORD[((24+64))+rsp]
+ lea rdi,[128+rsp]
+ call __ecp_nistz256_sqr_montq
+
+ mov rax,QWORD[((0+96))+rsp]
+ mov r14,QWORD[((8+96))+rsp]
+ lea rsi,[((0+96))+rsp]
+ mov r15,QWORD[((16+96))+rsp]
+ mov r8,QWORD[((24+96))+rsp]
+ lea rdi,[192+rsp]
+ call __ecp_nistz256_sqr_montq
+
+ mov rax,QWORD[128+rsp]
+ lea rbx,[128+rsp]
+ mov r9,QWORD[((0+64))+rsp]
+ mov r10,QWORD[((8+64))+rsp]
+ lea rsi,[((0+64))+rsp]
+ mov r11,QWORD[((16+64))+rsp]
+ mov r12,QWORD[((24+64))+rsp]
+ lea rdi,[160+rsp]
+ call __ecp_nistz256_mul_montq
+
+ mov rax,QWORD[320+rsp]
+ lea rbx,[320+rsp]
+ mov r9,QWORD[((0+128))+rsp]
+ mov r10,QWORD[((8+128))+rsp]
+ lea rsi,[((0+128))+rsp]
+ mov r11,QWORD[((16+128))+rsp]
+ mov r12,QWORD[((24+128))+rsp]
+ lea rdi,[rsp]
+ call __ecp_nistz256_mul_montq
+
+
+
+
+ xor r11,r11
+ add r12,r12
+ lea rsi,[192+rsp]
+ adc r13,r13
+ mov rax,r12
+ adc r8,r8
+ adc r9,r9
+ mov rbp,r13
+ adc r11,0
+
+ sub r12,-1
+ mov rcx,r8
+ sbb r13,r14
+ sbb r8,0
+ mov r10,r9
+ sbb r9,r15
+ sbb r11,0
+
+ cmovc r12,rax
+ mov rax,QWORD[rsi]
+ cmovc r13,rbp
+ mov rbp,QWORD[8+rsi]
+ cmovc r8,rcx
+ mov rcx,QWORD[16+rsi]
+ cmovc r9,r10
+ mov r10,QWORD[24+rsi]
+
+ call __ecp_nistz256_subq
+
+ lea rbx,[160+rsp]
+ lea rdi,[224+rsp]
+ call __ecp_nistz256_sub_fromq
+
+ mov rax,QWORD[((0+0))+rsp]
+ mov rbp,QWORD[((0+8))+rsp]
+ mov rcx,QWORD[((0+16))+rsp]
+ mov r10,QWORD[((0+24))+rsp]
+ lea rdi,[64+rsp]
+
+ call __ecp_nistz256_subq
+
+ mov QWORD[rdi],r12
+ mov QWORD[8+rdi],r13
+ mov QWORD[16+rdi],r8
+ mov QWORD[24+rdi],r9
+ mov rax,QWORD[352+rsp]
+ lea rbx,[352+rsp]
+ mov r9,QWORD[((0+160))+rsp]
+ mov r10,QWORD[((8+160))+rsp]
+ lea rsi,[((0+160))+rsp]
+ mov r11,QWORD[((16+160))+rsp]
+ mov r12,QWORD[((24+160))+rsp]
+ lea rdi,[32+rsp]
+ call __ecp_nistz256_mul_montq
+
+ mov rax,QWORD[96+rsp]
+ lea rbx,[96+rsp]
+ mov r9,QWORD[((0+64))+rsp]
+ mov r10,QWORD[((8+64))+rsp]
+ lea rsi,[((0+64))+rsp]
+ mov r11,QWORD[((16+64))+rsp]
+ mov r12,QWORD[((24+64))+rsp]
+ lea rdi,[64+rsp]
+ call __ecp_nistz256_mul_montq
+
+ lea rbx,[32+rsp]
+ lea rdi,[256+rsp]
+ call __ecp_nistz256_sub_fromq
+
+DB 102,72,15,126,199
+
+ movdqa xmm0,xmm5
+ movdqa xmm1,xmm5
+ pandn xmm0,XMMWORD[288+rsp]
+ movdqa xmm2,xmm5
+ pandn xmm1,XMMWORD[((288+16))+rsp]
+ movdqa xmm3,xmm5
+ pand xmm2,XMMWORD[$L$ONE_mont]
+ pand xmm3,XMMWORD[(($L$ONE_mont+16))]
+ por xmm2,xmm0
+ por xmm3,xmm1
+
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm4
+ pandn xmm0,xmm2
+ movdqa xmm2,xmm4
+ pandn xmm1,xmm3
+ movdqa xmm3,xmm4
+ pand xmm2,XMMWORD[384+rsp]
+ pand xmm3,XMMWORD[((384+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+ movdqu XMMWORD[64+rdi],xmm2
+ movdqu XMMWORD[80+rdi],xmm3
+
+ movdqa xmm0,xmm5
+ movdqa xmm1,xmm5
+ pandn xmm0,XMMWORD[224+rsp]
+ movdqa xmm2,xmm5
+ pandn xmm1,XMMWORD[((224+16))+rsp]
+ movdqa xmm3,xmm5
+ pand xmm2,XMMWORD[416+rsp]
+ pand xmm3,XMMWORD[((416+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm4
+ pandn xmm0,xmm2
+ movdqa xmm2,xmm4
+ pandn xmm1,xmm3
+ movdqa xmm3,xmm4
+ pand xmm2,XMMWORD[320+rsp]
+ pand xmm3,XMMWORD[((320+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+ movdqu XMMWORD[rdi],xmm2
+ movdqu XMMWORD[16+rdi],xmm3
+
+ movdqa xmm0,xmm5
+ movdqa xmm1,xmm5
+ pandn xmm0,XMMWORD[256+rsp]
+ movdqa xmm2,xmm5
+ pandn xmm1,XMMWORD[((256+16))+rsp]
+ movdqa xmm3,xmm5
+ pand xmm2,XMMWORD[448+rsp]
+ pand xmm3,XMMWORD[((448+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm4
+ pandn xmm0,xmm2
+ movdqa xmm2,xmm4
+ pandn xmm1,xmm3
+ movdqa xmm3,xmm4
+ pand xmm2,XMMWORD[352+rsp]
+ pand xmm3,XMMWORD[((352+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+ movdqu XMMWORD[32+rdi],xmm2
+ movdqu XMMWORD[48+rdi],xmm3
+
+ lea rsi,[((480+56))+rsp]
+
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbx,QWORD[((-16))+rsi]
+
+ mov rbp,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$add_affineq_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_ecp_nistz256_point_add_affine:
+
+ALIGN 32
+__ecp_nistz256_add_tox:
+
+ xor r11,r11
+ adc r12,QWORD[rbx]
+ adc r13,QWORD[8+rbx]
+ mov rax,r12
+ adc r8,QWORD[16+rbx]
+ adc r9,QWORD[24+rbx]
+ mov rbp,r13
+ adc r11,0
+
+ xor r10,r10
+ sbb r12,-1
+ mov rcx,r8
+ sbb r13,r14
+ sbb r8,0
+ mov r10,r9
+ sbb r9,r15
+ sbb r11,0
+
+ cmovc r12,rax
+ cmovc r13,rbp
+ mov QWORD[rdi],r12
+ cmovc r8,rcx
+ mov QWORD[8+rdi],r13
+ cmovc r9,r10
+ mov QWORD[16+rdi],r8
+ mov QWORD[24+rdi],r9
+
+ ret
+
+
+
+
+ALIGN 32
+__ecp_nistz256_sub_fromx:
+
+ xor r11,r11
+ sbb r12,QWORD[rbx]
+ sbb r13,QWORD[8+rbx]
+ mov rax,r12
+ sbb r8,QWORD[16+rbx]
+ sbb r9,QWORD[24+rbx]
+ mov rbp,r13
+ sbb r11,0
+
+ xor r10,r10
+ adc r12,-1
+ mov rcx,r8
+ adc r13,r14
+ adc r8,0
+ mov r10,r9
+ adc r9,r15
+
+ bt r11,0
+ cmovnc r12,rax
+ cmovnc r13,rbp
+ mov QWORD[rdi],r12
+ cmovnc r8,rcx
+ mov QWORD[8+rdi],r13
+ cmovnc r9,r10
+ mov QWORD[16+rdi],r8
+ mov QWORD[24+rdi],r9
+
+ ret
+
+
+
+
+ALIGN 32
+__ecp_nistz256_subx:
+
+ xor r11,r11
+ sbb rax,r12
+ sbb rbp,r13
+ mov r12,rax
+ sbb rcx,r8
+ sbb r10,r9
+ mov r13,rbp
+ sbb r11,0
+
+ xor r9,r9
+ adc rax,-1
+ mov r8,rcx
+ adc rbp,r14
+ adc rcx,0
+ mov r9,r10
+ adc r10,r15
+
+ bt r11,0
+ cmovc r12,rax
+ cmovc r13,rbp
+ cmovc r8,rcx
+ cmovc r9,r10
+
+ ret
+
+
+
+
+ALIGN 32
+__ecp_nistz256_mul_by_2x:
+
+ xor r11,r11
+ adc r12,r12
+ adc r13,r13
+ mov rax,r12
+ adc r8,r8
+ adc r9,r9
+ mov rbp,r13
+ adc r11,0
+
+ xor r10,r10
+ sbb r12,-1
+ mov rcx,r8
+ sbb r13,r14
+ sbb r8,0
+ mov r10,r9
+ sbb r9,r15
+ sbb r11,0
+
+ cmovc r12,rax
+ cmovc r13,rbp
+ mov QWORD[rdi],r12
+ cmovc r8,rcx
+ mov QWORD[8+rdi],r13
+ cmovc r9,r10
+ mov QWORD[16+rdi],r8
+ mov QWORD[24+rdi],r9
+
+ ret
+
+
+
+ALIGN 32
+ecp_nistz256_point_doublex:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_ecp_nistz256_point_doublex:
+ mov rdi,rcx
+ mov rsi,rdx
+
+
+
+$L$point_doublex:
+ push rbp
+
+ push rbx
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ sub rsp,32*5+8
+
+$L$point_doublex_body:
+
+$L$point_double_shortcutx:
+ movdqu xmm0,XMMWORD[rsi]
+ mov rbx,rsi
+ movdqu xmm1,XMMWORD[16+rsi]
+ mov r12,QWORD[((32+0))+rsi]
+ mov r13,QWORD[((32+8))+rsi]
+ mov r8,QWORD[((32+16))+rsi]
+ mov r9,QWORD[((32+24))+rsi]
+ mov r14,QWORD[(($L$poly+8))]
+ mov r15,QWORD[(($L$poly+24))]
+ movdqa XMMWORD[96+rsp],xmm0
+ movdqa XMMWORD[(96+16)+rsp],xmm1
+ lea r10,[32+rdi]
+ lea r11,[64+rdi]
+DB 102,72,15,110,199
+DB 102,73,15,110,202
+DB 102,73,15,110,211
+
+ lea rdi,[rsp]
+ call __ecp_nistz256_mul_by_2x
+
+ mov rdx,QWORD[((64+0))+rsi]
+ mov r14,QWORD[((64+8))+rsi]
+ mov r15,QWORD[((64+16))+rsi]
+ mov r8,QWORD[((64+24))+rsi]
+ lea rsi,[((64-128))+rsi]
+ lea rdi,[64+rsp]
+ call __ecp_nistz256_sqr_montx
+
+ mov rdx,QWORD[((0+0))+rsp]
+ mov r14,QWORD[((8+0))+rsp]
+ lea rsi,[((-128+0))+rsp]
+ mov r15,QWORD[((16+0))+rsp]
+ mov r8,QWORD[((24+0))+rsp]
+ lea rdi,[rsp]
+ call __ecp_nistz256_sqr_montx
+
+ mov rdx,QWORD[32+rbx]
+ mov r9,QWORD[((64+0))+rbx]
+ mov r10,QWORD[((64+8))+rbx]
+ mov r11,QWORD[((64+16))+rbx]
+ mov r12,QWORD[((64+24))+rbx]
+ lea rsi,[((64-128))+rbx]
+ lea rbx,[32+rbx]
+DB 102,72,15,126,215
+ call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_by_2x
+
+ mov r12,QWORD[((96+0))+rsp]
+ mov r13,QWORD[((96+8))+rsp]
+ lea rbx,[64+rsp]
+ mov r8,QWORD[((96+16))+rsp]
+ mov r9,QWORD[((96+24))+rsp]
+ lea rdi,[32+rsp]
+ call __ecp_nistz256_add_tox
+
+ mov r12,QWORD[((96+0))+rsp]
+ mov r13,QWORD[((96+8))+rsp]
+ lea rbx,[64+rsp]
+ mov r8,QWORD[((96+16))+rsp]
+ mov r9,QWORD[((96+24))+rsp]
+ lea rdi,[64+rsp]
+ call __ecp_nistz256_sub_fromx
+
+ mov rdx,QWORD[((0+0))+rsp]
+ mov r14,QWORD[((8+0))+rsp]
+ lea rsi,[((-128+0))+rsp]
+ mov r15,QWORD[((16+0))+rsp]
+ mov r8,QWORD[((24+0))+rsp]
+DB 102,72,15,126,207
+ call __ecp_nistz256_sqr_montx
+ xor r9,r9
+ mov rax,r12
+ add r12,-1
+ mov r10,r13
+ adc r13,rsi
+ mov rcx,r14
+ adc r14,0
+ mov r8,r15
+ adc r15,rbp
+ adc r9,0
+ xor rsi,rsi
+ test rax,1
+
+ cmovz r12,rax
+ cmovz r13,r10
+ cmovz r14,rcx
+ cmovz r15,r8
+ cmovz r9,rsi
+
+ mov rax,r13
+ shr r12,1
+ shl rax,63
+ mov r10,r14
+ shr r13,1
+ or r12,rax
+ shl r10,63
+ mov rcx,r15
+ shr r14,1
+ or r13,r10
+ shl rcx,63
+ mov QWORD[rdi],r12
+ shr r15,1
+ mov QWORD[8+rdi],r13
+ shl r9,63
+ or r14,rcx
+ or r15,r9
+ mov QWORD[16+rdi],r14
+ mov QWORD[24+rdi],r15
+ mov rdx,QWORD[64+rsp]
+ lea rbx,[64+rsp]
+ mov r9,QWORD[((0+32))+rsp]
+ mov r10,QWORD[((8+32))+rsp]
+ lea rsi,[((-128+32))+rsp]
+ mov r11,QWORD[((16+32))+rsp]
+ mov r12,QWORD[((24+32))+rsp]
+ lea rdi,[32+rsp]
+ call __ecp_nistz256_mul_montx
+
+ lea rdi,[128+rsp]
+ call __ecp_nistz256_mul_by_2x
+
+ lea rbx,[32+rsp]
+ lea rdi,[32+rsp]
+ call __ecp_nistz256_add_tox
+
+ mov rdx,QWORD[96+rsp]
+ lea rbx,[96+rsp]
+ mov r9,QWORD[((0+0))+rsp]
+ mov r10,QWORD[((8+0))+rsp]
+ lea rsi,[((-128+0))+rsp]
+ mov r11,QWORD[((16+0))+rsp]
+ mov r12,QWORD[((24+0))+rsp]
+ lea rdi,[rsp]
+ call __ecp_nistz256_mul_montx
+
+ lea rdi,[128+rsp]
+ call __ecp_nistz256_mul_by_2x
+
+ mov rdx,QWORD[((0+32))+rsp]
+ mov r14,QWORD[((8+32))+rsp]
+ lea rsi,[((-128+32))+rsp]
+ mov r15,QWORD[((16+32))+rsp]
+ mov r8,QWORD[((24+32))+rsp]
+DB 102,72,15,126,199
+ call __ecp_nistz256_sqr_montx
+
+ lea rbx,[128+rsp]
+ mov r8,r14
+ mov r9,r15
+ mov r14,rsi
+ mov r15,rbp
+ call __ecp_nistz256_sub_fromx
+
+ mov rax,QWORD[((0+0))+rsp]
+ mov rbp,QWORD[((0+8))+rsp]
+ mov rcx,QWORD[((0+16))+rsp]
+ mov r10,QWORD[((0+24))+rsp]
+ lea rdi,[rsp]
+ call __ecp_nistz256_subx
+
+ mov rdx,QWORD[32+rsp]
+ lea rbx,[32+rsp]
+ mov r14,r12
+ xor ecx,ecx
+ mov QWORD[((0+0))+rsp],r12
+ mov r10,r13
+ mov QWORD[((0+8))+rsp],r13
+ cmovz r11,r8
+ mov QWORD[((0+16))+rsp],r8
+ lea rsi,[((0-128))+rsp]
+ cmovz r12,r9
+ mov QWORD[((0+24))+rsp],r9
+ mov r9,r14
+ lea rdi,[rsp]
+ call __ecp_nistz256_mul_montx
+
+DB 102,72,15,126,203
+DB 102,72,15,126,207
+ call __ecp_nistz256_sub_fromx
+
+ lea rsi,[((160+56))+rsp]
+
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbx,QWORD[((-16))+rsi]
+
+ mov rbp,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$point_doublex_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_ecp_nistz256_point_doublex:
+
+ALIGN 32
+ecp_nistz256_point_addx:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_ecp_nistz256_point_addx:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+$L$point_addx:
+ push rbp
+
+ push rbx
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ sub rsp,32*18+8
+
+$L$point_addx_body:
+
+ movdqu xmm0,XMMWORD[rsi]
+ movdqu xmm1,XMMWORD[16+rsi]
+ movdqu xmm2,XMMWORD[32+rsi]
+ movdqu xmm3,XMMWORD[48+rsi]
+ movdqu xmm4,XMMWORD[64+rsi]
+ movdqu xmm5,XMMWORD[80+rsi]
+ mov rbx,rsi
+ mov rsi,rdx
+ movdqa XMMWORD[384+rsp],xmm0
+ movdqa XMMWORD[(384+16)+rsp],xmm1
+ movdqa XMMWORD[416+rsp],xmm2
+ movdqa XMMWORD[(416+16)+rsp],xmm3
+ movdqa XMMWORD[448+rsp],xmm4
+ movdqa XMMWORD[(448+16)+rsp],xmm5
+ por xmm5,xmm4
+
+ movdqu xmm0,XMMWORD[rsi]
+ pshufd xmm3,xmm5,0xb1
+ movdqu xmm1,XMMWORD[16+rsi]
+ movdqu xmm2,XMMWORD[32+rsi]
+ por xmm5,xmm3
+ movdqu xmm3,XMMWORD[48+rsi]
+ mov rdx,QWORD[((64+0))+rsi]
+ mov r14,QWORD[((64+8))+rsi]
+ mov r15,QWORD[((64+16))+rsi]
+ mov r8,QWORD[((64+24))+rsi]
+ movdqa XMMWORD[480+rsp],xmm0
+ pshufd xmm4,xmm5,0x1e
+ movdqa XMMWORD[(480+16)+rsp],xmm1
+ movdqu xmm0,XMMWORD[64+rsi]
+ movdqu xmm1,XMMWORD[80+rsi]
+ movdqa XMMWORD[512+rsp],xmm2
+ movdqa XMMWORD[(512+16)+rsp],xmm3
+ por xmm5,xmm4
+ pxor xmm4,xmm4
+ por xmm1,xmm0
+DB 102,72,15,110,199
+
+ lea rsi,[((64-128))+rsi]
+ mov QWORD[((544+0))+rsp],rdx
+ mov QWORD[((544+8))+rsp],r14
+ mov QWORD[((544+16))+rsp],r15
+ mov QWORD[((544+24))+rsp],r8
+ lea rdi,[96+rsp]
+ call __ecp_nistz256_sqr_montx
+
+ pcmpeqd xmm5,xmm4
+ pshufd xmm4,xmm1,0xb1
+ por xmm4,xmm1
+ pshufd xmm5,xmm5,0
+ pshufd xmm3,xmm4,0x1e
+ por xmm4,xmm3
+ pxor xmm3,xmm3
+ pcmpeqd xmm4,xmm3
+ pshufd xmm4,xmm4,0
+ mov rdx,QWORD[((64+0))+rbx]
+ mov r14,QWORD[((64+8))+rbx]
+ mov r15,QWORD[((64+16))+rbx]
+ mov r8,QWORD[((64+24))+rbx]
+DB 102,72,15,110,203
+
+ lea rsi,[((64-128))+rbx]
+ lea rdi,[32+rsp]
+ call __ecp_nistz256_sqr_montx
+
+ mov rdx,QWORD[544+rsp]
+ lea rbx,[544+rsp]
+ mov r9,QWORD[((0+96))+rsp]
+ mov r10,QWORD[((8+96))+rsp]
+ lea rsi,[((-128+96))+rsp]
+ mov r11,QWORD[((16+96))+rsp]
+ mov r12,QWORD[((24+96))+rsp]
+ lea rdi,[224+rsp]
+ call __ecp_nistz256_mul_montx
+
+ mov rdx,QWORD[448+rsp]
+ lea rbx,[448+rsp]
+ mov r9,QWORD[((0+32))+rsp]
+ mov r10,QWORD[((8+32))+rsp]
+ lea rsi,[((-128+32))+rsp]
+ mov r11,QWORD[((16+32))+rsp]
+ mov r12,QWORD[((24+32))+rsp]
+ lea rdi,[256+rsp]
+ call __ecp_nistz256_mul_montx
+
+ mov rdx,QWORD[416+rsp]
+ lea rbx,[416+rsp]
+ mov r9,QWORD[((0+224))+rsp]
+ mov r10,QWORD[((8+224))+rsp]
+ lea rsi,[((-128+224))+rsp]
+ mov r11,QWORD[((16+224))+rsp]
+ mov r12,QWORD[((24+224))+rsp]
+ lea rdi,[224+rsp]
+ call __ecp_nistz256_mul_montx
+
+ mov rdx,QWORD[512+rsp]
+ lea rbx,[512+rsp]
+ mov r9,QWORD[((0+256))+rsp]
+ mov r10,QWORD[((8+256))+rsp]
+ lea rsi,[((-128+256))+rsp]
+ mov r11,QWORD[((16+256))+rsp]
+ mov r12,QWORD[((24+256))+rsp]
+ lea rdi,[256+rsp]
+ call __ecp_nistz256_mul_montx
+
+ lea rbx,[224+rsp]
+ lea rdi,[64+rsp]
+ call __ecp_nistz256_sub_fromx
+
+ or r12,r13
+ movdqa xmm2,xmm4
+ or r12,r8
+ or r12,r9
+ por xmm2,xmm5
+DB 102,73,15,110,220
+
+ mov rdx,QWORD[384+rsp]
+ lea rbx,[384+rsp]
+ mov r9,QWORD[((0+96))+rsp]
+ mov r10,QWORD[((8+96))+rsp]
+ lea rsi,[((-128+96))+rsp]
+ mov r11,QWORD[((16+96))+rsp]
+ mov r12,QWORD[((24+96))+rsp]
+ lea rdi,[160+rsp]
+ call __ecp_nistz256_mul_montx
+
+ mov rdx,QWORD[480+rsp]
+ lea rbx,[480+rsp]
+ mov r9,QWORD[((0+32))+rsp]
+ mov r10,QWORD[((8+32))+rsp]
+ lea rsi,[((-128+32))+rsp]
+ mov r11,QWORD[((16+32))+rsp]
+ mov r12,QWORD[((24+32))+rsp]
+ lea rdi,[192+rsp]
+ call __ecp_nistz256_mul_montx
+
+ lea rbx,[160+rsp]
+ lea rdi,[rsp]
+ call __ecp_nistz256_sub_fromx
+
+ or r12,r13
+ or r12,r8
+ or r12,r9
+
+DB 102,73,15,126,208
+DB 102,73,15,126,217
+ or r12,r8
+ DB 0x3e
+ jnz NEAR $L$add_proceedx
+
+
+
+ test r9,r9
+ jz NEAR $L$add_doublex
+
+
+
+
+
+
+DB 102,72,15,126,199
+ pxor xmm0,xmm0
+ movdqu XMMWORD[rdi],xmm0
+ movdqu XMMWORD[16+rdi],xmm0
+ movdqu XMMWORD[32+rdi],xmm0
+ movdqu XMMWORD[48+rdi],xmm0
+ movdqu XMMWORD[64+rdi],xmm0
+ movdqu XMMWORD[80+rdi],xmm0
+ jmp NEAR $L$add_donex
+
+ALIGN 32
+$L$add_doublex:
+DB 102,72,15,126,206
+DB 102,72,15,126,199
+ add rsp,416
+
+ jmp NEAR $L$point_double_shortcutx
+
+
+ALIGN 32
+$L$add_proceedx:
+ mov rdx,QWORD[((0+64))+rsp]
+ mov r14,QWORD[((8+64))+rsp]
+ lea rsi,[((-128+64))+rsp]
+ mov r15,QWORD[((16+64))+rsp]
+ mov r8,QWORD[((24+64))+rsp]
+ lea rdi,[96+rsp]
+ call __ecp_nistz256_sqr_montx
+
+ mov rdx,QWORD[448+rsp]
+ lea rbx,[448+rsp]
+ mov r9,QWORD[((0+0))+rsp]
+ mov r10,QWORD[((8+0))+rsp]
+ lea rsi,[((-128+0))+rsp]
+ mov r11,QWORD[((16+0))+rsp]
+ mov r12,QWORD[((24+0))+rsp]
+ lea rdi,[352+rsp]
+ call __ecp_nistz256_mul_montx
+
+ mov rdx,QWORD[((0+0))+rsp]
+ mov r14,QWORD[((8+0))+rsp]
+ lea rsi,[((-128+0))+rsp]
+ mov r15,QWORD[((16+0))+rsp]
+ mov r8,QWORD[((24+0))+rsp]
+ lea rdi,[32+rsp]
+ call __ecp_nistz256_sqr_montx
+
+ mov rdx,QWORD[544+rsp]
+ lea rbx,[544+rsp]
+ mov r9,QWORD[((0+352))+rsp]
+ mov r10,QWORD[((8+352))+rsp]
+ lea rsi,[((-128+352))+rsp]
+ mov r11,QWORD[((16+352))+rsp]
+ mov r12,QWORD[((24+352))+rsp]
+ lea rdi,[352+rsp]
+ call __ecp_nistz256_mul_montx
+
+ mov rdx,QWORD[rsp]
+ lea rbx,[rsp]
+ mov r9,QWORD[((0+32))+rsp]
+ mov r10,QWORD[((8+32))+rsp]
+ lea rsi,[((-128+32))+rsp]
+ mov r11,QWORD[((16+32))+rsp]
+ mov r12,QWORD[((24+32))+rsp]
+ lea rdi,[128+rsp]
+ call __ecp_nistz256_mul_montx
+
+ mov rdx,QWORD[160+rsp]
+ lea rbx,[160+rsp]
+ mov r9,QWORD[((0+32))+rsp]
+ mov r10,QWORD[((8+32))+rsp]
+ lea rsi,[((-128+32))+rsp]
+ mov r11,QWORD[((16+32))+rsp]
+ mov r12,QWORD[((24+32))+rsp]
+ lea rdi,[192+rsp]
+ call __ecp_nistz256_mul_montx
+
+
+
+
+ xor r11,r11
+ add r12,r12
+ lea rsi,[96+rsp]
+ adc r13,r13
+ mov rax,r12
+ adc r8,r8
+ adc r9,r9
+ mov rbp,r13
+ adc r11,0
+
+ sub r12,-1
+ mov rcx,r8
+ sbb r13,r14
+ sbb r8,0
+ mov r10,r9
+ sbb r9,r15
+ sbb r11,0
+
+ cmovc r12,rax
+ mov rax,QWORD[rsi]
+ cmovc r13,rbp
+ mov rbp,QWORD[8+rsi]
+ cmovc r8,rcx
+ mov rcx,QWORD[16+rsi]
+ cmovc r9,r10
+ mov r10,QWORD[24+rsi]
+
+ call __ecp_nistz256_subx
+
+ lea rbx,[128+rsp]
+ lea rdi,[288+rsp]
+ call __ecp_nistz256_sub_fromx
+
+ mov rax,QWORD[((192+0))+rsp]
+ mov rbp,QWORD[((192+8))+rsp]
+ mov rcx,QWORD[((192+16))+rsp]
+ mov r10,QWORD[((192+24))+rsp]
+ lea rdi,[320+rsp]
+
+ call __ecp_nistz256_subx
+
+ mov QWORD[rdi],r12
+ mov QWORD[8+rdi],r13
+ mov QWORD[16+rdi],r8
+ mov QWORD[24+rdi],r9
+ mov rdx,QWORD[128+rsp]
+ lea rbx,[128+rsp]
+ mov r9,QWORD[((0+224))+rsp]
+ mov r10,QWORD[((8+224))+rsp]
+ lea rsi,[((-128+224))+rsp]
+ mov r11,QWORD[((16+224))+rsp]
+ mov r12,QWORD[((24+224))+rsp]
+ lea rdi,[256+rsp]
+ call __ecp_nistz256_mul_montx
+
+ mov rdx,QWORD[320+rsp]
+ lea rbx,[320+rsp]
+ mov r9,QWORD[((0+64))+rsp]
+ mov r10,QWORD[((8+64))+rsp]
+ lea rsi,[((-128+64))+rsp]
+ mov r11,QWORD[((16+64))+rsp]
+ mov r12,QWORD[((24+64))+rsp]
+ lea rdi,[320+rsp]
+ call __ecp_nistz256_mul_montx
+
+ lea rbx,[256+rsp]
+ lea rdi,[320+rsp]
+ call __ecp_nistz256_sub_fromx
+
+DB 102,72,15,126,199
+
+ movdqa xmm0,xmm5
+ movdqa xmm1,xmm5
+ pandn xmm0,XMMWORD[352+rsp]
+ movdqa xmm2,xmm5
+ pandn xmm1,XMMWORD[((352+16))+rsp]
+ movdqa xmm3,xmm5
+ pand xmm2,XMMWORD[544+rsp]
+ pand xmm3,XMMWORD[((544+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm4
+ pandn xmm0,xmm2
+ movdqa xmm2,xmm4
+ pandn xmm1,xmm3
+ movdqa xmm3,xmm4
+ pand xmm2,XMMWORD[448+rsp]
+ pand xmm3,XMMWORD[((448+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+ movdqu XMMWORD[64+rdi],xmm2
+ movdqu XMMWORD[80+rdi],xmm3
+
+ movdqa xmm0,xmm5
+ movdqa xmm1,xmm5
+ pandn xmm0,XMMWORD[288+rsp]
+ movdqa xmm2,xmm5
+ pandn xmm1,XMMWORD[((288+16))+rsp]
+ movdqa xmm3,xmm5
+ pand xmm2,XMMWORD[480+rsp]
+ pand xmm3,XMMWORD[((480+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm4
+ pandn xmm0,xmm2
+ movdqa xmm2,xmm4
+ pandn xmm1,xmm3
+ movdqa xmm3,xmm4
+ pand xmm2,XMMWORD[384+rsp]
+ pand xmm3,XMMWORD[((384+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+ movdqu XMMWORD[rdi],xmm2
+ movdqu XMMWORD[16+rdi],xmm3
+
+ movdqa xmm0,xmm5
+ movdqa xmm1,xmm5
+ pandn xmm0,XMMWORD[320+rsp]
+ movdqa xmm2,xmm5
+ pandn xmm1,XMMWORD[((320+16))+rsp]
+ movdqa xmm3,xmm5
+ pand xmm2,XMMWORD[512+rsp]
+ pand xmm3,XMMWORD[((512+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm4
+ pandn xmm0,xmm2
+ movdqa xmm2,xmm4
+ pandn xmm1,xmm3
+ movdqa xmm3,xmm4
+ pand xmm2,XMMWORD[416+rsp]
+ pand xmm3,XMMWORD[((416+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+ movdqu XMMWORD[32+rdi],xmm2
+ movdqu XMMWORD[48+rdi],xmm3
+
+$L$add_donex:
+ lea rsi,[((576+56))+rsp]
+
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbx,QWORD[((-16))+rsi]
+
+ mov rbp,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$point_addx_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_ecp_nistz256_point_addx:
+
+ALIGN 32
+ecp_nistz256_point_add_affinex:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_ecp_nistz256_point_add_affinex:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+$L$point_add_affinex:
+ push rbp
+
+ push rbx
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ sub rsp,32*15+8
+
+$L$add_affinex_body:
+
+ movdqu xmm0,XMMWORD[rsi]
+ mov rbx,rdx
+ movdqu xmm1,XMMWORD[16+rsi]
+ movdqu xmm2,XMMWORD[32+rsi]
+ movdqu xmm3,XMMWORD[48+rsi]
+ movdqu xmm4,XMMWORD[64+rsi]
+ movdqu xmm5,XMMWORD[80+rsi]
+ mov rdx,QWORD[((64+0))+rsi]
+ mov r14,QWORD[((64+8))+rsi]
+ mov r15,QWORD[((64+16))+rsi]
+ mov r8,QWORD[((64+24))+rsi]
+ movdqa XMMWORD[320+rsp],xmm0
+ movdqa XMMWORD[(320+16)+rsp],xmm1
+ movdqa XMMWORD[352+rsp],xmm2
+ movdqa XMMWORD[(352+16)+rsp],xmm3
+ movdqa XMMWORD[384+rsp],xmm4
+ movdqa XMMWORD[(384+16)+rsp],xmm5
+ por xmm5,xmm4
+
+ movdqu xmm0,XMMWORD[rbx]
+ pshufd xmm3,xmm5,0xb1
+ movdqu xmm1,XMMWORD[16+rbx]
+ movdqu xmm2,XMMWORD[32+rbx]
+ por xmm5,xmm3
+ movdqu xmm3,XMMWORD[48+rbx]
+ movdqa XMMWORD[416+rsp],xmm0
+ pshufd xmm4,xmm5,0x1e
+ movdqa XMMWORD[(416+16)+rsp],xmm1
+ por xmm1,xmm0
+DB 102,72,15,110,199
+ movdqa XMMWORD[448+rsp],xmm2
+ movdqa XMMWORD[(448+16)+rsp],xmm3
+ por xmm3,xmm2
+ por xmm5,xmm4
+ pxor xmm4,xmm4
+ por xmm3,xmm1
+
+ lea rsi,[((64-128))+rsi]
+ lea rdi,[32+rsp]
+ call __ecp_nistz256_sqr_montx
+
+ pcmpeqd xmm5,xmm4
+ pshufd xmm4,xmm3,0xb1
+ mov rdx,QWORD[rbx]
+
+ mov r9,r12
+ por xmm4,xmm3
+ pshufd xmm5,xmm5,0
+ pshufd xmm3,xmm4,0x1e
+ mov r10,r13
+ por xmm4,xmm3
+ pxor xmm3,xmm3
+ mov r11,r14
+ pcmpeqd xmm4,xmm3
+ pshufd xmm4,xmm4,0
+
+ lea rsi,[((32-128))+rsp]
+ mov r12,r15
+ lea rdi,[rsp]
+ call __ecp_nistz256_mul_montx
+
+ lea rbx,[320+rsp]
+ lea rdi,[64+rsp]
+ call __ecp_nistz256_sub_fromx
+
+ mov rdx,QWORD[384+rsp]
+ lea rbx,[384+rsp]
+ mov r9,QWORD[((0+32))+rsp]
+ mov r10,QWORD[((8+32))+rsp]
+ lea rsi,[((-128+32))+rsp]
+ mov r11,QWORD[((16+32))+rsp]
+ mov r12,QWORD[((24+32))+rsp]
+ lea rdi,[32+rsp]
+ call __ecp_nistz256_mul_montx
+
+ mov rdx,QWORD[384+rsp]
+ lea rbx,[384+rsp]
+ mov r9,QWORD[((0+64))+rsp]
+ mov r10,QWORD[((8+64))+rsp]
+ lea rsi,[((-128+64))+rsp]
+ mov r11,QWORD[((16+64))+rsp]
+ mov r12,QWORD[((24+64))+rsp]
+ lea rdi,[288+rsp]
+ call __ecp_nistz256_mul_montx
+
+ mov rdx,QWORD[448+rsp]
+ lea rbx,[448+rsp]
+ mov r9,QWORD[((0+32))+rsp]
+ mov r10,QWORD[((8+32))+rsp]
+ lea rsi,[((-128+32))+rsp]
+ mov r11,QWORD[((16+32))+rsp]
+ mov r12,QWORD[((24+32))+rsp]
+ lea rdi,[32+rsp]
+ call __ecp_nistz256_mul_montx
+
+ lea rbx,[352+rsp]
+ lea rdi,[96+rsp]
+ call __ecp_nistz256_sub_fromx
+
+ mov rdx,QWORD[((0+64))+rsp]
+ mov r14,QWORD[((8+64))+rsp]
+ lea rsi,[((-128+64))+rsp]
+ mov r15,QWORD[((16+64))+rsp]
+ mov r8,QWORD[((24+64))+rsp]
+ lea rdi,[128+rsp]
+ call __ecp_nistz256_sqr_montx
+
+ mov rdx,QWORD[((0+96))+rsp]
+ mov r14,QWORD[((8+96))+rsp]
+ lea rsi,[((-128+96))+rsp]
+ mov r15,QWORD[((16+96))+rsp]
+ mov r8,QWORD[((24+96))+rsp]
+ lea rdi,[192+rsp]
+ call __ecp_nistz256_sqr_montx
+
+ mov rdx,QWORD[128+rsp]
+ lea rbx,[128+rsp]
+ mov r9,QWORD[((0+64))+rsp]
+ mov r10,QWORD[((8+64))+rsp]
+ lea rsi,[((-128+64))+rsp]
+ mov r11,QWORD[((16+64))+rsp]
+ mov r12,QWORD[((24+64))+rsp]
+ lea rdi,[160+rsp]
+ call __ecp_nistz256_mul_montx
+
+ mov rdx,QWORD[320+rsp]
+ lea rbx,[320+rsp]
+ mov r9,QWORD[((0+128))+rsp]
+ mov r10,QWORD[((8+128))+rsp]
+ lea rsi,[((-128+128))+rsp]
+ mov r11,QWORD[((16+128))+rsp]
+ mov r12,QWORD[((24+128))+rsp]
+ lea rdi,[rsp]
+ call __ecp_nistz256_mul_montx
+
+
+
+
+ xor r11,r11
+ add r12,r12
+ lea rsi,[192+rsp]
+ adc r13,r13
+ mov rax,r12
+ adc r8,r8
+ adc r9,r9
+ mov rbp,r13
+ adc r11,0
+
+ sub r12,-1
+ mov rcx,r8
+ sbb r13,r14
+ sbb r8,0
+ mov r10,r9
+ sbb r9,r15
+ sbb r11,0
+
+ cmovc r12,rax
+ mov rax,QWORD[rsi]
+ cmovc r13,rbp
+ mov rbp,QWORD[8+rsi]
+ cmovc r8,rcx
+ mov rcx,QWORD[16+rsi]
+ cmovc r9,r10
+ mov r10,QWORD[24+rsi]
+
+ call __ecp_nistz256_subx
+
+ lea rbx,[160+rsp]
+ lea rdi,[224+rsp]
+ call __ecp_nistz256_sub_fromx
+
+ mov rax,QWORD[((0+0))+rsp]
+ mov rbp,QWORD[((0+8))+rsp]
+ mov rcx,QWORD[((0+16))+rsp]
+ mov r10,QWORD[((0+24))+rsp]
+ lea rdi,[64+rsp]
+
+ call __ecp_nistz256_subx
+
+ mov QWORD[rdi],r12
+ mov QWORD[8+rdi],r13
+ mov QWORD[16+rdi],r8
+ mov QWORD[24+rdi],r9
+ mov rdx,QWORD[352+rsp]
+ lea rbx,[352+rsp]
+ mov r9,QWORD[((0+160))+rsp]
+ mov r10,QWORD[((8+160))+rsp]
+ lea rsi,[((-128+160))+rsp]
+ mov r11,QWORD[((16+160))+rsp]
+ mov r12,QWORD[((24+160))+rsp]
+ lea rdi,[32+rsp]
+ call __ecp_nistz256_mul_montx
+
+ mov rdx,QWORD[96+rsp]
+ lea rbx,[96+rsp]
+ mov r9,QWORD[((0+64))+rsp]
+ mov r10,QWORD[((8+64))+rsp]
+ lea rsi,[((-128+64))+rsp]
+ mov r11,QWORD[((16+64))+rsp]
+ mov r12,QWORD[((24+64))+rsp]
+ lea rdi,[64+rsp]
+ call __ecp_nistz256_mul_montx
+
+ lea rbx,[32+rsp]
+ lea rdi,[256+rsp]
+ call __ecp_nistz256_sub_fromx
+
+DB 102,72,15,126,199
+
+ movdqa xmm0,xmm5
+ movdqa xmm1,xmm5
+ pandn xmm0,XMMWORD[288+rsp]
+ movdqa xmm2,xmm5
+ pandn xmm1,XMMWORD[((288+16))+rsp]
+ movdqa xmm3,xmm5
+ pand xmm2,XMMWORD[$L$ONE_mont]
+ pand xmm3,XMMWORD[(($L$ONE_mont+16))]
+ por xmm2,xmm0
+ por xmm3,xmm1
+
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm4
+ pandn xmm0,xmm2
+ movdqa xmm2,xmm4
+ pandn xmm1,xmm3
+ movdqa xmm3,xmm4
+ pand xmm2,XMMWORD[384+rsp]
+ pand xmm3,XMMWORD[((384+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+ movdqu XMMWORD[64+rdi],xmm2
+ movdqu XMMWORD[80+rdi],xmm3
+
+ movdqa xmm0,xmm5
+ movdqa xmm1,xmm5
+ pandn xmm0,XMMWORD[224+rsp]
+ movdqa xmm2,xmm5
+ pandn xmm1,XMMWORD[((224+16))+rsp]
+ movdqa xmm3,xmm5
+ pand xmm2,XMMWORD[416+rsp]
+ pand xmm3,XMMWORD[((416+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm4
+ pandn xmm0,xmm2
+ movdqa xmm2,xmm4
+ pandn xmm1,xmm3
+ movdqa xmm3,xmm4
+ pand xmm2,XMMWORD[320+rsp]
+ pand xmm3,XMMWORD[((320+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+ movdqu XMMWORD[rdi],xmm2
+ movdqu XMMWORD[16+rdi],xmm3
+
+ movdqa xmm0,xmm5
+ movdqa xmm1,xmm5
+ pandn xmm0,XMMWORD[256+rsp]
+ movdqa xmm2,xmm5
+ pandn xmm1,XMMWORD[((256+16))+rsp]
+ movdqa xmm3,xmm5
+ pand xmm2,XMMWORD[448+rsp]
+ pand xmm3,XMMWORD[((448+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm4
+ pandn xmm0,xmm2
+ movdqa xmm2,xmm4
+ pandn xmm1,xmm3
+ movdqa xmm3,xmm4
+ pand xmm2,XMMWORD[352+rsp]
+ pand xmm3,XMMWORD[((352+16))+rsp]
+ por xmm2,xmm0
+ por xmm3,xmm1
+ movdqu XMMWORD[32+rdi],xmm2
+ movdqu XMMWORD[48+rdi],xmm3
+
+ lea rsi,[((480+56))+rsp]
+
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbx,QWORD[((-16))+rsi]
+
+ mov rbp,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$add_affinex_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_ecp_nistz256_point_add_affinex:
+EXTERN __imp_RtlVirtualUnwind
+
+
+ALIGN 16
+short_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ mov rsi,QWORD[8+r9]
+ mov r11,QWORD[56+r9]
+
+ mov r10d,DWORD[r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$common_seh_tail
+
+ mov rax,QWORD[152+r8]
+
+ mov r10d,DWORD[4+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$common_seh_tail
+
+ lea rax,[16+rax]
+
+ mov r12,QWORD[((-8))+rax]
+ mov r13,QWORD[((-16))+rax]
+ mov QWORD[216+r8],r12
+ mov QWORD[224+r8],r13
+
+ jmp NEAR $L$common_seh_tail
+
+
+
+ALIGN 16
+full_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ mov rsi,QWORD[8+r9]
+ mov r11,QWORD[56+r9]
+
+ mov r10d,DWORD[r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$common_seh_tail
+
+ mov rax,QWORD[152+r8]
+
+ mov r10d,DWORD[4+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$common_seh_tail
+
+ mov r10d,DWORD[8+r11]
+ lea rax,[r10*1+rax]
+
+ mov rbp,QWORD[((-8))+rax]
+ mov rbx,QWORD[((-16))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov r15,QWORD[((-48))+rax]
+ mov QWORD[144+r8],rbx
+ mov QWORD[160+r8],rbp
+ mov QWORD[216+r8],r12
+ mov QWORD[224+r8],r13
+ mov QWORD[232+r8],r14
+ mov QWORD[240+r8],r15
+
+$L$common_seh_tail:
+ mov rdi,QWORD[8+rax]
+ mov rsi,QWORD[16+rax]
+ mov QWORD[152+r8],rax
+ mov QWORD[168+r8],rsi
+ mov QWORD[176+r8],rdi
+
+ mov rdi,QWORD[40+r9]
+ mov rsi,r8
+ mov ecx,154
+ DD 0xa548f3fc
+
+ mov rsi,r9
+ xor rcx,rcx
+ mov rdx,QWORD[8+rsi]
+ mov r8,QWORD[rsi]
+ mov r9,QWORD[16+rsi]
+ mov r10,QWORD[40+rsi]
+ lea r11,[56+rsi]
+ lea r12,[24+rsi]
+ mov QWORD[32+rsp],r10
+ mov QWORD[40+rsp],r11
+ mov QWORD[48+rsp],r12
+ mov QWORD[56+rsp],rcx
+ call QWORD[__imp_RtlVirtualUnwind]
+
+ mov eax,1
+ add rsp,64
+ popfq
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ ret
+
+
+section .pdata rdata align=4
+ALIGN 4
+ DD $L$SEH_begin_ecp_nistz256_neg wrt ..imagebase
+ DD $L$SEH_end_ecp_nistz256_neg wrt ..imagebase
+ DD $L$SEH_info_ecp_nistz256_neg wrt ..imagebase
+
+ DD $L$SEH_begin_ecp_nistz256_ord_mul_mont wrt ..imagebase
+ DD $L$SEH_end_ecp_nistz256_ord_mul_mont wrt ..imagebase
+ DD $L$SEH_info_ecp_nistz256_ord_mul_mont wrt ..imagebase
+
+ DD $L$SEH_begin_ecp_nistz256_ord_sqr_mont wrt ..imagebase
+ DD $L$SEH_end_ecp_nistz256_ord_sqr_mont wrt ..imagebase
+ DD $L$SEH_info_ecp_nistz256_ord_sqr_mont wrt ..imagebase
+ DD $L$SEH_begin_ecp_nistz256_ord_mul_montx wrt ..imagebase
+ DD $L$SEH_end_ecp_nistz256_ord_mul_montx wrt ..imagebase
+ DD $L$SEH_info_ecp_nistz256_ord_mul_montx wrt ..imagebase
+
+ DD $L$SEH_begin_ecp_nistz256_ord_sqr_montx wrt ..imagebase
+ DD $L$SEH_end_ecp_nistz256_ord_sqr_montx wrt ..imagebase
+ DD $L$SEH_info_ecp_nistz256_ord_sqr_montx wrt ..imagebase
+ DD $L$SEH_begin_ecp_nistz256_mul_mont wrt ..imagebase
+ DD $L$SEH_end_ecp_nistz256_mul_mont wrt ..imagebase
+ DD $L$SEH_info_ecp_nistz256_mul_mont wrt ..imagebase
+
+ DD $L$SEH_begin_ecp_nistz256_sqr_mont wrt ..imagebase
+ DD $L$SEH_end_ecp_nistz256_sqr_mont wrt ..imagebase
+ DD $L$SEH_info_ecp_nistz256_sqr_mont wrt ..imagebase
+
+ DD $L$SEH_begin_ecp_nistz256_select_w5 wrt ..imagebase
+ DD $L$SEH_end_ecp_nistz256_select_w5 wrt ..imagebase
+ DD $L$SEH_info_ecp_nistz256_select_wX wrt ..imagebase
+
+ DD $L$SEH_begin_ecp_nistz256_select_w7 wrt ..imagebase
+ DD $L$SEH_end_ecp_nistz256_select_w7 wrt ..imagebase
+ DD $L$SEH_info_ecp_nistz256_select_wX wrt ..imagebase
+ DD $L$SEH_begin_ecp_nistz256_avx2_select_w5 wrt ..imagebase
+ DD $L$SEH_end_ecp_nistz256_avx2_select_w5 wrt ..imagebase
+ DD $L$SEH_info_ecp_nistz256_avx2_select_wX wrt ..imagebase
+
+ DD $L$SEH_begin_ecp_nistz256_avx2_select_w7 wrt ..imagebase
+ DD $L$SEH_end_ecp_nistz256_avx2_select_w7 wrt ..imagebase
+ DD $L$SEH_info_ecp_nistz256_avx2_select_wX wrt ..imagebase
+ DD $L$SEH_begin_ecp_nistz256_point_double wrt ..imagebase
+ DD $L$SEH_end_ecp_nistz256_point_double wrt ..imagebase
+ DD $L$SEH_info_ecp_nistz256_point_double wrt ..imagebase
+
+ DD $L$SEH_begin_ecp_nistz256_point_add wrt ..imagebase
+ DD $L$SEH_end_ecp_nistz256_point_add wrt ..imagebase
+ DD $L$SEH_info_ecp_nistz256_point_add wrt ..imagebase
+
+ DD $L$SEH_begin_ecp_nistz256_point_add_affine wrt ..imagebase
+ DD $L$SEH_end_ecp_nistz256_point_add_affine wrt ..imagebase
+ DD $L$SEH_info_ecp_nistz256_point_add_affine wrt ..imagebase
+ DD $L$SEH_begin_ecp_nistz256_point_doublex wrt ..imagebase
+ DD $L$SEH_end_ecp_nistz256_point_doublex wrt ..imagebase
+ DD $L$SEH_info_ecp_nistz256_point_doublex wrt ..imagebase
+
+ DD $L$SEH_begin_ecp_nistz256_point_addx wrt ..imagebase
+ DD $L$SEH_end_ecp_nistz256_point_addx wrt ..imagebase
+ DD $L$SEH_info_ecp_nistz256_point_addx wrt ..imagebase
+
+ DD $L$SEH_begin_ecp_nistz256_point_add_affinex wrt ..imagebase
+ DD $L$SEH_end_ecp_nistz256_point_add_affinex wrt ..imagebase
+ DD $L$SEH_info_ecp_nistz256_point_add_affinex wrt ..imagebase
+
+section .xdata rdata align=8
+ALIGN 8
+$L$SEH_info_ecp_nistz256_neg:
+ DB 9,0,0,0
+ DD short_handler wrt ..imagebase
+ DD $L$neg_body wrt ..imagebase,$L$neg_epilogue wrt ..imagebase
+$L$SEH_info_ecp_nistz256_ord_mul_mont:
+ DB 9,0,0,0
+ DD full_handler wrt ..imagebase
+ DD $L$ord_mul_body wrt ..imagebase,$L$ord_mul_epilogue wrt ..imagebase
+ DD 48,0
+$L$SEH_info_ecp_nistz256_ord_sqr_mont:
+ DB 9,0,0,0
+ DD full_handler wrt ..imagebase
+ DD $L$ord_sqr_body wrt ..imagebase,$L$ord_sqr_epilogue wrt ..imagebase
+ DD 48,0
+$L$SEH_info_ecp_nistz256_ord_mul_montx:
+ DB 9,0,0,0
+ DD full_handler wrt ..imagebase
+ DD $L$ord_mulx_body wrt ..imagebase,$L$ord_mulx_epilogue wrt ..imagebase
+ DD 48,0
+$L$SEH_info_ecp_nistz256_ord_sqr_montx:
+ DB 9,0,0,0
+ DD full_handler wrt ..imagebase
+ DD $L$ord_sqrx_body wrt ..imagebase,$L$ord_sqrx_epilogue wrt ..imagebase
+ DD 48,0
+$L$SEH_info_ecp_nistz256_mul_mont:
+ DB 9,0,0,0
+ DD full_handler wrt ..imagebase
+ DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
+ DD 48,0
+$L$SEH_info_ecp_nistz256_sqr_mont:
+ DB 9,0,0,0
+ DD full_handler wrt ..imagebase
+ DD $L$sqr_body wrt ..imagebase,$L$sqr_epilogue wrt ..imagebase
+ DD 48,0
+$L$SEH_info_ecp_nistz256_select_wX:
+ DB 0x01,0x33,0x16,0x00
+ DB 0x33,0xf8,0x09,0x00
+ DB 0x2e,0xe8,0x08,0x00
+ DB 0x29,0xd8,0x07,0x00
+ DB 0x24,0xc8,0x06,0x00
+ DB 0x1f,0xb8,0x05,0x00
+ DB 0x1a,0xa8,0x04,0x00
+ DB 0x15,0x98,0x03,0x00
+ DB 0x10,0x88,0x02,0x00
+ DB 0x0c,0x78,0x01,0x00
+ DB 0x08,0x68,0x00,0x00
+ DB 0x04,0x01,0x15,0x00
+ALIGN 8
+$L$SEH_info_ecp_nistz256_avx2_select_wX:
+ DB 0x01,0x36,0x17,0x0b
+ DB 0x36,0xf8,0x09,0x00
+ DB 0x31,0xe8,0x08,0x00
+ DB 0x2c,0xd8,0x07,0x00
+ DB 0x27,0xc8,0x06,0x00
+ DB 0x22,0xb8,0x05,0x00
+ DB 0x1d,0xa8,0x04,0x00
+ DB 0x18,0x98,0x03,0x00
+ DB 0x13,0x88,0x02,0x00
+ DB 0x0e,0x78,0x01,0x00
+ DB 0x09,0x68,0x00,0x00
+ DB 0x04,0x01,0x15,0x00
+ DB 0x00,0xb3,0x00,0x00
+ALIGN 8
+$L$SEH_info_ecp_nistz256_point_double:
+ DB 9,0,0,0
+ DD full_handler wrt ..imagebase
+ DD $L$point_doubleq_body wrt ..imagebase,$L$point_doubleq_epilogue wrt ..imagebase
+ DD 32*5+56,0
+$L$SEH_info_ecp_nistz256_point_add:
+ DB 9,0,0,0
+ DD full_handler wrt ..imagebase
+ DD $L$point_addq_body wrt ..imagebase,$L$point_addq_epilogue wrt ..imagebase
+ DD 32*18+56,0
+$L$SEH_info_ecp_nistz256_point_add_affine:
+ DB 9,0,0,0
+ DD full_handler wrt ..imagebase
+ DD $L$add_affineq_body wrt ..imagebase,$L$add_affineq_epilogue wrt ..imagebase
+ DD 32*15+56,0
+ALIGN 8
+$L$SEH_info_ecp_nistz256_point_doublex:
+ DB 9,0,0,0
+ DD full_handler wrt ..imagebase
+ DD $L$point_doublex_body wrt ..imagebase,$L$point_doublex_epilogue wrt ..imagebase
+ DD 32*5+56,0
+$L$SEH_info_ecp_nistz256_point_addx:
+ DB 9,0,0,0
+ DD full_handler wrt ..imagebase
+ DD $L$point_addx_body wrt ..imagebase,$L$point_addx_epilogue wrt ..imagebase
+ DD 32*18+56,0
+$L$SEH_info_ecp_nistz256_point_add_affinex:
+ DB 9,0,0,0
+ DD full_handler wrt ..imagebase
+ DD $L$add_affinex_body wrt ..imagebase,$L$add_affinex_epilogue wrt ..imagebase
+ DD 32*15+56,0
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/p256_beeu-armv8-asm-apple.S b/gen/bcm/p256_beeu-armv8-asm-apple.S
new file mode 100644
index 0000000..49ea9b8
--- /dev/null
+++ b/gen/bcm/p256_beeu-armv8-asm-apple.S
@@ -0,0 +1,309 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include "openssl/arm_arch.h"
+
+.text
+.globl _beeu_mod_inverse_vartime
+.private_extern _beeu_mod_inverse_vartime
+
+.align 4
+_beeu_mod_inverse_vartime:
+ // Reserve enough space for 14 8-byte registers on the stack
+ // in the first stp call for x29, x30.
+ // Then store the remaining callee-saved registers.
+ //
+ // | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 |
+ // ^ ^
+ // sp <------------------- 112 bytes ----------------> old sp
+ // x29 (FP)
+ //
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-112]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ stp x0,x2,[sp,#96]
+
+ // B = b3..b0 := a
+ ldp x25,x26,[x1]
+ ldp x27,x28,[x1,#16]
+
+ // n3..n0 := n
+ // Note: the value of input params are changed in the following.
+ ldp x0,x1,[x2]
+ ldp x2,x30,[x2,#16]
+
+ // A = a3..a0 := n
+ mov x21, x0
+ mov x22, x1
+ mov x23, x2
+ mov x24, x30
+
+ // X = x4..x0 := 1
+ mov x3, #1
+ eor x4, x4, x4
+ eor x5, x5, x5
+ eor x6, x6, x6
+ eor x7, x7, x7
+
+ // Y = y4..y0 := 0
+ eor x8, x8, x8
+ eor x9, x9, x9
+ eor x10, x10, x10
+ eor x11, x11, x11
+ eor x12, x12, x12
+
+Lbeeu_loop:
+ // if B == 0, jump to .Lbeeu_loop_end
+ orr x14, x25, x26
+ orr x14, x14, x27
+
+ // reverse the bit order of x25. This is needed for clz after this macro
+ rbit x15, x25
+
+ orr x14, x14, x28
+ cbz x14,Lbeeu_loop_end
+
+
+ // 0 < B < |n|,
+ // 0 < A <= |n|,
+ // (1) X*a == B (mod |n|),
+ // (2) (-1)*Y*a == A (mod |n|)
+
+ // Now divide B by the maximum possible power of two in the
+ // integers, and divide X by the same value mod |n|.
+ // When we're done, (1) still holds.
+
+ // shift := number of trailing 0s in x25
+ // ( = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO)
+ clz x13, x15
+
+ // If there is no shift, goto shift_A_Y
+ cbz x13, Lbeeu_shift_A_Y
+
+ // Shift B right by "x13" bits
+ neg x14, x13
+ lsr x25, x25, x13
+ lsl x15, x26, x14
+
+ lsr x26, x26, x13
+ lsl x19, x27, x14
+
+ orr x25, x25, x15
+
+ lsr x27, x27, x13
+ lsl x20, x28, x14
+
+ orr x26, x26, x19
+
+ lsr x28, x28, x13
+
+ orr x27, x27, x20
+
+
+ // Shift X right by "x13" bits, adding n whenever X becomes odd.
+ // x13--;
+ // x14 := 0; needed in the addition to the most significant word in SHIFT1
+ eor x14, x14, x14
+Lbeeu_shift_loop_X:
+ tbz x3, #0, Lshift1_0
+ adds x3, x3, x0
+ adcs x4, x4, x1
+ adcs x5, x5, x2
+ adcs x6, x6, x30
+ adc x7, x7, x14
+Lshift1_0:
+ // var0 := [var1|var0]<64..1>;
+ // i.e. concatenate var1 and var0,
+ // extract bits <64..1> from the resulting 128-bit value
+ // and put them in var0
+ extr x3, x4, x3, #1
+ extr x4, x5, x4, #1
+ extr x5, x6, x5, #1
+ extr x6, x7, x6, #1
+ lsr x7, x7, #1
+
+ subs x13, x13, #1
+ bne Lbeeu_shift_loop_X
+
+ // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
+ // with the following differences:
+ // - "x13" is set directly to the number of trailing 0s in B
+ // (using rbit and clz instructions)
+ // - The loop is only used to call SHIFT1(X)
+ // and x13 is decreased while executing the X loop.
+ // - SHIFT256(B, x13) is performed before right-shifting X; they are independent
+
+Lbeeu_shift_A_Y:
+ // Same for A and Y.
+ // Afterwards, (2) still holds.
+ // Reverse the bit order of x21
+ // x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
+ rbit x15, x21
+ clz x13, x15
+
+ // If there is no shift, goto |B-A|, X+Y update
+ cbz x13, Lbeeu_update_B_X_or_A_Y
+
+ // Shift A right by "x13" bits
+ neg x14, x13
+ lsr x21, x21, x13
+ lsl x15, x22, x14
+
+ lsr x22, x22, x13
+ lsl x19, x23, x14
+
+ orr x21, x21, x15
+
+ lsr x23, x23, x13
+ lsl x20, x24, x14
+
+ orr x22, x22, x19
+
+ lsr x24, x24, x13
+
+ orr x23, x23, x20
+
+
+ // Shift Y right by "x13" bits, adding n whenever Y becomes odd.
+ // x13--;
+ // x14 := 0; needed in the addition to the most significant word in SHIFT1
+ eor x14, x14, x14
+Lbeeu_shift_loop_Y:
+ tbz x8, #0, Lshift1_1
+ adds x8, x8, x0
+ adcs x9, x9, x1
+ adcs x10, x10, x2
+ adcs x11, x11, x30
+ adc x12, x12, x14
+Lshift1_1:
+ // var0 := [var1|var0]<64..1>;
+ // i.e. concatenate var1 and var0,
+ // extract bits <64..1> from the resulting 128-bit value
+ // and put them in var0
+ extr x8, x9, x8, #1
+ extr x9, x10, x9, #1
+ extr x10, x11, x10, #1
+ extr x11, x12, x11, #1
+ lsr x12, x12, #1
+
+ subs x13, x13, #1
+ bne Lbeeu_shift_loop_Y
+
+Lbeeu_update_B_X_or_A_Y:
+ // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow)
+ // Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words
+ // without taking a sign bit if generated. The lack of a carry would
+ // indicate a negative result. See, for example,
+ // https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
+ subs x14, x25, x21
+ sbcs x15, x26, x22
+ sbcs x19, x27, x23
+ sbcs x20, x28, x24
+ bcs Lbeeu_B_greater_than_A
+
+ // Else A > B =>
+ // A := A - B; Y := Y + X; goto beginning of the loop
+ subs x21, x21, x25
+ sbcs x22, x22, x26
+ sbcs x23, x23, x27
+ sbcs x24, x24, x28
+
+ adds x8, x8, x3
+ adcs x9, x9, x4
+ adcs x10, x10, x5
+ adcs x11, x11, x6
+ adc x12, x12, x7
+ b Lbeeu_loop
+
+Lbeeu_B_greater_than_A:
+ // Continue with B > A =>
+ // B := B - A; X := X + Y; goto beginning of the loop
+ mov x25, x14
+ mov x26, x15
+ mov x27, x19
+ mov x28, x20
+
+ adds x3, x3, x8
+ adcs x4, x4, x9
+ adcs x5, x5, x10
+ adcs x6, x6, x11
+ adc x7, x7, x12
+ b Lbeeu_loop
+
+Lbeeu_loop_end:
+ // The Euclid's algorithm loop ends when A == gcd(a,n);
+ // this would be 1, when a and n are co-prime (i.e. do not have a common factor).
+ // Since (-1)*Y*a == A (mod |n|), Y>0
+ // then out = -Y mod n
+
+ // Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|)
+ // Is A-1 == 0?
+ // If not, fail.
+ sub x14, x21, #1
+ orr x14, x14, x22
+ orr x14, x14, x23
+ orr x14, x14, x24
+ cbnz x14, Lbeeu_err
+
+ // If Y>n ==> Y:=Y-n
+Lbeeu_reduction_loop:
+ // x_i := y_i - n_i (X is no longer needed, use it as temp)
+ // (x14 = 0 from above)
+ subs x3, x8, x0
+ sbcs x4, x9, x1
+ sbcs x5, x10, x2
+ sbcs x6, x11, x30
+ sbcs x7, x12, x14
+
+ // If result is non-negative (i.e., cs = carry set = no borrow),
+ // y_i := x_i; goto reduce again
+ // else
+ // y_i := y_i; continue
+ csel x8, x3, x8, cs
+ csel x9, x4, x9, cs
+ csel x10, x5, x10, cs
+ csel x11, x6, x11, cs
+ csel x12, x7, x12, cs
+ bcs Lbeeu_reduction_loop
+
+ // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
+ // out = -Y = n-Y
+ subs x8, x0, x8
+ sbcs x9, x1, x9
+ sbcs x10, x2, x10
+ sbcs x11, x30, x11
+
+ // Save Y in output (out (x0) was saved on the stack)
+ ldr x3, [sp,#96]
+ stp x8, x9, [x3]
+ stp x10, x11, [x3,#16]
+ // return 1 (success)
+ mov x0, #1
+ b Lbeeu_finish
+
+Lbeeu_err:
+ // return 0 (error)
+ eor x0, x0, x0
+
+Lbeeu_finish:
+ // Restore callee-saved registers, except x0, x2
+ add sp,x29,#0
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldp x25,x26,[sp,#64]
+ ldp x27,x28,[sp,#80]
+ ldp x29,x30,[sp],#112
+
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
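(Editorial aside, not part of any generated file: the p256_beeu-armv8 variants above, and the x86_64 ones further below, all implement the binary extended Euclidean algorithm that their in-file comments describe — maintain X*a == B (mod n) and (-1)*Y*a == A (mod n), strip factors of two from B and A while halving X and Y mod n, subtract the smaller of A/B from the larger, and once A reaches 1 output n - Y. A rough Go sketch of that flow, using math/big in place of the fixed 4/5-word register arithmetic, might look like the following; beeuModInverseVartime is a hypothetical name used only for illustration and assumes an odd modulus n.)

    package main

    import (
        "fmt"
        "math/big"
    )

    // beeuModInverseVartime computes a^-1 mod n (n odd) in variable time,
    // following the same shape as the generated beeu_mod_inverse_vartime code.
    func beeuModInverseVartime(a, n *big.Int) (*big.Int, bool) {
        A := new(big.Int).Set(n) // invariant (2): (-1)*Y*a == A (mod n)
        B := new(big.Int).Set(a) // invariant (1):      X*a == B (mod n)
        X := big.NewInt(1)
        Y := big.NewInt(0)

        for B.Sign() != 0 {
            // Divide B by its largest power of two; halve X mod n by the
            // same amount, adding n whenever X is odd before shifting.
            for B.Bit(0) == 0 {
                B.Rsh(B, 1)
                if X.Bit(0) == 1 {
                    X.Add(X, n)
                }
                X.Rsh(X, 1)
            }
            // Same for A and Y; invariant (2) is preserved.
            for A.Bit(0) == 0 {
                A.Rsh(A, 1)
                if Y.Bit(0) == 1 {
                    Y.Add(Y, n)
                }
                Y.Rsh(Y, 1)
            }
            // Subtract the smaller from the larger and update X or Y,
            // mirroring the Lbeeu_update_B_X_or_A_Y branch.
            if B.Cmp(A) >= 0 {
                B.Sub(B, A) // B := B - A
                X.Add(X, Y) // X := X + Y
            } else {
                A.Sub(A, B) // A := A - B
                Y.Add(Y, X) // Y := Y + X
            }
        }
        // The loop ends with A == gcd(a, n); the inverse exists only if A == 1.
        if A.Cmp(big.NewInt(1)) != 0 {
            return nil, false
        }
        // (-1)*Y*a == 1 (mod n), so the inverse is n - (Y mod n).
        Y.Mod(Y, n)
        return new(big.Int).Sub(n, Y), true
    }

    func main() {
        n := big.NewInt(65537) // odd (prime) modulus
        a := big.NewInt(12345)
        inv, ok := beeuModInverseVartime(a, n)
        fmt.Println(inv, ok) // inv*a mod n == 1
    }

(The generated assembly additionally keeps X and Y in five words, since they can temporarily exceed n, and reduces Y by repeated subtraction at the end; math/big hides both details here.)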
diff --git a/gen/bcm/p256_beeu-armv8-asm-linux.S b/gen/bcm/p256_beeu-armv8-asm-linux.S
new file mode 100644
index 0000000..8e09b61
--- /dev/null
+++ b/gen/bcm/p256_beeu-armv8-asm-linux.S
@@ -0,0 +1,309 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include "openssl/arm_arch.h"
+
+.text
+.globl beeu_mod_inverse_vartime
+.hidden beeu_mod_inverse_vartime
+.type beeu_mod_inverse_vartime, %function
+.align 4
+beeu_mod_inverse_vartime:
+ // Reserve enough space for 14 8-byte registers on the stack
+ // in the first stp call for x29, x30.
+ // Then store the remaining callee-saved registers.
+ //
+ // | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 |
+ // ^ ^
+ // sp <------------------- 112 bytes ----------------> old sp
+ // x29 (FP)
+ //
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-112]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ stp x0,x2,[sp,#96]
+
+ // B = b3..b0 := a
+ ldp x25,x26,[x1]
+ ldp x27,x28,[x1,#16]
+
+ // n3..n0 := n
+ // Note: the value of input params are changed in the following.
+ ldp x0,x1,[x2]
+ ldp x2,x30,[x2,#16]
+
+ // A = a3..a0 := n
+ mov x21, x0
+ mov x22, x1
+ mov x23, x2
+ mov x24, x30
+
+ // X = x4..x0 := 1
+ mov x3, #1
+ eor x4, x4, x4
+ eor x5, x5, x5
+ eor x6, x6, x6
+ eor x7, x7, x7
+
+ // Y = y4..y0 := 0
+ eor x8, x8, x8
+ eor x9, x9, x9
+ eor x10, x10, x10
+ eor x11, x11, x11
+ eor x12, x12, x12
+
+.Lbeeu_loop:
+ // if B == 0, jump to .Lbeeu_loop_end
+ orr x14, x25, x26
+ orr x14, x14, x27
+
+ // reverse the bit order of x25. This is needed for clz after this macro
+ rbit x15, x25
+
+ orr x14, x14, x28
+ cbz x14,.Lbeeu_loop_end
+
+
+ // 0 < B < |n|,
+ // 0 < A <= |n|,
+ // (1) X*a == B (mod |n|),
+ // (2) (-1)*Y*a == A (mod |n|)
+
+ // Now divide B by the maximum possible power of two in the
+ // integers, and divide X by the same value mod |n|.
+ // When we're done, (1) still holds.
+
+ // shift := number of trailing 0s in x25
+ // ( = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO)
+ clz x13, x15
+
+ // If there is no shift, goto shift_A_Y
+ cbz x13, .Lbeeu_shift_A_Y
+
+ // Shift B right by "x13" bits
+ neg x14, x13
+ lsr x25, x25, x13
+ lsl x15, x26, x14
+
+ lsr x26, x26, x13
+ lsl x19, x27, x14
+
+ orr x25, x25, x15
+
+ lsr x27, x27, x13
+ lsl x20, x28, x14
+
+ orr x26, x26, x19
+
+ lsr x28, x28, x13
+
+ orr x27, x27, x20
+
+
+ // Shift X right by "x13" bits, adding n whenever X becomes odd.
+ // x13--;
+ // x14 := 0; needed in the addition to the most significant word in SHIFT1
+ eor x14, x14, x14
+.Lbeeu_shift_loop_X:
+ tbz x3, #0, .Lshift1_0
+ adds x3, x3, x0
+ adcs x4, x4, x1
+ adcs x5, x5, x2
+ adcs x6, x6, x30
+ adc x7, x7, x14
+.Lshift1_0:
+ // var0 := [var1|var0]<64..1>;
+ // i.e. concatenate var1 and var0,
+ // extract bits <64..1> from the resulting 128-bit value
+ // and put them in var0
+ extr x3, x4, x3, #1
+ extr x4, x5, x4, #1
+ extr x5, x6, x5, #1
+ extr x6, x7, x6, #1
+ lsr x7, x7, #1
+
+ subs x13, x13, #1
+ bne .Lbeeu_shift_loop_X
+
+ // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
+ // with the following differences:
+ // - "x13" is set directly to the number of trailing 0s in B
+ // (using rbit and clz instructions)
+ // - The loop is only used to call SHIFT1(X)
+ // and x13 is decreased while executing the X loop.
+ // - SHIFT256(B, x13) is performed before right-shifting X; they are independent
+
+.Lbeeu_shift_A_Y:
+ // Same for A and Y.
+ // Afterwards, (2) still holds.
+ // Reverse the bit order of x21
+ // x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
+ rbit x15, x21
+ clz x13, x15
+
+ // If there is no shift, goto |B-A|, X+Y update
+ cbz x13, .Lbeeu_update_B_X_or_A_Y
+
+ // Shift A right by "x13" bits
+ neg x14, x13
+ lsr x21, x21, x13
+ lsl x15, x22, x14
+
+ lsr x22, x22, x13
+ lsl x19, x23, x14
+
+ orr x21, x21, x15
+
+ lsr x23, x23, x13
+ lsl x20, x24, x14
+
+ orr x22, x22, x19
+
+ lsr x24, x24, x13
+
+ orr x23, x23, x20
+
+
+ // Shift Y right by "x13" bits, adding n whenever Y becomes odd.
+ // x13--;
+ // x14 := 0; needed in the addition to the most significant word in SHIFT1
+ eor x14, x14, x14
+.Lbeeu_shift_loop_Y:
+ tbz x8, #0, .Lshift1_1
+ adds x8, x8, x0
+ adcs x9, x9, x1
+ adcs x10, x10, x2
+ adcs x11, x11, x30
+ adc x12, x12, x14
+.Lshift1_1:
+ // var0 := [var1|var0]<64..1>;
+ // i.e. concatenate var1 and var0,
+ // extract bits <64..1> from the resulting 128-bit value
+ // and put them in var0
+ extr x8, x9, x8, #1
+ extr x9, x10, x9, #1
+ extr x10, x11, x10, #1
+ extr x11, x12, x11, #1
+ lsr x12, x12, #1
+
+ subs x13, x13, #1
+ bne .Lbeeu_shift_loop_Y
+
+.Lbeeu_update_B_X_or_A_Y:
+ // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow)
+ // Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words
+ // without taking a sign bit if generated. The lack of a carry would
+ // indicate a negative result. See, for example,
+ // https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
+ subs x14, x25, x21
+ sbcs x15, x26, x22
+ sbcs x19, x27, x23
+ sbcs x20, x28, x24
+ bcs .Lbeeu_B_greater_than_A
+
+ // Else A > B =>
+ // A := A - B; Y := Y + X; goto beginning of the loop
+ subs x21, x21, x25
+ sbcs x22, x22, x26
+ sbcs x23, x23, x27
+ sbcs x24, x24, x28
+
+ adds x8, x8, x3
+ adcs x9, x9, x4
+ adcs x10, x10, x5
+ adcs x11, x11, x6
+ adc x12, x12, x7
+ b .Lbeeu_loop
+
+.Lbeeu_B_greater_than_A:
+ // Continue with B > A =>
+ // B := B - A; X := X + Y; goto beginning of the loop
+ mov x25, x14
+ mov x26, x15
+ mov x27, x19
+ mov x28, x20
+
+ adds x3, x3, x8
+ adcs x4, x4, x9
+ adcs x5, x5, x10
+ adcs x6, x6, x11
+ adc x7, x7, x12
+ b .Lbeeu_loop
+
+.Lbeeu_loop_end:
+ // The Euclid's algorithm loop ends when A == gcd(a,n);
+ // this would be 1, when a and n are co-prime (i.e. do not have a common factor).
+ // Since (-1)*Y*a == A (mod |n|), Y>0
+ // then out = -Y mod n
+
+ // Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|)
+ // Is A-1 == 0?
+ // If not, fail.
+ sub x14, x21, #1
+ orr x14, x14, x22
+ orr x14, x14, x23
+ orr x14, x14, x24
+ cbnz x14, .Lbeeu_err
+
+ // If Y>n ==> Y:=Y-n
+.Lbeeu_reduction_loop:
+ // x_i := y_i - n_i (X is no longer needed, use it as temp)
+ // (x14 = 0 from above)
+ subs x3, x8, x0
+ sbcs x4, x9, x1
+ sbcs x5, x10, x2
+ sbcs x6, x11, x30
+ sbcs x7, x12, x14
+
+ // If result is non-negative (i.e., cs = carry set = no borrow),
+ // y_i := x_i; goto reduce again
+ // else
+ // y_i := y_i; continue
+ csel x8, x3, x8, cs
+ csel x9, x4, x9, cs
+ csel x10, x5, x10, cs
+ csel x11, x6, x11, cs
+ csel x12, x7, x12, cs
+ bcs .Lbeeu_reduction_loop
+
+ // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
+ // out = -Y = n-Y
+ subs x8, x0, x8
+ sbcs x9, x1, x9
+ sbcs x10, x2, x10
+ sbcs x11, x30, x11
+
+ // Save Y in output (out (x0) was saved on the stack)
+ ldr x3, [sp,#96]
+ stp x8, x9, [x3]
+ stp x10, x11, [x3,#16]
+ // return 1 (success)
+ mov x0, #1
+ b .Lbeeu_finish
+
+.Lbeeu_err:
+ // return 0 (error)
+ eor x0, x0, x0
+
+.Lbeeu_finish:
+ // Restore callee-saved registers, except x0, x2
+ add sp,x29,#0
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldp x25,x26,[sp,#64]
+ ldp x27,x28,[sp,#80]
+ ldp x29,x30,[sp],#112
+
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size beeu_mod_inverse_vartime,.-beeu_mod_inverse_vartime
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/p256_beeu-armv8-asm-win.S b/gen/bcm/p256_beeu-armv8-asm-win.S
new file mode 100644
index 0000000..ac6eb17
--- /dev/null
+++ b/gen/bcm/p256_beeu-armv8-asm-win.S
@@ -0,0 +1,309 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include "openssl/arm_arch.h"
+
+.text
+.globl beeu_mod_inverse_vartime
+
+
+.align 4
+beeu_mod_inverse_vartime:
+ // Reserve enough space for 14 8-byte registers on the stack
+ // in the first stp call for x29, x30.
+ // Then store the remaining callee-saved registers.
+ //
+ // | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 |
+ // ^ ^
+ // sp <------------------- 112 bytes ----------------> old sp
+ // x29 (FP)
+ //
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-112]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ stp x0,x2,[sp,#96]
+
+ // B = b3..b0 := a
+ ldp x25,x26,[x1]
+ ldp x27,x28,[x1,#16]
+
+ // n3..n0 := n
+ // Note: the value of input params are changed in the following.
+ ldp x0,x1,[x2]
+ ldp x2,x30,[x2,#16]
+
+ // A = a3..a0 := n
+ mov x21, x0
+ mov x22, x1
+ mov x23, x2
+ mov x24, x30
+
+ // X = x4..x0 := 1
+ mov x3, #1
+ eor x4, x4, x4
+ eor x5, x5, x5
+ eor x6, x6, x6
+ eor x7, x7, x7
+
+ // Y = y4..y0 := 0
+ eor x8, x8, x8
+ eor x9, x9, x9
+ eor x10, x10, x10
+ eor x11, x11, x11
+ eor x12, x12, x12
+
+Lbeeu_loop:
+ // if B == 0, jump to .Lbeeu_loop_end
+ orr x14, x25, x26
+ orr x14, x14, x27
+
+ // reverse the bit order of x25. This is needed for clz after this macro
+ rbit x15, x25
+
+ orr x14, x14, x28
+ cbz x14,Lbeeu_loop_end
+
+
+ // 0 < B < |n|,
+ // 0 < A <= |n|,
+ // (1) X*a == B (mod |n|),
+ // (2) (-1)*Y*a == A (mod |n|)
+
+ // Now divide B by the maximum possible power of two in the
+ // integers, and divide X by the same value mod |n|.
+ // When we're done, (1) still holds.
+
+ // shift := number of trailing 0s in x25
+ // ( = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO)
+ clz x13, x15
+
+ // If there is no shift, goto shift_A_Y
+ cbz x13, Lbeeu_shift_A_Y
+
+ // Shift B right by "x13" bits
+ neg x14, x13
+ lsr x25, x25, x13
+ lsl x15, x26, x14
+
+ lsr x26, x26, x13
+ lsl x19, x27, x14
+
+ orr x25, x25, x15
+
+ lsr x27, x27, x13
+ lsl x20, x28, x14
+
+ orr x26, x26, x19
+
+ lsr x28, x28, x13
+
+ orr x27, x27, x20
+
+
+ // Shift X right by "x13" bits, adding n whenever X becomes odd.
+ // x13--;
+ // x14 := 0; needed in the addition to the most significant word in SHIFT1
+ eor x14, x14, x14
+Lbeeu_shift_loop_X:
+ tbz x3, #0, Lshift1_0
+ adds x3, x3, x0
+ adcs x4, x4, x1
+ adcs x5, x5, x2
+ adcs x6, x6, x30
+ adc x7, x7, x14
+Lshift1_0:
+ // var0 := [var1|var0]<64..1>;
+ // i.e. concatenate var1 and var0,
+ // extract bits <64..1> from the resulting 128-bit value
+ // and put them in var0
+ extr x3, x4, x3, #1
+ extr x4, x5, x4, #1
+ extr x5, x6, x5, #1
+ extr x6, x7, x6, #1
+ lsr x7, x7, #1
+
+ subs x13, x13, #1
+ bne Lbeeu_shift_loop_X
+
+ // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
+ // with the following differences:
+ // - "x13" is set directly to the number of trailing 0s in B
+ // (using rbit and clz instructions)
+ // - The loop is only used to call SHIFT1(X)
+ // and x13 is decreased while executing the X loop.
+ // - SHIFT256(B, x13) is performed before right-shifting X; they are independent
+
+Lbeeu_shift_A_Y:
+ // Same for A and Y.
+ // Afterwards, (2) still holds.
+ // Reverse the bit order of x21
+ // x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
+ rbit x15, x21
+ clz x13, x15
+
+ // If there is no shift, goto |B-A|, X+Y update
+ cbz x13, Lbeeu_update_B_X_or_A_Y
+
+ // Shift A right by "x13" bits
+ neg x14, x13
+ lsr x21, x21, x13
+ lsl x15, x22, x14
+
+ lsr x22, x22, x13
+ lsl x19, x23, x14
+
+ orr x21, x21, x15
+
+ lsr x23, x23, x13
+ lsl x20, x24, x14
+
+ orr x22, x22, x19
+
+ lsr x24, x24, x13
+
+ orr x23, x23, x20
+
+
+ // Shift Y right by "x13" bits, adding n whenever Y becomes odd.
+ // x13--;
+ // x14 := 0; needed in the addition to the most significant word in SHIFT1
+ eor x14, x14, x14
+Lbeeu_shift_loop_Y:
+ tbz x8, #0, Lshift1_1
+ adds x8, x8, x0
+ adcs x9, x9, x1
+ adcs x10, x10, x2
+ adcs x11, x11, x30
+ adc x12, x12, x14
+Lshift1_1:
+ // var0 := [var1|var0]<64..1>;
+ // i.e. concatenate var1 and var0,
+ // extract bits <64..1> from the resulting 128-bit value
+ // and put them in var0
+ extr x8, x9, x8, #1
+ extr x9, x10, x9, #1
+ extr x10, x11, x10, #1
+ extr x11, x12, x11, #1
+ lsr x12, x12, #1
+
+ subs x13, x13, #1
+ bne Lbeeu_shift_loop_Y
+
+Lbeeu_update_B_X_or_A_Y:
+ // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow)
+ // Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words
+ // without taking a sign bit if generated. The lack of a carry would
+ // indicate a negative result. See, for example,
+ // https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
+ subs x14, x25, x21
+ sbcs x15, x26, x22
+ sbcs x19, x27, x23
+ sbcs x20, x28, x24
+ bcs Lbeeu_B_greater_than_A
+
+ // Else A > B =>
+ // A := A - B; Y := Y + X; goto beginning of the loop
+ subs x21, x21, x25
+ sbcs x22, x22, x26
+ sbcs x23, x23, x27
+ sbcs x24, x24, x28
+
+ adds x8, x8, x3
+ adcs x9, x9, x4
+ adcs x10, x10, x5
+ adcs x11, x11, x6
+ adc x12, x12, x7
+ b Lbeeu_loop
+
+Lbeeu_B_greater_than_A:
+ // Continue with B > A =>
+ // B := B - A; X := X + Y; goto beginning of the loop
+ mov x25, x14
+ mov x26, x15
+ mov x27, x19
+ mov x28, x20
+
+ adds x3, x3, x8
+ adcs x4, x4, x9
+ adcs x5, x5, x10
+ adcs x6, x6, x11
+ adc x7, x7, x12
+ b Lbeeu_loop
+
+Lbeeu_loop_end:
+ // The Euclid's algorithm loop ends when A == gcd(a,n);
+ // this would be 1, when a and n are co-prime (i.e. do not have a common factor).
+ // Since (-1)*Y*a == A (mod |n|), Y>0
+ // then out = -Y mod n
+
+ // Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|)
+ // Is A-1 == 0?
+ // If not, fail.
+ sub x14, x21, #1
+ orr x14, x14, x22
+ orr x14, x14, x23
+ orr x14, x14, x24
+ cbnz x14, Lbeeu_err
+
+ // If Y>n ==> Y:=Y-n
+Lbeeu_reduction_loop:
+ // x_i := y_i - n_i (X is no longer needed, use it as temp)
+ // (x14 = 0 from above)
+ subs x3, x8, x0
+ sbcs x4, x9, x1
+ sbcs x5, x10, x2
+ sbcs x6, x11, x30
+ sbcs x7, x12, x14
+
+ // If result is non-negative (i.e., cs = carry set = no borrow),
+ // y_i := x_i; goto reduce again
+ // else
+ // y_i := y_i; continue
+ csel x8, x3, x8, cs
+ csel x9, x4, x9, cs
+ csel x10, x5, x10, cs
+ csel x11, x6, x11, cs
+ csel x12, x7, x12, cs
+ bcs Lbeeu_reduction_loop
+
+ // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
+ // out = -Y = n-Y
+ subs x8, x0, x8
+ sbcs x9, x1, x9
+ sbcs x10, x2, x10
+ sbcs x11, x30, x11
+
+ // Save Y in output (out (x0) was saved on the stack)
+ ldr x3, [sp,#96]
+ stp x8, x9, [x3]
+ stp x10, x11, [x3,#16]
+ // return 1 (success)
+ mov x0, #1
+ b Lbeeu_finish
+
+Lbeeu_err:
+ // return 0 (error)
+ eor x0, x0, x0
+
+Lbeeu_finish:
+ // Restore callee-saved registers, except x0, x2
+ add sp,x29,#0
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldp x25,x26,[sp,#64]
+ ldp x27,x28,[sp,#80]
+ ldp x29,x30,[sp],#112
+
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/p256_beeu-x86_64-asm-apple.S b/gen/bcm/p256_beeu-x86_64-asm-apple.S
new file mode 100644
index 0000000..fc6552c
--- /dev/null
+++ b/gen/bcm/p256_beeu-x86_64-asm-apple.S
@@ -0,0 +1,322 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text
+
+
+.private_extern _beeu_mod_inverse_vartime
+.globl _beeu_mod_inverse_vartime
+.private_extern _beeu_mod_inverse_vartime
+.p2align 5
+_beeu_mod_inverse_vartime:
+
+_CET_ENDBR
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ pushq %rbx
+
+ pushq %rsi
+
+
+ subq $80,%rsp
+
+ movq %rdi,0(%rsp)
+
+
+ movq $1,%r8
+ xorq %r9,%r9
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %rdi,%rdi
+
+ xorq %r12,%r12
+ xorq %r13,%r13
+ xorq %r14,%r14
+ xorq %r15,%r15
+ xorq %rbp,%rbp
+
+
+ vmovdqu 0(%rsi),%xmm0
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu %xmm0,48(%rsp)
+ vmovdqu %xmm1,64(%rsp)
+
+ vmovdqu 0(%rdx),%xmm0
+ vmovdqu 16(%rdx),%xmm1
+ vmovdqu %xmm0,16(%rsp)
+ vmovdqu %xmm1,32(%rsp)
+
+L$beeu_loop:
+ xorq %rbx,%rbx
+ orq 48(%rsp),%rbx
+ orq 56(%rsp),%rbx
+ orq 64(%rsp),%rbx
+ orq 72(%rsp),%rbx
+ jz L$beeu_loop_end
+
+
+
+
+
+
+
+
+
+
+ movq $1,%rcx
+
+
+L$beeu_shift_loop_XB:
+ movq %rcx,%rbx
+ andq 48(%rsp),%rbx
+ jnz L$beeu_shift_loop_end_XB
+
+
+ movq $1,%rbx
+ andq %r8,%rbx
+ jz L$shift1_0
+ addq 0(%rdx),%r8
+ adcq 8(%rdx),%r9
+ adcq 16(%rdx),%r10
+ adcq 24(%rdx),%r11
+ adcq $0,%rdi
+
+L$shift1_0:
+ shrdq $1,%r9,%r8
+ shrdq $1,%r10,%r9
+ shrdq $1,%r11,%r10
+ shrdq $1,%rdi,%r11
+ shrq $1,%rdi
+
+ shlq $1,%rcx
+
+
+
+
+
+ cmpq $0x8000000,%rcx
+ jne L$beeu_shift_loop_XB
+
+L$beeu_shift_loop_end_XB:
+ bsfq %rcx,%rcx
+ testq %rcx,%rcx
+ jz L$beeu_no_shift_XB
+
+
+
+ movq 8+48(%rsp),%rax
+ movq 16+48(%rsp),%rbx
+ movq 24+48(%rsp),%rsi
+
+ shrdq %cl,%rax,0+48(%rsp)
+ shrdq %cl,%rbx,8+48(%rsp)
+ shrdq %cl,%rsi,16+48(%rsp)
+
+ shrq %cl,%rsi
+ movq %rsi,24+48(%rsp)
+
+
+L$beeu_no_shift_XB:
+
+ movq $1,%rcx
+
+
+L$beeu_shift_loop_YA:
+ movq %rcx,%rbx
+ andq 16(%rsp),%rbx
+ jnz L$beeu_shift_loop_end_YA
+
+
+ movq $1,%rbx
+ andq %r12,%rbx
+ jz L$shift1_1
+ addq 0(%rdx),%r12
+ adcq 8(%rdx),%r13
+ adcq 16(%rdx),%r14
+ adcq 24(%rdx),%r15
+ adcq $0,%rbp
+
+L$shift1_1:
+ shrdq $1,%r13,%r12
+ shrdq $1,%r14,%r13
+ shrdq $1,%r15,%r14
+ shrdq $1,%rbp,%r15
+ shrq $1,%rbp
+
+ shlq $1,%rcx
+
+
+
+
+
+ cmpq $0x8000000,%rcx
+ jne L$beeu_shift_loop_YA
+
+L$beeu_shift_loop_end_YA:
+ bsfq %rcx,%rcx
+ testq %rcx,%rcx
+ jz L$beeu_no_shift_YA
+
+
+
+ movq 8+16(%rsp),%rax
+ movq 16+16(%rsp),%rbx
+ movq 24+16(%rsp),%rsi
+
+ shrdq %cl,%rax,0+16(%rsp)
+ shrdq %cl,%rbx,8+16(%rsp)
+ shrdq %cl,%rsi,16+16(%rsp)
+
+ shrq %cl,%rsi
+ movq %rsi,24+16(%rsp)
+
+
+L$beeu_no_shift_YA:
+
+ movq 48(%rsp),%rax
+ movq 56(%rsp),%rbx
+ movq 64(%rsp),%rsi
+ movq 72(%rsp),%rcx
+ subq 16(%rsp),%rax
+ sbbq 24(%rsp),%rbx
+ sbbq 32(%rsp),%rsi
+ sbbq 40(%rsp),%rcx
+ jnc L$beeu_B_bigger_than_A
+
+
+ movq 16(%rsp),%rax
+ movq 24(%rsp),%rbx
+ movq 32(%rsp),%rsi
+ movq 40(%rsp),%rcx
+ subq 48(%rsp),%rax
+ sbbq 56(%rsp),%rbx
+ sbbq 64(%rsp),%rsi
+ sbbq 72(%rsp),%rcx
+ movq %rax,16(%rsp)
+ movq %rbx,24(%rsp)
+ movq %rsi,32(%rsp)
+ movq %rcx,40(%rsp)
+
+
+ addq %r8,%r12
+ adcq %r9,%r13
+ adcq %r10,%r14
+ adcq %r11,%r15
+ adcq %rdi,%rbp
+ jmp L$beeu_loop
+
+L$beeu_B_bigger_than_A:
+
+ movq %rax,48(%rsp)
+ movq %rbx,56(%rsp)
+ movq %rsi,64(%rsp)
+ movq %rcx,72(%rsp)
+
+
+ addq %r12,%r8
+ adcq %r13,%r9
+ adcq %r14,%r10
+ adcq %r15,%r11
+ adcq %rbp,%rdi
+
+ jmp L$beeu_loop
+
+L$beeu_loop_end:
+
+
+
+
+ movq 16(%rsp),%rbx
+ subq $1,%rbx
+ orq 24(%rsp),%rbx
+ orq 32(%rsp),%rbx
+ orq 40(%rsp),%rbx
+
+ jnz L$beeu_err
+
+
+
+
+ movq 0(%rdx),%r8
+ movq 8(%rdx),%r9
+ movq 16(%rdx),%r10
+ movq 24(%rdx),%r11
+ xorq %rdi,%rdi
+
+L$beeu_reduction_loop:
+ movq %r12,16(%rsp)
+ movq %r13,24(%rsp)
+ movq %r14,32(%rsp)
+ movq %r15,40(%rsp)
+ movq %rbp,48(%rsp)
+
+
+ subq %r8,%r12
+ sbbq %r9,%r13
+ sbbq %r10,%r14
+ sbbq %r11,%r15
+ sbbq $0,%rbp
+
+
+ cmovcq 16(%rsp),%r12
+ cmovcq 24(%rsp),%r13
+ cmovcq 32(%rsp),%r14
+ cmovcq 40(%rsp),%r15
+ jnc L$beeu_reduction_loop
+
+
+ subq %r12,%r8
+ sbbq %r13,%r9
+ sbbq %r14,%r10
+ sbbq %r15,%r11
+
+L$beeu_save:
+
+ movq 0(%rsp),%rdi
+
+ movq %r8,0(%rdi)
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+
+
+ movq $1,%rax
+ jmp L$beeu_finish
+
+L$beeu_err:
+
+ xorq %rax,%rax
+
+L$beeu_finish:
+ addq $80,%rsp
+
+ popq %rsi
+
+ popq %rbx
+
+ popq %r15
+
+ popq %r14
+
+ popq %r13
+
+ popq %r12
+
+ popq %rbp
+
+ ret
+
+
+
+#endif
diff --git a/gen/bcm/p256_beeu-x86_64-asm-linux.S b/gen/bcm/p256_beeu-x86_64-asm-linux.S
new file mode 100644
index 0000000..40ae58b
--- /dev/null
+++ b/gen/bcm/p256_beeu-x86_64-asm-linux.S
@@ -0,0 +1,336 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text
+
+.type beeu_mod_inverse_vartime,@function
+.hidden beeu_mod_inverse_vartime
+.globl beeu_mod_inverse_vartime
+.hidden beeu_mod_inverse_vartime
+.align 32
+beeu_mod_inverse_vartime:
+.cfi_startproc
+_CET_ENDBR
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset rbp,-16
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset r12,-24
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset r13,-32
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset r14,-40
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset r15,-48
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset rbx,-56
+ pushq %rsi
+.cfi_adjust_cfa_offset 8
+.cfi_offset rsi,-64
+
+ subq $80,%rsp
+.cfi_adjust_cfa_offset 80
+ movq %rdi,0(%rsp)
+
+
+ movq $1,%r8
+ xorq %r9,%r9
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %rdi,%rdi
+
+ xorq %r12,%r12
+ xorq %r13,%r13
+ xorq %r14,%r14
+ xorq %r15,%r15
+ xorq %rbp,%rbp
+
+
+ vmovdqu 0(%rsi),%xmm0
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu %xmm0,48(%rsp)
+ vmovdqu %xmm1,64(%rsp)
+
+ vmovdqu 0(%rdx),%xmm0
+ vmovdqu 16(%rdx),%xmm1
+ vmovdqu %xmm0,16(%rsp)
+ vmovdqu %xmm1,32(%rsp)
+
+.Lbeeu_loop:
+ xorq %rbx,%rbx
+ orq 48(%rsp),%rbx
+ orq 56(%rsp),%rbx
+ orq 64(%rsp),%rbx
+ orq 72(%rsp),%rbx
+ jz .Lbeeu_loop_end
+
+
+
+
+
+
+
+
+
+
+ movq $1,%rcx
+
+
+.Lbeeu_shift_loop_XB:
+ movq %rcx,%rbx
+ andq 48(%rsp),%rbx
+ jnz .Lbeeu_shift_loop_end_XB
+
+
+ movq $1,%rbx
+ andq %r8,%rbx
+ jz .Lshift1_0
+ addq 0(%rdx),%r8
+ adcq 8(%rdx),%r9
+ adcq 16(%rdx),%r10
+ adcq 24(%rdx),%r11
+ adcq $0,%rdi
+
+.Lshift1_0:
+ shrdq $1,%r9,%r8
+ shrdq $1,%r10,%r9
+ shrdq $1,%r11,%r10
+ shrdq $1,%rdi,%r11
+ shrq $1,%rdi
+
+ shlq $1,%rcx
+
+
+
+
+
+ cmpq $0x8000000,%rcx
+ jne .Lbeeu_shift_loop_XB
+
+.Lbeeu_shift_loop_end_XB:
+ bsfq %rcx,%rcx
+ testq %rcx,%rcx
+ jz .Lbeeu_no_shift_XB
+
+
+
+ movq 8+48(%rsp),%rax
+ movq 16+48(%rsp),%rbx
+ movq 24+48(%rsp),%rsi
+
+ shrdq %cl,%rax,0+48(%rsp)
+ shrdq %cl,%rbx,8+48(%rsp)
+ shrdq %cl,%rsi,16+48(%rsp)
+
+ shrq %cl,%rsi
+ movq %rsi,24+48(%rsp)
+
+
+.Lbeeu_no_shift_XB:
+
+ movq $1,%rcx
+
+
+.Lbeeu_shift_loop_YA:
+ movq %rcx,%rbx
+ andq 16(%rsp),%rbx
+ jnz .Lbeeu_shift_loop_end_YA
+
+
+ movq $1,%rbx
+ andq %r12,%rbx
+ jz .Lshift1_1
+ addq 0(%rdx),%r12
+ adcq 8(%rdx),%r13
+ adcq 16(%rdx),%r14
+ adcq 24(%rdx),%r15
+ adcq $0,%rbp
+
+.Lshift1_1:
+ shrdq $1,%r13,%r12
+ shrdq $1,%r14,%r13
+ shrdq $1,%r15,%r14
+ shrdq $1,%rbp,%r15
+ shrq $1,%rbp
+
+ shlq $1,%rcx
+
+
+
+
+
+ cmpq $0x8000000,%rcx
+ jne .Lbeeu_shift_loop_YA
+
+.Lbeeu_shift_loop_end_YA:
+ bsfq %rcx,%rcx
+ testq %rcx,%rcx
+ jz .Lbeeu_no_shift_YA
+
+
+
+ movq 8+16(%rsp),%rax
+ movq 16+16(%rsp),%rbx
+ movq 24+16(%rsp),%rsi
+
+ shrdq %cl,%rax,0+16(%rsp)
+ shrdq %cl,%rbx,8+16(%rsp)
+ shrdq %cl,%rsi,16+16(%rsp)
+
+ shrq %cl,%rsi
+ movq %rsi,24+16(%rsp)
+
+
+.Lbeeu_no_shift_YA:
+
+ movq 48(%rsp),%rax
+ movq 56(%rsp),%rbx
+ movq 64(%rsp),%rsi
+ movq 72(%rsp),%rcx
+ subq 16(%rsp),%rax
+ sbbq 24(%rsp),%rbx
+ sbbq 32(%rsp),%rsi
+ sbbq 40(%rsp),%rcx
+ jnc .Lbeeu_B_bigger_than_A
+
+
+ movq 16(%rsp),%rax
+ movq 24(%rsp),%rbx
+ movq 32(%rsp),%rsi
+ movq 40(%rsp),%rcx
+ subq 48(%rsp),%rax
+ sbbq 56(%rsp),%rbx
+ sbbq 64(%rsp),%rsi
+ sbbq 72(%rsp),%rcx
+ movq %rax,16(%rsp)
+ movq %rbx,24(%rsp)
+ movq %rsi,32(%rsp)
+ movq %rcx,40(%rsp)
+
+
+ addq %r8,%r12
+ adcq %r9,%r13
+ adcq %r10,%r14
+ adcq %r11,%r15
+ adcq %rdi,%rbp
+ jmp .Lbeeu_loop
+
+.Lbeeu_B_bigger_than_A:
+
+ movq %rax,48(%rsp)
+ movq %rbx,56(%rsp)
+ movq %rsi,64(%rsp)
+ movq %rcx,72(%rsp)
+
+
+ addq %r12,%r8
+ adcq %r13,%r9
+ adcq %r14,%r10
+ adcq %r15,%r11
+ adcq %rbp,%rdi
+
+ jmp .Lbeeu_loop
+
+.Lbeeu_loop_end:
+
+
+
+
+ movq 16(%rsp),%rbx
+ subq $1,%rbx
+ orq 24(%rsp),%rbx
+ orq 32(%rsp),%rbx
+ orq 40(%rsp),%rbx
+
+ jnz .Lbeeu_err
+
+
+
+
+ movq 0(%rdx),%r8
+ movq 8(%rdx),%r9
+ movq 16(%rdx),%r10
+ movq 24(%rdx),%r11
+ xorq %rdi,%rdi
+
+.Lbeeu_reduction_loop:
+ movq %r12,16(%rsp)
+ movq %r13,24(%rsp)
+ movq %r14,32(%rsp)
+ movq %r15,40(%rsp)
+ movq %rbp,48(%rsp)
+
+
+ subq %r8,%r12
+ sbbq %r9,%r13
+ sbbq %r10,%r14
+ sbbq %r11,%r15
+ sbbq $0,%rbp
+
+
+ cmovcq 16(%rsp),%r12
+ cmovcq 24(%rsp),%r13
+ cmovcq 32(%rsp),%r14
+ cmovcq 40(%rsp),%r15
+ jnc .Lbeeu_reduction_loop
+
+
+ subq %r12,%r8
+ sbbq %r13,%r9
+ sbbq %r14,%r10
+ sbbq %r15,%r11
+
+.Lbeeu_save:
+
+ movq 0(%rsp),%rdi
+
+ movq %r8,0(%rdi)
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+
+
+ movq $1,%rax
+ jmp .Lbeeu_finish
+
+.Lbeeu_err:
+
+ xorq %rax,%rax
+
+.Lbeeu_finish:
+ addq $80,%rsp
+.cfi_adjust_cfa_offset -80
+ popq %rsi
+.cfi_adjust_cfa_offset -8
+.cfi_restore rsi
+ popq %rbx
+.cfi_adjust_cfa_offset -8
+.cfi_restore rbx
+ popq %r15
+.cfi_adjust_cfa_offset -8
+.cfi_restore r15
+ popq %r14
+.cfi_adjust_cfa_offset -8
+.cfi_restore r14
+ popq %r13
+.cfi_adjust_cfa_offset -8
+.cfi_restore r13
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore r12
+ popq %rbp
+.cfi_adjust_cfa_offset -8
+.cfi_restore rbp
+ ret
+.cfi_endproc
+
+.size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime
+#endif
diff --git a/gen/bcm/p256_beeu-x86_64-asm-win.asm b/gen/bcm/p256_beeu-x86_64-asm-win.asm
new file mode 100644
index 0000000..7c7da68
--- /dev/null
+++ b/gen/bcm/p256_beeu-x86_64-asm-win.asm
@@ -0,0 +1,346 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .text code align=64
+
+
+
+
+global beeu_mod_inverse_vartime
+ALIGN 32
+beeu_mod_inverse_vartime:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_beeu_mod_inverse_vartime:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+ mov r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ push rbx
+
+ push rsi
+
+
+ sub rsp,80
+
+ mov QWORD[rsp],rdi
+
+
+ mov r8,1
+ xor r9,r9
+ xor r10,r10
+ xor r11,r11
+ xor rdi,rdi
+
+ xor r12,r12
+ xor r13,r13
+ xor r14,r14
+ xor r15,r15
+ xor rbp,rbp
+
+
+ vmovdqu xmm0,XMMWORD[rsi]
+ vmovdqu xmm1,XMMWORD[16+rsi]
+ vmovdqu XMMWORD[48+rsp],xmm0
+ vmovdqu XMMWORD[64+rsp],xmm1
+
+ vmovdqu xmm0,XMMWORD[rdx]
+ vmovdqu xmm1,XMMWORD[16+rdx]
+ vmovdqu XMMWORD[16+rsp],xmm0
+ vmovdqu XMMWORD[32+rsp],xmm1
+
+$L$beeu_loop:
+ xor rbx,rbx
+ or rbx,QWORD[48+rsp]
+ or rbx,QWORD[56+rsp]
+ or rbx,QWORD[64+rsp]
+ or rbx,QWORD[72+rsp]
+ jz NEAR $L$beeu_loop_end
+
+
+
+
+
+
+
+
+
+
+ mov rcx,1
+
+
+$L$beeu_shift_loop_XB:
+ mov rbx,rcx
+ and rbx,QWORD[48+rsp]
+ jnz NEAR $L$beeu_shift_loop_end_XB
+
+
+ mov rbx,1
+ and rbx,r8
+ jz NEAR $L$shift1_0
+ add r8,QWORD[rdx]
+ adc r9,QWORD[8+rdx]
+ adc r10,QWORD[16+rdx]
+ adc r11,QWORD[24+rdx]
+ adc rdi,0
+
+$L$shift1_0:
+ shrd r8,r9,1
+ shrd r9,r10,1
+ shrd r10,r11,1
+ shrd r11,rdi,1
+ shr rdi,1
+
+ shl rcx,1
+
+
+
+
+
+ cmp rcx,0x8000000
+ jne NEAR $L$beeu_shift_loop_XB
+
+$L$beeu_shift_loop_end_XB:
+ bsf rcx,rcx
+ test rcx,rcx
+ jz NEAR $L$beeu_no_shift_XB
+
+
+
+ mov rax,QWORD[((8+48))+rsp]
+ mov rbx,QWORD[((16+48))+rsp]
+ mov rsi,QWORD[((24+48))+rsp]
+
+ shrd QWORD[((0+48))+rsp],rax,cl
+ shrd QWORD[((8+48))+rsp],rbx,cl
+ shrd QWORD[((16+48))+rsp],rsi,cl
+
+ shr rsi,cl
+ mov QWORD[((24+48))+rsp],rsi
+
+
+$L$beeu_no_shift_XB:
+
+ mov rcx,1
+
+
+$L$beeu_shift_loop_YA:
+ mov rbx,rcx
+ and rbx,QWORD[16+rsp]
+ jnz NEAR $L$beeu_shift_loop_end_YA
+
+
+ mov rbx,1
+ and rbx,r12
+ jz NEAR $L$shift1_1
+ add r12,QWORD[rdx]
+ adc r13,QWORD[8+rdx]
+ adc r14,QWORD[16+rdx]
+ adc r15,QWORD[24+rdx]
+ adc rbp,0
+
+$L$shift1_1:
+ shrd r12,r13,1
+ shrd r13,r14,1
+ shrd r14,r15,1
+ shrd r15,rbp,1
+ shr rbp,1
+
+ shl rcx,1
+
+
+
+
+
+ cmp rcx,0x8000000
+ jne NEAR $L$beeu_shift_loop_YA
+
+$L$beeu_shift_loop_end_YA:
+ bsf rcx,rcx
+ test rcx,rcx
+ jz NEAR $L$beeu_no_shift_YA
+
+
+
+ mov rax,QWORD[((8+16))+rsp]
+ mov rbx,QWORD[((16+16))+rsp]
+ mov rsi,QWORD[((24+16))+rsp]
+
+ shrd QWORD[((0+16))+rsp],rax,cl
+ shrd QWORD[((8+16))+rsp],rbx,cl
+ shrd QWORD[((16+16))+rsp],rsi,cl
+
+ shr rsi,cl
+ mov QWORD[((24+16))+rsp],rsi
+
+
+$L$beeu_no_shift_YA:
+
+ mov rax,QWORD[48+rsp]
+ mov rbx,QWORD[56+rsp]
+ mov rsi,QWORD[64+rsp]
+ mov rcx,QWORD[72+rsp]
+ sub rax,QWORD[16+rsp]
+ sbb rbx,QWORD[24+rsp]
+ sbb rsi,QWORD[32+rsp]
+ sbb rcx,QWORD[40+rsp]
+ jnc NEAR $L$beeu_B_bigger_than_A
+
+
+ mov rax,QWORD[16+rsp]
+ mov rbx,QWORD[24+rsp]
+ mov rsi,QWORD[32+rsp]
+ mov rcx,QWORD[40+rsp]
+ sub rax,QWORD[48+rsp]
+ sbb rbx,QWORD[56+rsp]
+ sbb rsi,QWORD[64+rsp]
+ sbb rcx,QWORD[72+rsp]
+ mov QWORD[16+rsp],rax
+ mov QWORD[24+rsp],rbx
+ mov QWORD[32+rsp],rsi
+ mov QWORD[40+rsp],rcx
+
+
+ add r12,r8
+ adc r13,r9
+ adc r14,r10
+ adc r15,r11
+ adc rbp,rdi
+ jmp NEAR $L$beeu_loop
+
+$L$beeu_B_bigger_than_A:
+
+ mov QWORD[48+rsp],rax
+ mov QWORD[56+rsp],rbx
+ mov QWORD[64+rsp],rsi
+ mov QWORD[72+rsp],rcx
+
+
+ add r8,r12
+ adc r9,r13
+ adc r10,r14
+ adc r11,r15
+ adc rdi,rbp
+
+ jmp NEAR $L$beeu_loop
+
+$L$beeu_loop_end:
+
+
+
+
+ mov rbx,QWORD[16+rsp]
+ sub rbx,1
+ or rbx,QWORD[24+rsp]
+ or rbx,QWORD[32+rsp]
+ or rbx,QWORD[40+rsp]
+
+ jnz NEAR $L$beeu_err
+
+
+
+
+ mov r8,QWORD[rdx]
+ mov r9,QWORD[8+rdx]
+ mov r10,QWORD[16+rdx]
+ mov r11,QWORD[24+rdx]
+ xor rdi,rdi
+
+$L$beeu_reduction_loop:
+ mov QWORD[16+rsp],r12
+ mov QWORD[24+rsp],r13
+ mov QWORD[32+rsp],r14
+ mov QWORD[40+rsp],r15
+ mov QWORD[48+rsp],rbp
+
+
+ sub r12,r8
+ sbb r13,r9
+ sbb r14,r10
+ sbb r15,r11
+ sbb rbp,0
+
+
+ cmovc r12,QWORD[16+rsp]
+ cmovc r13,QWORD[24+rsp]
+ cmovc r14,QWORD[32+rsp]
+ cmovc r15,QWORD[40+rsp]
+ jnc NEAR $L$beeu_reduction_loop
+
+
+ sub r8,r12
+ sbb r9,r13
+ sbb r10,r14
+ sbb r11,r15
+
+$L$beeu_save:
+
+ mov rdi,QWORD[rsp]
+
+ mov QWORD[rdi],r8
+ mov QWORD[8+rdi],r9
+ mov QWORD[16+rdi],r10
+ mov QWORD[24+rdi],r11
+
+
+ mov rax,1
+ jmp NEAR $L$beeu_finish
+
+$L$beeu_err:
+
+ xor rax,rax
+
+$L$beeu_finish:
+ add rsp,80
+
+ pop rsi
+
+ pop rbx
+
+ pop r15
+
+ pop r14
+
+ pop r13
+
+ pop r12
+
+ pop rbp
+
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+
+$L$SEH_end_beeu_mod_inverse_vartime:
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/rdrand-x86_64-apple.S b/gen/bcm/rdrand-x86_64-apple.S
new file mode 100644
index 0000000..5fdf105
--- /dev/null
+++ b/gen/bcm/rdrand-x86_64-apple.S
@@ -0,0 +1,57 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text
+
+
+
+
+.globl _CRYPTO_rdrand
+.private_extern _CRYPTO_rdrand
+
+.p2align 4
+_CRYPTO_rdrand:
+
+_CET_ENDBR
+ xorq %rax,%rax
+.byte 72,15,199,242
+
+ adcq %rax,%rax
+ movq %rdx,0(%rdi)
+ ret
+
+
+
+
+
+
+
+.globl _CRYPTO_rdrand_multiple8_buf
+.private_extern _CRYPTO_rdrand_multiple8_buf
+
+.p2align 4
+_CRYPTO_rdrand_multiple8_buf:
+
+_CET_ENDBR
+ testq %rsi,%rsi
+ jz L$out
+ movq $8,%rdx
+L$loop:
+.byte 72,15,199,241
+ jnc L$err
+ movq %rcx,0(%rdi)
+ addq %rdx,%rdi
+ subq %rdx,%rsi
+ jnz L$loop
+L$out:
+ movq $1,%rax
+ ret
+L$err:
+ xorq %rax,%rax
+ ret
+
+
+#endif
diff --git a/gen/bcm/rdrand-x86_64-linux.S b/gen/bcm/rdrand-x86_64-linux.S
new file mode 100644
index 0000000..fe81dac
--- /dev/null
+++ b/gen/bcm/rdrand-x86_64-linux.S
@@ -0,0 +1,57 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text
+
+
+
+
+.globl CRYPTO_rdrand
+.hidden CRYPTO_rdrand
+.type CRYPTO_rdrand,@function
+.align 16
+CRYPTO_rdrand:
+.cfi_startproc
+_CET_ENDBR
+ xorq %rax,%rax
+.byte 72,15,199,242
+
+ adcq %rax,%rax
+ movq %rdx,0(%rdi)
+ ret
+.cfi_endproc
+.size CRYPTO_rdrand,.-CRYPTO_rdrand
+
+
+
+
+
+.globl CRYPTO_rdrand_multiple8_buf
+.hidden CRYPTO_rdrand_multiple8_buf
+.type CRYPTO_rdrand_multiple8_buf,@function
+.align 16
+CRYPTO_rdrand_multiple8_buf:
+.cfi_startproc
+_CET_ENDBR
+ testq %rsi,%rsi
+ jz .Lout
+ movq $8,%rdx
+.Lloop:
+.byte 72,15,199,241
+ jnc .Lerr
+ movq %rcx,0(%rdi)
+ addq %rdx,%rdi
+ subq %rdx,%rsi
+ jnz .Lloop
+.Lout:
+ movq $1,%rax
+ ret
+.Lerr:
+ xorq %rax,%rax
+ ret
+.cfi_endproc
+.size CRYPTO_rdrand_multiple8_buf,.-CRYPTO_rdrand_multiple8_buf
+#endif
diff --git a/gen/bcm/rdrand-x86_64-win.asm b/gen/bcm/rdrand-x86_64-win.asm
new file mode 100644
index 0000000..aae3d76
--- /dev/null
+++ b/gen/bcm/rdrand-x86_64-win.asm
@@ -0,0 +1,66 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .text code align=64
+
+
+
+
+
+global CRYPTO_rdrand
+
+ALIGN 16
+CRYPTO_rdrand:
+
+_CET_ENDBR
+ xor rax,rax
+DB 73,15,199,240
+
+ adc rax,rax
+ mov QWORD[rcx],r8
+ ret
+
+
+
+
+
+
+
+global CRYPTO_rdrand_multiple8_buf
+
+ALIGN 16
+CRYPTO_rdrand_multiple8_buf:
+
+_CET_ENDBR
+ test rdx,rdx
+ jz NEAR $L$out
+ mov r8,8
+$L$loop:
+DB 73,15,199,241
+ jnc NEAR $L$err
+ mov QWORD[rcx],r9
+ add rcx,r8
+ sub rdx,r8
+ jnz NEAR $L$loop
+$L$out:
+ mov rax,1
+ ret
+$L$err:
+ xor rax,rax
+ ret
+
+
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/rsaz-avx2-apple.S b/gen/bcm/rsaz-avx2-apple.S
new file mode 100644
index 0000000..3672309
--- /dev/null
+++ b/gen/bcm/rsaz-avx2-apple.S
@@ -0,0 +1,1749 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text
+
+.globl _rsaz_1024_sqr_avx2
+.private_extern _rsaz_1024_sqr_avx2
+
+.p2align 6
+_rsaz_1024_sqr_avx2:
+
+_CET_ENDBR
+ leaq (%rsp),%rax
+
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ vzeroupper
+ movq %rax,%rbp
+
+ movq %rdx,%r13
+ subq $832,%rsp
+ movq %r13,%r15
+ subq $-128,%rdi
+ subq $-128,%rsi
+ subq $-128,%r13
+
+ andq $4095,%r15
+ addq $320,%r15
+ shrq $12,%r15
+ vpxor %ymm9,%ymm9,%ymm9
+ jz L$sqr_1024_no_n_copy
+
+
+
+
+
+ subq $320,%rsp
+ vmovdqu 0-128(%r13),%ymm0
+ andq $-2048,%rsp
+ vmovdqu 32-128(%r13),%ymm1
+ vmovdqu 64-128(%r13),%ymm2
+ vmovdqu 96-128(%r13),%ymm3
+ vmovdqu 128-128(%r13),%ymm4
+ vmovdqu 160-128(%r13),%ymm5
+ vmovdqu 192-128(%r13),%ymm6
+ vmovdqu 224-128(%r13),%ymm7
+ vmovdqu 256-128(%r13),%ymm8
+ leaq 832+128(%rsp),%r13
+ vmovdqu %ymm0,0-128(%r13)
+ vmovdqu %ymm1,32-128(%r13)
+ vmovdqu %ymm2,64-128(%r13)
+ vmovdqu %ymm3,96-128(%r13)
+ vmovdqu %ymm4,128-128(%r13)
+ vmovdqu %ymm5,160-128(%r13)
+ vmovdqu %ymm6,192-128(%r13)
+ vmovdqu %ymm7,224-128(%r13)
+ vmovdqu %ymm8,256-128(%r13)
+ vmovdqu %ymm9,288-128(%r13)
+
+L$sqr_1024_no_n_copy:
+ andq $-1024,%rsp
+
+ vmovdqu 32-128(%rsi),%ymm1
+ vmovdqu 64-128(%rsi),%ymm2
+ vmovdqu 96-128(%rsi),%ymm3
+ vmovdqu 128-128(%rsi),%ymm4
+ vmovdqu 160-128(%rsi),%ymm5
+ vmovdqu 192-128(%rsi),%ymm6
+ vmovdqu 224-128(%rsi),%ymm7
+ vmovdqu 256-128(%rsi),%ymm8
+
+ leaq 192(%rsp),%rbx
+ vmovdqu L$and_mask(%rip),%ymm15
+ jmp L$OOP_GRANDE_SQR_1024
+
+.p2align 5
+L$OOP_GRANDE_SQR_1024:
+ leaq 576+128(%rsp),%r9
+ leaq 448(%rsp),%r12
+
+
+
+
+ vpaddq %ymm1,%ymm1,%ymm1
+ vpbroadcastq 0-128(%rsi),%ymm10
+ vpaddq %ymm2,%ymm2,%ymm2
+ vmovdqa %ymm1,0-128(%r9)
+ vpaddq %ymm3,%ymm3,%ymm3
+ vmovdqa %ymm2,32-128(%r9)
+ vpaddq %ymm4,%ymm4,%ymm4
+ vmovdqa %ymm3,64-128(%r9)
+ vpaddq %ymm5,%ymm5,%ymm5
+ vmovdqa %ymm4,96-128(%r9)
+ vpaddq %ymm6,%ymm6,%ymm6
+ vmovdqa %ymm5,128-128(%r9)
+ vpaddq %ymm7,%ymm7,%ymm7
+ vmovdqa %ymm6,160-128(%r9)
+ vpaddq %ymm8,%ymm8,%ymm8
+ vmovdqa %ymm7,192-128(%r9)
+ vpxor %ymm9,%ymm9,%ymm9
+ vmovdqa %ymm8,224-128(%r9)
+
+ vpmuludq 0-128(%rsi),%ymm10,%ymm0
+ vpbroadcastq 32-128(%rsi),%ymm11
+ vmovdqu %ymm9,288-192(%rbx)
+ vpmuludq %ymm10,%ymm1,%ymm1
+ vmovdqu %ymm9,320-448(%r12)
+ vpmuludq %ymm10,%ymm2,%ymm2
+ vmovdqu %ymm9,352-448(%r12)
+ vpmuludq %ymm10,%ymm3,%ymm3
+ vmovdqu %ymm9,384-448(%r12)
+ vpmuludq %ymm10,%ymm4,%ymm4
+ vmovdqu %ymm9,416-448(%r12)
+ vpmuludq %ymm10,%ymm5,%ymm5
+ vmovdqu %ymm9,448-448(%r12)
+ vpmuludq %ymm10,%ymm6,%ymm6
+ vmovdqu %ymm9,480-448(%r12)
+ vpmuludq %ymm10,%ymm7,%ymm7
+ vmovdqu %ymm9,512-448(%r12)
+ vpmuludq %ymm10,%ymm8,%ymm8
+ vpbroadcastq 64-128(%rsi),%ymm10
+ vmovdqu %ymm9,544-448(%r12)
+
+ movq %rsi,%r15
+ movl $4,%r14d
+ jmp L$sqr_entry_1024
+.p2align 5
+L$OOP_SQR_1024:
+ vpbroadcastq 32-128(%r15),%ymm11
+ vpmuludq 0-128(%rsi),%ymm10,%ymm0
+ vpaddq 0-192(%rbx),%ymm0,%ymm0
+ vpmuludq 0-128(%r9),%ymm10,%ymm1
+ vpaddq 32-192(%rbx),%ymm1,%ymm1
+ vpmuludq 32-128(%r9),%ymm10,%ymm2
+ vpaddq 64-192(%rbx),%ymm2,%ymm2
+ vpmuludq 64-128(%r9),%ymm10,%ymm3
+ vpaddq 96-192(%rbx),%ymm3,%ymm3
+ vpmuludq 96-128(%r9),%ymm10,%ymm4
+ vpaddq 128-192(%rbx),%ymm4,%ymm4
+ vpmuludq 128-128(%r9),%ymm10,%ymm5
+ vpaddq 160-192(%rbx),%ymm5,%ymm5
+ vpmuludq 160-128(%r9),%ymm10,%ymm6
+ vpaddq 192-192(%rbx),%ymm6,%ymm6
+ vpmuludq 192-128(%r9),%ymm10,%ymm7
+ vpaddq 224-192(%rbx),%ymm7,%ymm7
+ vpmuludq 224-128(%r9),%ymm10,%ymm8
+ vpbroadcastq 64-128(%r15),%ymm10
+ vpaddq 256-192(%rbx),%ymm8,%ymm8
+L$sqr_entry_1024:
+ vmovdqu %ymm0,0-192(%rbx)
+ vmovdqu %ymm1,32-192(%rbx)
+
+ vpmuludq 32-128(%rsi),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq 32-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm3,%ymm3
+ vpmuludq 64-128(%r9),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm4,%ymm4
+ vpmuludq 96-128(%r9),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq 128-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm6,%ymm6
+ vpmuludq 160-128(%r9),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm7,%ymm7
+ vpmuludq 192-128(%r9),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq 224-128(%r9),%ymm11,%ymm0
+ vpbroadcastq 96-128(%r15),%ymm11
+ vpaddq 288-192(%rbx),%ymm0,%ymm0
+
+ vmovdqu %ymm2,64-192(%rbx)
+ vmovdqu %ymm3,96-192(%rbx)
+
+ vpmuludq 64-128(%rsi),%ymm10,%ymm13
+ vpaddq %ymm13,%ymm4,%ymm4
+ vpmuludq 64-128(%r9),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq 96-128(%r9),%ymm10,%ymm14
+ vpaddq %ymm14,%ymm6,%ymm6
+ vpmuludq 128-128(%r9),%ymm10,%ymm13
+ vpaddq %ymm13,%ymm7,%ymm7
+ vpmuludq 160-128(%r9),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq 192-128(%r9),%ymm10,%ymm14
+ vpaddq %ymm14,%ymm0,%ymm0
+ vpmuludq 224-128(%r9),%ymm10,%ymm1
+ vpbroadcastq 128-128(%r15),%ymm10
+ vpaddq 320-448(%r12),%ymm1,%ymm1
+
+ vmovdqu %ymm4,128-192(%rbx)
+ vmovdqu %ymm5,160-192(%rbx)
+
+ vpmuludq 96-128(%rsi),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm6,%ymm6
+ vpmuludq 96-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm7,%ymm7
+ vpmuludq 128-128(%r9),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm8,%ymm8
+ vpmuludq 160-128(%r9),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm0,%ymm0
+ vpmuludq 192-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm1,%ymm1
+ vpmuludq 224-128(%r9),%ymm11,%ymm2
+ vpbroadcastq 160-128(%r15),%ymm11
+ vpaddq 352-448(%r12),%ymm2,%ymm2
+
+ vmovdqu %ymm6,192-192(%rbx)
+ vmovdqu %ymm7,224-192(%rbx)
+
+ vpmuludq 128-128(%rsi),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq 128-128(%r9),%ymm10,%ymm14
+ vpaddq %ymm14,%ymm0,%ymm0
+ vpmuludq 160-128(%r9),%ymm10,%ymm13
+ vpaddq %ymm13,%ymm1,%ymm1
+ vpmuludq 192-128(%r9),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq 224-128(%r9),%ymm10,%ymm3
+ vpbroadcastq 192-128(%r15),%ymm10
+ vpaddq 384-448(%r12),%ymm3,%ymm3
+
+ vmovdqu %ymm8,256-192(%rbx)
+ vmovdqu %ymm0,288-192(%rbx)
+ leaq 8(%rbx),%rbx
+
+ vpmuludq 160-128(%rsi),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm1,%ymm1
+ vpmuludq 160-128(%r9),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq 192-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm3,%ymm3
+ vpmuludq 224-128(%r9),%ymm11,%ymm4
+ vpbroadcastq 224-128(%r15),%ymm11
+ vpaddq 416-448(%r12),%ymm4,%ymm4
+
+ vmovdqu %ymm1,320-448(%r12)
+ vmovdqu %ymm2,352-448(%r12)
+
+ vpmuludq 192-128(%rsi),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm3,%ymm3
+ vpmuludq 192-128(%r9),%ymm10,%ymm14
+ vpbroadcastq 256-128(%r15),%ymm0
+ vpaddq %ymm14,%ymm4,%ymm4
+ vpmuludq 224-128(%r9),%ymm10,%ymm5
+ vpbroadcastq 0+8-128(%r15),%ymm10
+ vpaddq 448-448(%r12),%ymm5,%ymm5
+
+ vmovdqu %ymm3,384-448(%r12)
+ vmovdqu %ymm4,416-448(%r12)
+ leaq 8(%r15),%r15
+
+ vpmuludq 224-128(%rsi),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq 224-128(%r9),%ymm11,%ymm6
+ vpaddq 480-448(%r12),%ymm6,%ymm6
+
+ vpmuludq 256-128(%rsi),%ymm0,%ymm7
+ vmovdqu %ymm5,448-448(%r12)
+ vpaddq 512-448(%r12),%ymm7,%ymm7
+ vmovdqu %ymm6,480-448(%r12)
+ vmovdqu %ymm7,512-448(%r12)
+ leaq 8(%r12),%r12
+
+ decl %r14d
+ jnz L$OOP_SQR_1024
+
+ vmovdqu 256(%rsp),%ymm8
+ vmovdqu 288(%rsp),%ymm1
+ vmovdqu 320(%rsp),%ymm2
+ leaq 192(%rsp),%rbx
+
+ vpsrlq $29,%ymm8,%ymm14
+ vpand %ymm15,%ymm8,%ymm8
+ vpsrlq $29,%ymm1,%ymm11
+ vpand %ymm15,%ymm1,%ymm1
+
+ vpermq $0x93,%ymm14,%ymm14
+ vpxor %ymm9,%ymm9,%ymm9
+ vpermq $0x93,%ymm11,%ymm11
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm8,%ymm8
+ vpblendd $3,%ymm11,%ymm9,%ymm11
+ vpaddq %ymm14,%ymm1,%ymm1
+ vpaddq %ymm11,%ymm2,%ymm2
+ vmovdqu %ymm1,288-192(%rbx)
+ vmovdqu %ymm2,320-192(%rbx)
+
+ movq (%rsp),%rax
+ movq 8(%rsp),%r10
+ movq 16(%rsp),%r11
+ movq 24(%rsp),%r12
+ vmovdqu 32(%rsp),%ymm1
+ vmovdqu 64-192(%rbx),%ymm2
+ vmovdqu 96-192(%rbx),%ymm3
+ vmovdqu 128-192(%rbx),%ymm4
+ vmovdqu 160-192(%rbx),%ymm5
+ vmovdqu 192-192(%rbx),%ymm6
+ vmovdqu 224-192(%rbx),%ymm7
+
+ movq %rax,%r9
+ imull %ecx,%eax
+ andl $0x1fffffff,%eax
+ vmovd %eax,%xmm12
+
+ movq %rax,%rdx
+ imulq -128(%r13),%rax
+ vpbroadcastq %xmm12,%ymm12
+ addq %rax,%r9
+ movq %rdx,%rax
+ imulq 8-128(%r13),%rax
+ shrq $29,%r9
+ addq %rax,%r10
+ movq %rdx,%rax
+ imulq 16-128(%r13),%rax
+ addq %r9,%r10
+ addq %rax,%r11
+ imulq 24-128(%r13),%rdx
+ addq %rdx,%r12
+
+ movq %r10,%rax
+ imull %ecx,%eax
+ andl $0x1fffffff,%eax
+
+ movl $9,%r14d
+ jmp L$OOP_REDUCE_1024
+
+.p2align 5
+L$OOP_REDUCE_1024:
+ vmovd %eax,%xmm13
+ vpbroadcastq %xmm13,%ymm13
+
+ vpmuludq 32-128(%r13),%ymm12,%ymm10
+ movq %rax,%rdx
+ imulq -128(%r13),%rax
+ vpaddq %ymm10,%ymm1,%ymm1
+ addq %rax,%r10
+ vpmuludq 64-128(%r13),%ymm12,%ymm14
+ movq %rdx,%rax
+ imulq 8-128(%r13),%rax
+ vpaddq %ymm14,%ymm2,%ymm2
+ vpmuludq 96-128(%r13),%ymm12,%ymm11
+.byte 0x67
+ addq %rax,%r11
+.byte 0x67
+ movq %rdx,%rax
+ imulq 16-128(%r13),%rax
+ shrq $29,%r10
+ vpaddq %ymm11,%ymm3,%ymm3
+ vpmuludq 128-128(%r13),%ymm12,%ymm10
+ addq %rax,%r12
+ addq %r10,%r11
+ vpaddq %ymm10,%ymm4,%ymm4
+ vpmuludq 160-128(%r13),%ymm12,%ymm14
+ movq %r11,%rax
+ imull %ecx,%eax
+ vpaddq %ymm14,%ymm5,%ymm5
+ vpmuludq 192-128(%r13),%ymm12,%ymm11
+ andl $0x1fffffff,%eax
+ vpaddq %ymm11,%ymm6,%ymm6
+ vpmuludq 224-128(%r13),%ymm12,%ymm10
+ vpaddq %ymm10,%ymm7,%ymm7
+ vpmuludq 256-128(%r13),%ymm12,%ymm14
+ vmovd %eax,%xmm12
+
+ vpaddq %ymm14,%ymm8,%ymm8
+
+ vpbroadcastq %xmm12,%ymm12
+
+ vpmuludq 32-8-128(%r13),%ymm13,%ymm11
+ vmovdqu 96-8-128(%r13),%ymm14
+ movq %rax,%rdx
+ imulq -128(%r13),%rax
+ vpaddq %ymm11,%ymm1,%ymm1
+ vpmuludq 64-8-128(%r13),%ymm13,%ymm10
+ vmovdqu 128-8-128(%r13),%ymm11
+ addq %rax,%r11
+ movq %rdx,%rax
+ imulq 8-128(%r13),%rax
+ vpaddq %ymm10,%ymm2,%ymm2
+ addq %r12,%rax
+ shrq $29,%r11
+ vpmuludq %ymm13,%ymm14,%ymm14
+ vmovdqu 160-8-128(%r13),%ymm10
+ addq %r11,%rax
+ vpaddq %ymm14,%ymm3,%ymm3
+ vpmuludq %ymm13,%ymm11,%ymm11
+ vmovdqu 192-8-128(%r13),%ymm14
+.byte 0x67
+ movq %rax,%r12
+ imull %ecx,%eax
+ vpaddq %ymm11,%ymm4,%ymm4
+ vpmuludq %ymm13,%ymm10,%ymm10
+.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00
+ andl $0x1fffffff,%eax
+ vpaddq %ymm10,%ymm5,%ymm5
+ vpmuludq %ymm13,%ymm14,%ymm14
+ vmovdqu 256-8-128(%r13),%ymm10
+ vpaddq %ymm14,%ymm6,%ymm6
+ vpmuludq %ymm13,%ymm11,%ymm11
+ vmovdqu 288-8-128(%r13),%ymm9
+ vmovd %eax,%xmm0
+ imulq -128(%r13),%rax
+ vpaddq %ymm11,%ymm7,%ymm7
+ vpmuludq %ymm13,%ymm10,%ymm10
+ vmovdqu 32-16-128(%r13),%ymm14
+ vpbroadcastq %xmm0,%ymm0
+ vpaddq %ymm10,%ymm8,%ymm8
+ vpmuludq %ymm13,%ymm9,%ymm9
+ vmovdqu 64-16-128(%r13),%ymm11
+ addq %rax,%r12
+
+ vmovdqu 32-24-128(%r13),%ymm13
+ vpmuludq %ymm12,%ymm14,%ymm14
+ vmovdqu 96-16-128(%r13),%ymm10
+ vpaddq %ymm14,%ymm1,%ymm1
+ vpmuludq %ymm0,%ymm13,%ymm13
+ vpmuludq %ymm12,%ymm11,%ymm11
+.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff
+ vpaddq %ymm1,%ymm13,%ymm13
+ vpaddq %ymm11,%ymm2,%ymm2
+ vpmuludq %ymm12,%ymm10,%ymm10
+ vmovdqu 160-16-128(%r13),%ymm11
+.byte 0x67
+ vmovq %xmm13,%rax
+ vmovdqu %ymm13,(%rsp)
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpmuludq %ymm12,%ymm14,%ymm14
+ vmovdqu 192-16-128(%r13),%ymm10
+ vpaddq %ymm14,%ymm4,%ymm4
+ vpmuludq %ymm12,%ymm11,%ymm11
+ vmovdqu 224-16-128(%r13),%ymm14
+ vpaddq %ymm11,%ymm5,%ymm5
+ vpmuludq %ymm12,%ymm10,%ymm10
+ vmovdqu 256-16-128(%r13),%ymm11
+ vpaddq %ymm10,%ymm6,%ymm6
+ vpmuludq %ymm12,%ymm14,%ymm14
+ shrq $29,%r12
+ vmovdqu 288-16-128(%r13),%ymm10
+ addq %r12,%rax
+ vpaddq %ymm14,%ymm7,%ymm7
+ vpmuludq %ymm12,%ymm11,%ymm11
+
+ movq %rax,%r9
+ imull %ecx,%eax
+ vpaddq %ymm11,%ymm8,%ymm8
+ vpmuludq %ymm12,%ymm10,%ymm10
+ andl $0x1fffffff,%eax
+ vmovd %eax,%xmm12
+ vmovdqu 96-24-128(%r13),%ymm11
+.byte 0x67
+ vpaddq %ymm10,%ymm9,%ymm9
+ vpbroadcastq %xmm12,%ymm12
+
+ vpmuludq 64-24-128(%r13),%ymm0,%ymm14
+ vmovdqu 128-24-128(%r13),%ymm10
+ movq %rax,%rdx
+ imulq -128(%r13),%rax
+ movq 8(%rsp),%r10
+ vpaddq %ymm14,%ymm2,%ymm1
+ vpmuludq %ymm0,%ymm11,%ymm11
+ vmovdqu 160-24-128(%r13),%ymm14
+ addq %rax,%r9
+ movq %rdx,%rax
+ imulq 8-128(%r13),%rax
+.byte 0x67
+ shrq $29,%r9
+ movq 16(%rsp),%r11
+ vpaddq %ymm11,%ymm3,%ymm2
+ vpmuludq %ymm0,%ymm10,%ymm10
+ vmovdqu 192-24-128(%r13),%ymm11
+ addq %rax,%r10
+ movq %rdx,%rax
+ imulq 16-128(%r13),%rax
+ vpaddq %ymm10,%ymm4,%ymm3
+ vpmuludq %ymm0,%ymm14,%ymm14
+ vmovdqu 224-24-128(%r13),%ymm10
+ imulq 24-128(%r13),%rdx
+ addq %rax,%r11
+ leaq (%r9,%r10,1),%rax
+ vpaddq %ymm14,%ymm5,%ymm4
+ vpmuludq %ymm0,%ymm11,%ymm11
+ vmovdqu 256-24-128(%r13),%ymm14
+ movq %rax,%r10
+ imull %ecx,%eax
+ vpmuludq %ymm0,%ymm10,%ymm10
+ vpaddq %ymm11,%ymm6,%ymm5
+ vmovdqu 288-24-128(%r13),%ymm11
+ andl $0x1fffffff,%eax
+ vpaddq %ymm10,%ymm7,%ymm6
+ vpmuludq %ymm0,%ymm14,%ymm14
+ addq 24(%rsp),%rdx
+ vpaddq %ymm14,%ymm8,%ymm7
+ vpmuludq %ymm0,%ymm11,%ymm11
+ vpaddq %ymm11,%ymm9,%ymm8
+ vmovq %r12,%xmm9
+ movq %rdx,%r12
+
+ decl %r14d
+ jnz L$OOP_REDUCE_1024
+ leaq 448(%rsp),%r12
+ vpaddq %ymm9,%ymm13,%ymm0
+ vpxor %ymm9,%ymm9,%ymm9
+
+ vpaddq 288-192(%rbx),%ymm0,%ymm0
+ vpaddq 320-448(%r12),%ymm1,%ymm1
+ vpaddq 352-448(%r12),%ymm2,%ymm2
+ vpaddq 384-448(%r12),%ymm3,%ymm3
+ vpaddq 416-448(%r12),%ymm4,%ymm4
+ vpaddq 448-448(%r12),%ymm5,%ymm5
+ vpaddq 480-448(%r12),%ymm6,%ymm6
+ vpaddq 512-448(%r12),%ymm7,%ymm7
+ vpaddq 544-448(%r12),%ymm8,%ymm8
+
+ vpsrlq $29,%ymm0,%ymm14
+ vpand %ymm15,%ymm0,%ymm0
+ vpsrlq $29,%ymm1,%ymm11
+ vpand %ymm15,%ymm1,%ymm1
+ vpsrlq $29,%ymm2,%ymm12
+ vpermq $0x93,%ymm14,%ymm14
+ vpand %ymm15,%ymm2,%ymm2
+ vpsrlq $29,%ymm3,%ymm13
+ vpermq $0x93,%ymm11,%ymm11
+ vpand %ymm15,%ymm3,%ymm3
+ vpermq $0x93,%ymm12,%ymm12
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpermq $0x93,%ymm13,%ymm13
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm0,%ymm0
+ vpblendd $3,%ymm11,%ymm12,%ymm11
+ vpaddq %ymm14,%ymm1,%ymm1
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm11,%ymm2,%ymm2
+ vpblendd $3,%ymm13,%ymm9,%ymm13
+ vpaddq %ymm12,%ymm3,%ymm3
+ vpaddq %ymm13,%ymm4,%ymm4
+
+ vpsrlq $29,%ymm0,%ymm14
+ vpand %ymm15,%ymm0,%ymm0
+ vpsrlq $29,%ymm1,%ymm11
+ vpand %ymm15,%ymm1,%ymm1
+ vpsrlq $29,%ymm2,%ymm12
+ vpermq $0x93,%ymm14,%ymm14
+ vpand %ymm15,%ymm2,%ymm2
+ vpsrlq $29,%ymm3,%ymm13
+ vpermq $0x93,%ymm11,%ymm11
+ vpand %ymm15,%ymm3,%ymm3
+ vpermq $0x93,%ymm12,%ymm12
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpermq $0x93,%ymm13,%ymm13
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm0,%ymm0
+ vpblendd $3,%ymm11,%ymm12,%ymm11
+ vpaddq %ymm14,%ymm1,%ymm1
+ vmovdqu %ymm0,0-128(%rdi)
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm11,%ymm2,%ymm2
+ vmovdqu %ymm1,32-128(%rdi)
+ vpblendd $3,%ymm13,%ymm9,%ymm13
+ vpaddq %ymm12,%ymm3,%ymm3
+ vmovdqu %ymm2,64-128(%rdi)
+ vpaddq %ymm13,%ymm4,%ymm4
+ vmovdqu %ymm3,96-128(%rdi)
+ vpsrlq $29,%ymm4,%ymm14
+ vpand %ymm15,%ymm4,%ymm4
+ vpsrlq $29,%ymm5,%ymm11
+ vpand %ymm15,%ymm5,%ymm5
+ vpsrlq $29,%ymm6,%ymm12
+ vpermq $0x93,%ymm14,%ymm14
+ vpand %ymm15,%ymm6,%ymm6
+ vpsrlq $29,%ymm7,%ymm13
+ vpermq $0x93,%ymm11,%ymm11
+ vpand %ymm15,%ymm7,%ymm7
+ vpsrlq $29,%ymm8,%ymm0
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm8,%ymm8
+ vpermq $0x93,%ymm13,%ymm13
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpermq $0x93,%ymm0,%ymm0
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm4,%ymm4
+ vpblendd $3,%ymm11,%ymm12,%ymm11
+ vpaddq %ymm14,%ymm5,%ymm5
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm11,%ymm6,%ymm6
+ vpblendd $3,%ymm13,%ymm0,%ymm13
+ vpaddq %ymm12,%ymm7,%ymm7
+ vpaddq %ymm13,%ymm8,%ymm8
+
+ vpsrlq $29,%ymm4,%ymm14
+ vpand %ymm15,%ymm4,%ymm4
+ vpsrlq $29,%ymm5,%ymm11
+ vpand %ymm15,%ymm5,%ymm5
+ vpsrlq $29,%ymm6,%ymm12
+ vpermq $0x93,%ymm14,%ymm14
+ vpand %ymm15,%ymm6,%ymm6
+ vpsrlq $29,%ymm7,%ymm13
+ vpermq $0x93,%ymm11,%ymm11
+ vpand %ymm15,%ymm7,%ymm7
+ vpsrlq $29,%ymm8,%ymm0
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm8,%ymm8
+ vpermq $0x93,%ymm13,%ymm13
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpermq $0x93,%ymm0,%ymm0
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm4,%ymm4
+ vpblendd $3,%ymm11,%ymm12,%ymm11
+ vpaddq %ymm14,%ymm5,%ymm5
+ vmovdqu %ymm4,128-128(%rdi)
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm11,%ymm6,%ymm6
+ vmovdqu %ymm5,160-128(%rdi)
+ vpblendd $3,%ymm13,%ymm0,%ymm13
+ vpaddq %ymm12,%ymm7,%ymm7
+ vmovdqu %ymm6,192-128(%rdi)
+ vpaddq %ymm13,%ymm8,%ymm8
+ vmovdqu %ymm7,224-128(%rdi)
+ vmovdqu %ymm8,256-128(%rdi)
+
+ movq %rdi,%rsi
+ decl %r8d
+ jne L$OOP_GRANDE_SQR_1024
+
+ vzeroall
+ movq %rbp,%rax
+
+ movq -48(%rax),%r15
+
+ movq -40(%rax),%r14
+
+ movq -32(%rax),%r13
+
+ movq -24(%rax),%r12
+
+ movq -16(%rax),%rbp
+
+ movq -8(%rax),%rbx
+
+ leaq (%rax),%rsp
+
+L$sqr_1024_epilogue:
+ ret
+
+
+.globl _rsaz_1024_mul_avx2
+.private_extern _rsaz_1024_mul_avx2
+
+.p2align 6
+_rsaz_1024_mul_avx2:
+
+_CET_ENDBR
+ leaq (%rsp),%rax
+
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ movq %rax,%rbp
+
+ vzeroall
+ movq %rdx,%r13
+ subq $64,%rsp
+
+
+
+
+
+
+.byte 0x67,0x67
+ movq %rsi,%r15
+ andq $4095,%r15
+ addq $320,%r15
+ shrq $12,%r15
+ movq %rsi,%r15
+ cmovnzq %r13,%rsi
+ cmovnzq %r15,%r13
+
+ movq %rcx,%r15
+ subq $-128,%rsi
+ subq $-128,%rcx
+ subq $-128,%rdi
+
+ andq $4095,%r15
+ addq $320,%r15
+.byte 0x67,0x67
+ shrq $12,%r15
+ jz L$mul_1024_no_n_copy
+
+
+
+
+
+ subq $320,%rsp
+ vmovdqu 0-128(%rcx),%ymm0
+ andq $-512,%rsp
+ vmovdqu 32-128(%rcx),%ymm1
+ vmovdqu 64-128(%rcx),%ymm2
+ vmovdqu 96-128(%rcx),%ymm3
+ vmovdqu 128-128(%rcx),%ymm4
+ vmovdqu 160-128(%rcx),%ymm5
+ vmovdqu 192-128(%rcx),%ymm6
+ vmovdqu 224-128(%rcx),%ymm7
+ vmovdqu 256-128(%rcx),%ymm8
+ leaq 64+128(%rsp),%rcx
+ vmovdqu %ymm0,0-128(%rcx)
+ vpxor %ymm0,%ymm0,%ymm0
+ vmovdqu %ymm1,32-128(%rcx)
+ vpxor %ymm1,%ymm1,%ymm1
+ vmovdqu %ymm2,64-128(%rcx)
+ vpxor %ymm2,%ymm2,%ymm2
+ vmovdqu %ymm3,96-128(%rcx)
+ vpxor %ymm3,%ymm3,%ymm3
+ vmovdqu %ymm4,128-128(%rcx)
+ vpxor %ymm4,%ymm4,%ymm4
+ vmovdqu %ymm5,160-128(%rcx)
+ vpxor %ymm5,%ymm5,%ymm5
+ vmovdqu %ymm6,192-128(%rcx)
+ vpxor %ymm6,%ymm6,%ymm6
+ vmovdqu %ymm7,224-128(%rcx)
+ vpxor %ymm7,%ymm7,%ymm7
+ vmovdqu %ymm8,256-128(%rcx)
+ vmovdqa %ymm0,%ymm8
+ vmovdqu %ymm9,288-128(%rcx)
+L$mul_1024_no_n_copy:
+ andq $-64,%rsp
+
+ movq (%r13),%rbx
+ vpbroadcastq (%r13),%ymm10
+ vmovdqu %ymm0,(%rsp)
+ xorq %r9,%r9
+.byte 0x67
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %r12,%r12
+
+ vmovdqu L$and_mask(%rip),%ymm15
+ movl $9,%r14d
+ vmovdqu %ymm9,288-128(%rdi)
+ jmp L$oop_mul_1024
+
+.p2align 5
+L$oop_mul_1024:
+ vpsrlq $29,%ymm3,%ymm9
+ movq %rbx,%rax
+ imulq -128(%rsi),%rax
+ addq %r9,%rax
+ movq %rbx,%r10
+ imulq 8-128(%rsi),%r10
+ addq 8(%rsp),%r10
+
+ movq %rax,%r9
+ imull %r8d,%eax
+ andl $0x1fffffff,%eax
+
+ movq %rbx,%r11
+ imulq 16-128(%rsi),%r11
+ addq 16(%rsp),%r11
+
+ movq %rbx,%r12
+ imulq 24-128(%rsi),%r12
+ addq 24(%rsp),%r12
+ vpmuludq 32-128(%rsi),%ymm10,%ymm0
+ vmovd %eax,%xmm11
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq 64-128(%rsi),%ymm10,%ymm12
+ vpbroadcastq %xmm11,%ymm11
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq 96-128(%rsi),%ymm10,%ymm13
+ vpand %ymm15,%ymm3,%ymm3
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq 128-128(%rsi),%ymm10,%ymm0
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq 160-128(%rsi),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq 192-128(%rsi),%ymm10,%ymm13
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq 224-128(%rsi),%ymm10,%ymm0
+ vpermq $0x93,%ymm9,%ymm9
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq 256-128(%rsi),%ymm10,%ymm12
+ vpbroadcastq 8(%r13),%ymm10
+ vpaddq %ymm12,%ymm8,%ymm8
+
+ movq %rax,%rdx
+ imulq -128(%rcx),%rax
+ addq %rax,%r9
+ movq %rdx,%rax
+ imulq 8-128(%rcx),%rax
+ addq %rax,%r10
+ movq %rdx,%rax
+ imulq 16-128(%rcx),%rax
+ addq %rax,%r11
+ shrq $29,%r9
+ imulq 24-128(%rcx),%rdx
+ addq %rdx,%r12
+ addq %r9,%r10
+
+ vpmuludq 32-128(%rcx),%ymm11,%ymm13
+ vmovq %xmm10,%rbx
+ vpaddq %ymm13,%ymm1,%ymm1
+ vpmuludq 64-128(%rcx),%ymm11,%ymm0
+ vpaddq %ymm0,%ymm2,%ymm2
+ vpmuludq 96-128(%rcx),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm3,%ymm3
+ vpmuludq 128-128(%rcx),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm4,%ymm4
+ vpmuludq 160-128(%rcx),%ymm11,%ymm0
+ vpaddq %ymm0,%ymm5,%ymm5
+ vpmuludq 192-128(%rcx),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm6,%ymm6
+ vpmuludq 224-128(%rcx),%ymm11,%ymm13
+ vpblendd $3,%ymm14,%ymm9,%ymm12
+ vpaddq %ymm13,%ymm7,%ymm7
+ vpmuludq 256-128(%rcx),%ymm11,%ymm0
+ vpaddq %ymm12,%ymm3,%ymm3
+ vpaddq %ymm0,%ymm8,%ymm8
+
+ movq %rbx,%rax
+ imulq -128(%rsi),%rax
+ addq %rax,%r10
+ vmovdqu -8+32-128(%rsi),%ymm12
+ movq %rbx,%rax
+ imulq 8-128(%rsi),%rax
+ addq %rax,%r11
+ vmovdqu -8+64-128(%rsi),%ymm13
+
+ movq %r10,%rax
+ vpblendd $0xfc,%ymm14,%ymm9,%ymm9
+ imull %r8d,%eax
+ vpaddq %ymm9,%ymm4,%ymm4
+ andl $0x1fffffff,%eax
+
+ imulq 16-128(%rsi),%rbx
+ addq %rbx,%r12
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovd %eax,%xmm11
+ vmovdqu -8+96-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm1,%ymm1
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vpbroadcastq %xmm11,%ymm11
+ vmovdqu -8+128-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm2,%ymm2
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -8+160-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm3,%ymm3
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovdqu -8+192-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm4,%ymm4
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -8+224-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm5,%ymm5
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -8+256-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm6,%ymm6
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovdqu -8+288-128(%rsi),%ymm9
+ vpaddq %ymm12,%ymm7,%ymm7
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vpaddq %ymm13,%ymm8,%ymm8
+ vpmuludq %ymm10,%ymm9,%ymm9
+ vpbroadcastq 16(%r13),%ymm10
+
+ movq %rax,%rdx
+ imulq -128(%rcx),%rax
+ addq %rax,%r10
+ vmovdqu -8+32-128(%rcx),%ymm0
+ movq %rdx,%rax
+ imulq 8-128(%rcx),%rax
+ addq %rax,%r11
+ vmovdqu -8+64-128(%rcx),%ymm12
+ shrq $29,%r10
+ imulq 16-128(%rcx),%rdx
+ addq %rdx,%r12
+ addq %r10,%r11
+
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovq %xmm10,%rbx
+ vmovdqu -8+96-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -8+128-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -8+160-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -8+192-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -8+224-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -8+256-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -8+288-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vpaddq %ymm13,%ymm9,%ymm9
+
+ vmovdqu -16+32-128(%rsi),%ymm0
+ movq %rbx,%rax
+ imulq -128(%rsi),%rax
+ addq %r11,%rax
+
+ vmovdqu -16+64-128(%rsi),%ymm12
+ movq %rax,%r11
+ imull %r8d,%eax
+ andl $0x1fffffff,%eax
+
+ imulq 8-128(%rsi),%rbx
+ addq %rbx,%r12
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovd %eax,%xmm11
+ vmovdqu -16+96-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vpbroadcastq %xmm11,%ymm11
+ vmovdqu -16+128-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -16+160-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -16+192-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovdqu -16+224-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -16+256-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -16+288-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vpbroadcastq 24(%r13),%ymm10
+ vpaddq %ymm13,%ymm9,%ymm9
+
+ vmovdqu -16+32-128(%rcx),%ymm0
+ movq %rax,%rdx
+ imulq -128(%rcx),%rax
+ addq %rax,%r11
+ vmovdqu -16+64-128(%rcx),%ymm12
+ imulq 8-128(%rcx),%rdx
+ addq %rdx,%r12
+ shrq $29,%r11
+
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovq %xmm10,%rbx
+ vmovdqu -16+96-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -16+128-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -16+160-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -16+192-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -16+224-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -16+256-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -16+288-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -24+32-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -24+64-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm9,%ymm9
+
+ addq %r11,%r12
+ imulq -128(%rsi),%rbx
+ addq %rbx,%r12
+
+ movq %r12,%rax
+ imull %r8d,%eax
+ andl $0x1fffffff,%eax
+
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovd %eax,%xmm11
+ vmovdqu -24+96-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vpbroadcastq %xmm11,%ymm11
+ vmovdqu -24+128-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -24+160-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -24+192-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovdqu -24+224-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -24+256-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -24+288-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vpbroadcastq 32(%r13),%ymm10
+ vpaddq %ymm13,%ymm9,%ymm9
+ addq $32,%r13
+
+ vmovdqu -24+32-128(%rcx),%ymm0
+ imulq -128(%rcx),%rax
+ addq %rax,%r12
+ shrq $29,%r12
+
+ vmovdqu -24+64-128(%rcx),%ymm12
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovq %xmm10,%rbx
+ vmovdqu -24+96-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm0
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu %ymm0,(%rsp)
+ vpaddq %ymm12,%ymm2,%ymm1
+ vmovdqu -24+128-128(%rcx),%ymm0
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -24+160-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm2
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -24+192-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm3
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -24+224-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm4
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -24+256-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm5
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -24+288-128(%rcx),%ymm13
+ movq %r12,%r9
+ vpaddq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm11,%ymm12,%ymm12
+ addq (%rsp),%r9
+ vpaddq %ymm12,%ymm8,%ymm7
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovq %r12,%xmm12
+ vpaddq %ymm13,%ymm9,%ymm8
+
+ decl %r14d
+ jnz L$oop_mul_1024
+ vpaddq (%rsp),%ymm12,%ymm0
+
+ vpsrlq $29,%ymm0,%ymm12
+ vpand %ymm15,%ymm0,%ymm0
+ vpsrlq $29,%ymm1,%ymm13
+ vpand %ymm15,%ymm1,%ymm1
+ vpsrlq $29,%ymm2,%ymm10
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm2,%ymm2
+ vpsrlq $29,%ymm3,%ymm11
+ vpermq $0x93,%ymm13,%ymm13
+ vpand %ymm15,%ymm3,%ymm3
+
+ vpblendd $3,%ymm14,%ymm12,%ymm9
+ vpermq $0x93,%ymm10,%ymm10
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpermq $0x93,%ymm11,%ymm11
+ vpaddq %ymm9,%ymm0,%ymm0
+ vpblendd $3,%ymm13,%ymm10,%ymm13
+ vpaddq %ymm12,%ymm1,%ymm1
+ vpblendd $3,%ymm10,%ymm11,%ymm10
+ vpaddq %ymm13,%ymm2,%ymm2
+ vpblendd $3,%ymm11,%ymm14,%ymm11
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm11,%ymm4,%ymm4
+
+ vpsrlq $29,%ymm0,%ymm12
+ vpand %ymm15,%ymm0,%ymm0
+ vpsrlq $29,%ymm1,%ymm13
+ vpand %ymm15,%ymm1,%ymm1
+ vpsrlq $29,%ymm2,%ymm10
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm2,%ymm2
+ vpsrlq $29,%ymm3,%ymm11
+ vpermq $0x93,%ymm13,%ymm13
+ vpand %ymm15,%ymm3,%ymm3
+ vpermq $0x93,%ymm10,%ymm10
+
+ vpblendd $3,%ymm14,%ymm12,%ymm9
+ vpermq $0x93,%ymm11,%ymm11
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm9,%ymm0,%ymm0
+ vpblendd $3,%ymm13,%ymm10,%ymm13
+ vpaddq %ymm12,%ymm1,%ymm1
+ vpblendd $3,%ymm10,%ymm11,%ymm10
+ vpaddq %ymm13,%ymm2,%ymm2
+ vpblendd $3,%ymm11,%ymm14,%ymm11
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm11,%ymm4,%ymm4
+
+ vmovdqu %ymm0,0-128(%rdi)
+ vmovdqu %ymm1,32-128(%rdi)
+ vmovdqu %ymm2,64-128(%rdi)
+ vmovdqu %ymm3,96-128(%rdi)
+ vpsrlq $29,%ymm4,%ymm12
+ vpand %ymm15,%ymm4,%ymm4
+ vpsrlq $29,%ymm5,%ymm13
+ vpand %ymm15,%ymm5,%ymm5
+ vpsrlq $29,%ymm6,%ymm10
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm6,%ymm6
+ vpsrlq $29,%ymm7,%ymm11
+ vpermq $0x93,%ymm13,%ymm13
+ vpand %ymm15,%ymm7,%ymm7
+ vpsrlq $29,%ymm8,%ymm0
+ vpermq $0x93,%ymm10,%ymm10
+ vpand %ymm15,%ymm8,%ymm8
+ vpermq $0x93,%ymm11,%ymm11
+
+ vpblendd $3,%ymm14,%ymm12,%ymm9
+ vpermq $0x93,%ymm0,%ymm0
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm9,%ymm4,%ymm4
+ vpblendd $3,%ymm13,%ymm10,%ymm13
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpblendd $3,%ymm10,%ymm11,%ymm10
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpblendd $3,%ymm11,%ymm0,%ymm11
+ vpaddq %ymm10,%ymm7,%ymm7
+ vpaddq %ymm11,%ymm8,%ymm8
+
+ vpsrlq $29,%ymm4,%ymm12
+ vpand %ymm15,%ymm4,%ymm4
+ vpsrlq $29,%ymm5,%ymm13
+ vpand %ymm15,%ymm5,%ymm5
+ vpsrlq $29,%ymm6,%ymm10
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm6,%ymm6
+ vpsrlq $29,%ymm7,%ymm11
+ vpermq $0x93,%ymm13,%ymm13
+ vpand %ymm15,%ymm7,%ymm7
+ vpsrlq $29,%ymm8,%ymm0
+ vpermq $0x93,%ymm10,%ymm10
+ vpand %ymm15,%ymm8,%ymm8
+ vpermq $0x93,%ymm11,%ymm11
+
+ vpblendd $3,%ymm14,%ymm12,%ymm9
+ vpermq $0x93,%ymm0,%ymm0
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm9,%ymm4,%ymm4
+ vpblendd $3,%ymm13,%ymm10,%ymm13
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpblendd $3,%ymm10,%ymm11,%ymm10
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpblendd $3,%ymm11,%ymm0,%ymm11
+ vpaddq %ymm10,%ymm7,%ymm7
+ vpaddq %ymm11,%ymm8,%ymm8
+
+ vmovdqu %ymm4,128-128(%rdi)
+ vmovdqu %ymm5,160-128(%rdi)
+ vmovdqu %ymm6,192-128(%rdi)
+ vmovdqu %ymm7,224-128(%rdi)
+ vmovdqu %ymm8,256-128(%rdi)
+ vzeroupper
+
+ movq %rbp,%rax
+
+ movq -48(%rax),%r15
+
+ movq -40(%rax),%r14
+
+ movq -32(%rax),%r13
+
+ movq -24(%rax),%r12
+
+ movq -16(%rax),%rbp
+
+ movq -8(%rax),%rbx
+
+ leaq (%rax),%rsp
+
+L$mul_1024_epilogue:
+ ret
+
+
+.globl _rsaz_1024_red2norm_avx2
+.private_extern _rsaz_1024_red2norm_avx2
+
+.p2align 5
+_rsaz_1024_red2norm_avx2:
+
+_CET_ENDBR
+ subq $-128,%rsi
+ xorq %rax,%rax
+ movq -128(%rsi),%r8
+ movq -120(%rsi),%r9
+ movq -112(%rsi),%r10
+ shlq $0,%r8
+ shlq $29,%r9
+ movq %r10,%r11
+ shlq $58,%r10
+ shrq $6,%r11
+ addq %r8,%rax
+ addq %r9,%rax
+ addq %r10,%rax
+ adcq $0,%r11
+ movq %rax,0(%rdi)
+ movq %r11,%rax
+ movq -104(%rsi),%r8
+ movq -96(%rsi),%r9
+ shlq $23,%r8
+ movq %r9,%r10
+ shlq $52,%r9
+ shrq $12,%r10
+ addq %r8,%rax
+ addq %r9,%rax
+ adcq $0,%r10
+ movq %rax,8(%rdi)
+ movq %r10,%rax
+ movq -88(%rsi),%r11
+ movq -80(%rsi),%r8
+ shlq $17,%r11
+ movq %r8,%r9
+ shlq $46,%r8
+ shrq $18,%r9
+ addq %r11,%rax
+ addq %r8,%rax
+ adcq $0,%r9
+ movq %rax,16(%rdi)
+ movq %r9,%rax
+ movq -72(%rsi),%r10
+ movq -64(%rsi),%r11
+ shlq $11,%r10
+ movq %r11,%r8
+ shlq $40,%r11
+ shrq $24,%r8
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,24(%rdi)
+ movq %r8,%rax
+ movq -56(%rsi),%r9
+ movq -48(%rsi),%r10
+ movq -40(%rsi),%r11
+ shlq $5,%r9
+ shlq $34,%r10
+ movq %r11,%r8
+ shlq $63,%r11
+ shrq $1,%r8
+ addq %r9,%rax
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,32(%rdi)
+ movq %r8,%rax
+ movq -32(%rsi),%r9
+ movq -24(%rsi),%r10
+ shlq $28,%r9
+ movq %r10,%r11
+ shlq $57,%r10
+ shrq $7,%r11
+ addq %r9,%rax
+ addq %r10,%rax
+ adcq $0,%r11
+ movq %rax,40(%rdi)
+ movq %r11,%rax
+ movq -16(%rsi),%r8
+ movq -8(%rsi),%r9
+ shlq $22,%r8
+ movq %r9,%r10
+ shlq $51,%r9
+ shrq $13,%r10
+ addq %r8,%rax
+ addq %r9,%rax
+ adcq $0,%r10
+ movq %rax,48(%rdi)
+ movq %r10,%rax
+ movq 0(%rsi),%r11
+ movq 8(%rsi),%r8
+ shlq $16,%r11
+ movq %r8,%r9
+ shlq $45,%r8
+ shrq $19,%r9
+ addq %r11,%rax
+ addq %r8,%rax
+ adcq $0,%r9
+ movq %rax,56(%rdi)
+ movq %r9,%rax
+ movq 16(%rsi),%r10
+ movq 24(%rsi),%r11
+ shlq $10,%r10
+ movq %r11,%r8
+ shlq $39,%r11
+ shrq $25,%r8
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,64(%rdi)
+ movq %r8,%rax
+ movq 32(%rsi),%r9
+ movq 40(%rsi),%r10
+ movq 48(%rsi),%r11
+ shlq $4,%r9
+ shlq $33,%r10
+ movq %r11,%r8
+ shlq $62,%r11
+ shrq $2,%r8
+ addq %r9,%rax
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,72(%rdi)
+ movq %r8,%rax
+ movq 56(%rsi),%r9
+ movq 64(%rsi),%r10
+ shlq $27,%r9
+ movq %r10,%r11
+ shlq $56,%r10
+ shrq $8,%r11
+ addq %r9,%rax
+ addq %r10,%rax
+ adcq $0,%r11
+ movq %rax,80(%rdi)
+ movq %r11,%rax
+ movq 72(%rsi),%r8
+ movq 80(%rsi),%r9
+ shlq $21,%r8
+ movq %r9,%r10
+ shlq $50,%r9
+ shrq $14,%r10
+ addq %r8,%rax
+ addq %r9,%rax
+ adcq $0,%r10
+ movq %rax,88(%rdi)
+ movq %r10,%rax
+ movq 88(%rsi),%r11
+ movq 96(%rsi),%r8
+ shlq $15,%r11
+ movq %r8,%r9
+ shlq $44,%r8
+ shrq $20,%r9
+ addq %r11,%rax
+ addq %r8,%rax
+ adcq $0,%r9
+ movq %rax,96(%rdi)
+ movq %r9,%rax
+ movq 104(%rsi),%r10
+ movq 112(%rsi),%r11
+ shlq $9,%r10
+ movq %r11,%r8
+ shlq $38,%r11
+ shrq $26,%r8
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,104(%rdi)
+ movq %r8,%rax
+ movq 120(%rsi),%r9
+ movq 128(%rsi),%r10
+ movq 136(%rsi),%r11
+ shlq $3,%r9
+ shlq $32,%r10
+ movq %r11,%r8
+ shlq $61,%r11
+ shrq $3,%r8
+ addq %r9,%rax
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,112(%rdi)
+ movq %r8,%rax
+ movq 144(%rsi),%r9
+ movq 152(%rsi),%r10
+ shlq $26,%r9
+ movq %r10,%r11
+ shlq $55,%r10
+ shrq $9,%r11
+ addq %r9,%rax
+ addq %r10,%rax
+ adcq $0,%r11
+ movq %rax,120(%rdi)
+ movq %r11,%rax
+ ret
+
+
+
+.globl _rsaz_1024_norm2red_avx2
+.private_extern _rsaz_1024_norm2red_avx2
+
+.p2align 5
+_rsaz_1024_norm2red_avx2:
+
+_CET_ENDBR
+ subq $-128,%rdi
+ movq (%rsi),%r8
+ movl $0x1fffffff,%eax
+ movq 8(%rsi),%r9
+ movq %r8,%r11
+ shrq $0,%r11
+ andq %rax,%r11
+ movq %r11,-128(%rdi)
+ movq %r8,%r10
+ shrq $29,%r10
+ andq %rax,%r10
+ movq %r10,-120(%rdi)
+ shrdq $58,%r9,%r8
+ andq %rax,%r8
+ movq %r8,-112(%rdi)
+ movq 16(%rsi),%r10
+ movq %r9,%r8
+ shrq $23,%r8
+ andq %rax,%r8
+ movq %r8,-104(%rdi)
+ shrdq $52,%r10,%r9
+ andq %rax,%r9
+ movq %r9,-96(%rdi)
+ movq 24(%rsi),%r11
+ movq %r10,%r9
+ shrq $17,%r9
+ andq %rax,%r9
+ movq %r9,-88(%rdi)
+ shrdq $46,%r11,%r10
+ andq %rax,%r10
+ movq %r10,-80(%rdi)
+ movq 32(%rsi),%r8
+ movq %r11,%r10
+ shrq $11,%r10
+ andq %rax,%r10
+ movq %r10,-72(%rdi)
+ shrdq $40,%r8,%r11
+ andq %rax,%r11
+ movq %r11,-64(%rdi)
+ movq 40(%rsi),%r9
+ movq %r8,%r11
+ shrq $5,%r11
+ andq %rax,%r11
+ movq %r11,-56(%rdi)
+ movq %r8,%r10
+ shrq $34,%r10
+ andq %rax,%r10
+ movq %r10,-48(%rdi)
+ shrdq $63,%r9,%r8
+ andq %rax,%r8
+ movq %r8,-40(%rdi)
+ movq 48(%rsi),%r10
+ movq %r9,%r8
+ shrq $28,%r8
+ andq %rax,%r8
+ movq %r8,-32(%rdi)
+ shrdq $57,%r10,%r9
+ andq %rax,%r9
+ movq %r9,-24(%rdi)
+ movq 56(%rsi),%r11
+ movq %r10,%r9
+ shrq $22,%r9
+ andq %rax,%r9
+ movq %r9,-16(%rdi)
+ shrdq $51,%r11,%r10
+ andq %rax,%r10
+ movq %r10,-8(%rdi)
+ movq 64(%rsi),%r8
+ movq %r11,%r10
+ shrq $16,%r10
+ andq %rax,%r10
+ movq %r10,0(%rdi)
+ shrdq $45,%r8,%r11
+ andq %rax,%r11
+ movq %r11,8(%rdi)
+ movq 72(%rsi),%r9
+ movq %r8,%r11
+ shrq $10,%r11
+ andq %rax,%r11
+ movq %r11,16(%rdi)
+ shrdq $39,%r9,%r8
+ andq %rax,%r8
+ movq %r8,24(%rdi)
+ movq 80(%rsi),%r10
+ movq %r9,%r8
+ shrq $4,%r8
+ andq %rax,%r8
+ movq %r8,32(%rdi)
+ movq %r9,%r11
+ shrq $33,%r11
+ andq %rax,%r11
+ movq %r11,40(%rdi)
+ shrdq $62,%r10,%r9
+ andq %rax,%r9
+ movq %r9,48(%rdi)
+ movq 88(%rsi),%r11
+ movq %r10,%r9
+ shrq $27,%r9
+ andq %rax,%r9
+ movq %r9,56(%rdi)
+ shrdq $56,%r11,%r10
+ andq %rax,%r10
+ movq %r10,64(%rdi)
+ movq 96(%rsi),%r8
+ movq %r11,%r10
+ shrq $21,%r10
+ andq %rax,%r10
+ movq %r10,72(%rdi)
+ shrdq $50,%r8,%r11
+ andq %rax,%r11
+ movq %r11,80(%rdi)
+ movq 104(%rsi),%r9
+ movq %r8,%r11
+ shrq $15,%r11
+ andq %rax,%r11
+ movq %r11,88(%rdi)
+ shrdq $44,%r9,%r8
+ andq %rax,%r8
+ movq %r8,96(%rdi)
+ movq 112(%rsi),%r10
+ movq %r9,%r8
+ shrq $9,%r8
+ andq %rax,%r8
+ movq %r8,104(%rdi)
+ shrdq $38,%r10,%r9
+ andq %rax,%r9
+ movq %r9,112(%rdi)
+ movq 120(%rsi),%r11
+ movq %r10,%r9
+ shrq $3,%r9
+ andq %rax,%r9
+ movq %r9,120(%rdi)
+ movq %r10,%r8
+ shrq $32,%r8
+ andq %rax,%r8
+ movq %r8,128(%rdi)
+ shrdq $61,%r11,%r10
+ andq %rax,%r10
+ movq %r10,136(%rdi)
+ xorq %r8,%r8
+ movq %r11,%r10
+ shrq $26,%r10
+ andq %rax,%r10
+ movq %r10,144(%rdi)
+ shrdq $55,%r8,%r11
+ andq %rax,%r11
+ movq %r11,152(%rdi)
+ movq %r8,160(%rdi)
+ movq %r8,168(%rdi)
+ movq %r8,176(%rdi)
+ movq %r8,184(%rdi)
+ ret
+
+
+.globl _rsaz_1024_scatter5_avx2
+.private_extern _rsaz_1024_scatter5_avx2
+
+.p2align 5
+_rsaz_1024_scatter5_avx2:
+
+_CET_ENDBR
+ vzeroupper
+ vmovdqu L$scatter_permd(%rip),%ymm5
+ shll $4,%edx
+ leaq (%rdi,%rdx,1),%rdi
+ movl $9,%eax
+ jmp L$oop_scatter_1024
+
+.p2align 5
+L$oop_scatter_1024:
+ vmovdqu (%rsi),%ymm0
+ leaq 32(%rsi),%rsi
+ vpermd %ymm0,%ymm5,%ymm0
+ vmovdqu %xmm0,(%rdi)
+ leaq 512(%rdi),%rdi
+ decl %eax
+ jnz L$oop_scatter_1024
+
+ vzeroupper
+ ret
+
+
+
+.globl _rsaz_1024_gather5_avx2
+.private_extern _rsaz_1024_gather5_avx2
+
+.p2align 5
+_rsaz_1024_gather5_avx2:
+
+_CET_ENDBR
+ vzeroupper
+ movq %rsp,%r11
+
+ leaq -256(%rsp),%rsp
+ andq $-32,%rsp
+ leaq L$inc(%rip),%r10
+ leaq -128(%rsp),%rax
+
+ vmovd %edx,%xmm4
+ vmovdqa (%r10),%ymm0
+ vmovdqa 32(%r10),%ymm1
+ vmovdqa 64(%r10),%ymm5
+ vpbroadcastd %xmm4,%ymm4
+
+ vpaddd %ymm5,%ymm0,%ymm2
+ vpcmpeqd %ymm4,%ymm0,%ymm0
+ vpaddd %ymm5,%ymm1,%ymm3
+ vpcmpeqd %ymm4,%ymm1,%ymm1
+ vmovdqa %ymm0,0+128(%rax)
+ vpaddd %ymm5,%ymm2,%ymm0
+ vpcmpeqd %ymm4,%ymm2,%ymm2
+ vmovdqa %ymm1,32+128(%rax)
+ vpaddd %ymm5,%ymm3,%ymm1
+ vpcmpeqd %ymm4,%ymm3,%ymm3
+ vmovdqa %ymm2,64+128(%rax)
+ vpaddd %ymm5,%ymm0,%ymm2
+ vpcmpeqd %ymm4,%ymm0,%ymm0
+ vmovdqa %ymm3,96+128(%rax)
+ vpaddd %ymm5,%ymm1,%ymm3
+ vpcmpeqd %ymm4,%ymm1,%ymm1
+ vmovdqa %ymm0,128+128(%rax)
+ vpaddd %ymm5,%ymm2,%ymm8
+ vpcmpeqd %ymm4,%ymm2,%ymm2
+ vmovdqa %ymm1,160+128(%rax)
+ vpaddd %ymm5,%ymm3,%ymm9
+ vpcmpeqd %ymm4,%ymm3,%ymm3
+ vmovdqa %ymm2,192+128(%rax)
+ vpaddd %ymm5,%ymm8,%ymm10
+ vpcmpeqd %ymm4,%ymm8,%ymm8
+ vmovdqa %ymm3,224+128(%rax)
+ vpaddd %ymm5,%ymm9,%ymm11
+ vpcmpeqd %ymm4,%ymm9,%ymm9
+ vpaddd %ymm5,%ymm10,%ymm12
+ vpcmpeqd %ymm4,%ymm10,%ymm10
+ vpaddd %ymm5,%ymm11,%ymm13
+ vpcmpeqd %ymm4,%ymm11,%ymm11
+ vpaddd %ymm5,%ymm12,%ymm14
+ vpcmpeqd %ymm4,%ymm12,%ymm12
+ vpaddd %ymm5,%ymm13,%ymm15
+ vpcmpeqd %ymm4,%ymm13,%ymm13
+ vpcmpeqd %ymm4,%ymm14,%ymm14
+ vpcmpeqd %ymm4,%ymm15,%ymm15
+
+ vmovdqa -32(%r10),%ymm7
+ leaq 128(%rsi),%rsi
+ movl $9,%edx
+
+L$oop_gather_1024:
+ vmovdqa 0-128(%rsi),%ymm0
+ vmovdqa 32-128(%rsi),%ymm1
+ vmovdqa 64-128(%rsi),%ymm2
+ vmovdqa 96-128(%rsi),%ymm3
+ vpand 0+128(%rax),%ymm0,%ymm0
+ vpand 32+128(%rax),%ymm1,%ymm1
+ vpand 64+128(%rax),%ymm2,%ymm2
+ vpor %ymm0,%ymm1,%ymm4
+ vpand 96+128(%rax),%ymm3,%ymm3
+ vmovdqa 128-128(%rsi),%ymm0
+ vmovdqa 160-128(%rsi),%ymm1
+ vpor %ymm2,%ymm3,%ymm5
+ vmovdqa 192-128(%rsi),%ymm2
+ vmovdqa 224-128(%rsi),%ymm3
+ vpand 128+128(%rax),%ymm0,%ymm0
+ vpand 160+128(%rax),%ymm1,%ymm1
+ vpand 192+128(%rax),%ymm2,%ymm2
+ vpor %ymm0,%ymm4,%ymm4
+ vpand 224+128(%rax),%ymm3,%ymm3
+ vpand 256-128(%rsi),%ymm8,%ymm0
+ vpor %ymm1,%ymm5,%ymm5
+ vpand 288-128(%rsi),%ymm9,%ymm1
+ vpor %ymm2,%ymm4,%ymm4
+ vpand 320-128(%rsi),%ymm10,%ymm2
+ vpor %ymm3,%ymm5,%ymm5
+ vpand 352-128(%rsi),%ymm11,%ymm3
+ vpor %ymm0,%ymm4,%ymm4
+ vpand 384-128(%rsi),%ymm12,%ymm0
+ vpor %ymm1,%ymm5,%ymm5
+ vpand 416-128(%rsi),%ymm13,%ymm1
+ vpor %ymm2,%ymm4,%ymm4
+ vpand 448-128(%rsi),%ymm14,%ymm2
+ vpor %ymm3,%ymm5,%ymm5
+ vpand 480-128(%rsi),%ymm15,%ymm3
+ leaq 512(%rsi),%rsi
+ vpor %ymm0,%ymm4,%ymm4
+ vpor %ymm1,%ymm5,%ymm5
+ vpor %ymm2,%ymm4,%ymm4
+ vpor %ymm3,%ymm5,%ymm5
+
+ vpor %ymm5,%ymm4,%ymm4
+ vextracti128 $1,%ymm4,%xmm5
+ vpor %xmm4,%xmm5,%xmm5
+ vpermd %ymm5,%ymm7,%ymm5
+ vmovdqu %ymm5,(%rdi)
+ leaq 32(%rdi),%rdi
+ decl %edx
+ jnz L$oop_gather_1024
+
+ vpxor %ymm0,%ymm0,%ymm0
+ vmovdqu %ymm0,(%rdi)
+ vzeroupper
+ leaq (%r11),%rsp
+
+ ret
+
+L$SEH_end_rsaz_1024_gather5:
+
+.section __DATA,__const
+.p2align 6
+L$and_mask:
+.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
+L$scatter_permd:
+.long 0,2,4,6,7,7,7,7
+L$gather_permd:
+.long 0,7,1,7,2,7,3,7
+L$inc:
+.long 0,0,0,0, 1,1,1,1
+.long 2,2,2,2, 3,3,3,3
+.long 4,4,4,4, 4,4,4,4
+.p2align 6
+.text
+#endif
diff --git a/gen/bcm/rsaz-avx2-linux.S b/gen/bcm/rsaz-avx2-linux.S
new file mode 100644
index 0000000..65a6c2e
--- /dev/null
+++ b/gen/bcm/rsaz-avx2-linux.S
@@ -0,0 +1,1749 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text
+
+.globl rsaz_1024_sqr_avx2
+.hidden rsaz_1024_sqr_avx2
+.type rsaz_1024_sqr_avx2,@function
+.align 64
+rsaz_1024_sqr_avx2:
+.cfi_startproc
+_CET_ENDBR
+ leaq (%rsp),%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ vzeroupper
+ movq %rax,%rbp
+.cfi_def_cfa_register %rbp
+ movq %rdx,%r13
+ subq $832,%rsp
+ movq %r13,%r15
+ subq $-128,%rdi
+ subq $-128,%rsi
+ subq $-128,%r13
+
+ andq $4095,%r15
+ addq $320,%r15
+ shrq $12,%r15
+ vpxor %ymm9,%ymm9,%ymm9
+ jz .Lsqr_1024_no_n_copy
+
+
+
+
+
+ subq $320,%rsp
+ vmovdqu 0-128(%r13),%ymm0
+ andq $-2048,%rsp
+ vmovdqu 32-128(%r13),%ymm1
+ vmovdqu 64-128(%r13),%ymm2
+ vmovdqu 96-128(%r13),%ymm3
+ vmovdqu 128-128(%r13),%ymm4
+ vmovdqu 160-128(%r13),%ymm5
+ vmovdqu 192-128(%r13),%ymm6
+ vmovdqu 224-128(%r13),%ymm7
+ vmovdqu 256-128(%r13),%ymm8
+ leaq 832+128(%rsp),%r13
+ vmovdqu %ymm0,0-128(%r13)
+ vmovdqu %ymm1,32-128(%r13)
+ vmovdqu %ymm2,64-128(%r13)
+ vmovdqu %ymm3,96-128(%r13)
+ vmovdqu %ymm4,128-128(%r13)
+ vmovdqu %ymm5,160-128(%r13)
+ vmovdqu %ymm6,192-128(%r13)
+ vmovdqu %ymm7,224-128(%r13)
+ vmovdqu %ymm8,256-128(%r13)
+ vmovdqu %ymm9,288-128(%r13)
+
+.Lsqr_1024_no_n_copy:
+ andq $-1024,%rsp
+
+ vmovdqu 32-128(%rsi),%ymm1
+ vmovdqu 64-128(%rsi),%ymm2
+ vmovdqu 96-128(%rsi),%ymm3
+ vmovdqu 128-128(%rsi),%ymm4
+ vmovdqu 160-128(%rsi),%ymm5
+ vmovdqu 192-128(%rsi),%ymm6
+ vmovdqu 224-128(%rsi),%ymm7
+ vmovdqu 256-128(%rsi),%ymm8
+
+ leaq 192(%rsp),%rbx
+ vmovdqu .Land_mask(%rip),%ymm15
+ jmp .LOOP_GRANDE_SQR_1024
+
+.align 32
+.LOOP_GRANDE_SQR_1024:
+ leaq 576+128(%rsp),%r9
+ leaq 448(%rsp),%r12
+
+
+
+
+ vpaddq %ymm1,%ymm1,%ymm1
+ vpbroadcastq 0-128(%rsi),%ymm10
+ vpaddq %ymm2,%ymm2,%ymm2
+ vmovdqa %ymm1,0-128(%r9)
+ vpaddq %ymm3,%ymm3,%ymm3
+ vmovdqa %ymm2,32-128(%r9)
+ vpaddq %ymm4,%ymm4,%ymm4
+ vmovdqa %ymm3,64-128(%r9)
+ vpaddq %ymm5,%ymm5,%ymm5
+ vmovdqa %ymm4,96-128(%r9)
+ vpaddq %ymm6,%ymm6,%ymm6
+ vmovdqa %ymm5,128-128(%r9)
+ vpaddq %ymm7,%ymm7,%ymm7
+ vmovdqa %ymm6,160-128(%r9)
+ vpaddq %ymm8,%ymm8,%ymm8
+ vmovdqa %ymm7,192-128(%r9)
+ vpxor %ymm9,%ymm9,%ymm9
+ vmovdqa %ymm8,224-128(%r9)
+
+ vpmuludq 0-128(%rsi),%ymm10,%ymm0
+ vpbroadcastq 32-128(%rsi),%ymm11
+ vmovdqu %ymm9,288-192(%rbx)
+ vpmuludq %ymm10,%ymm1,%ymm1
+ vmovdqu %ymm9,320-448(%r12)
+ vpmuludq %ymm10,%ymm2,%ymm2
+ vmovdqu %ymm9,352-448(%r12)
+ vpmuludq %ymm10,%ymm3,%ymm3
+ vmovdqu %ymm9,384-448(%r12)
+ vpmuludq %ymm10,%ymm4,%ymm4
+ vmovdqu %ymm9,416-448(%r12)
+ vpmuludq %ymm10,%ymm5,%ymm5
+ vmovdqu %ymm9,448-448(%r12)
+ vpmuludq %ymm10,%ymm6,%ymm6
+ vmovdqu %ymm9,480-448(%r12)
+ vpmuludq %ymm10,%ymm7,%ymm7
+ vmovdqu %ymm9,512-448(%r12)
+ vpmuludq %ymm10,%ymm8,%ymm8
+ vpbroadcastq 64-128(%rsi),%ymm10
+ vmovdqu %ymm9,544-448(%r12)
+
+ movq %rsi,%r15
+ movl $4,%r14d
+ jmp .Lsqr_entry_1024
+.align 32
+.LOOP_SQR_1024:
+ vpbroadcastq 32-128(%r15),%ymm11
+ vpmuludq 0-128(%rsi),%ymm10,%ymm0
+ vpaddq 0-192(%rbx),%ymm0,%ymm0
+ vpmuludq 0-128(%r9),%ymm10,%ymm1
+ vpaddq 32-192(%rbx),%ymm1,%ymm1
+ vpmuludq 32-128(%r9),%ymm10,%ymm2
+ vpaddq 64-192(%rbx),%ymm2,%ymm2
+ vpmuludq 64-128(%r9),%ymm10,%ymm3
+ vpaddq 96-192(%rbx),%ymm3,%ymm3
+ vpmuludq 96-128(%r9),%ymm10,%ymm4
+ vpaddq 128-192(%rbx),%ymm4,%ymm4
+ vpmuludq 128-128(%r9),%ymm10,%ymm5
+ vpaddq 160-192(%rbx),%ymm5,%ymm5
+ vpmuludq 160-128(%r9),%ymm10,%ymm6
+ vpaddq 192-192(%rbx),%ymm6,%ymm6
+ vpmuludq 192-128(%r9),%ymm10,%ymm7
+ vpaddq 224-192(%rbx),%ymm7,%ymm7
+ vpmuludq 224-128(%r9),%ymm10,%ymm8
+ vpbroadcastq 64-128(%r15),%ymm10
+ vpaddq 256-192(%rbx),%ymm8,%ymm8
+.Lsqr_entry_1024:
+ vmovdqu %ymm0,0-192(%rbx)
+ vmovdqu %ymm1,32-192(%rbx)
+
+ vpmuludq 32-128(%rsi),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq 32-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm3,%ymm3
+ vpmuludq 64-128(%r9),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm4,%ymm4
+ vpmuludq 96-128(%r9),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq 128-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm6,%ymm6
+ vpmuludq 160-128(%r9),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm7,%ymm7
+ vpmuludq 192-128(%r9),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq 224-128(%r9),%ymm11,%ymm0
+ vpbroadcastq 96-128(%r15),%ymm11
+ vpaddq 288-192(%rbx),%ymm0,%ymm0
+
+ vmovdqu %ymm2,64-192(%rbx)
+ vmovdqu %ymm3,96-192(%rbx)
+
+ vpmuludq 64-128(%rsi),%ymm10,%ymm13
+ vpaddq %ymm13,%ymm4,%ymm4
+ vpmuludq 64-128(%r9),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq 96-128(%r9),%ymm10,%ymm14
+ vpaddq %ymm14,%ymm6,%ymm6
+ vpmuludq 128-128(%r9),%ymm10,%ymm13
+ vpaddq %ymm13,%ymm7,%ymm7
+ vpmuludq 160-128(%r9),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq 192-128(%r9),%ymm10,%ymm14
+ vpaddq %ymm14,%ymm0,%ymm0
+ vpmuludq 224-128(%r9),%ymm10,%ymm1
+ vpbroadcastq 128-128(%r15),%ymm10
+ vpaddq 320-448(%r12),%ymm1,%ymm1
+
+ vmovdqu %ymm4,128-192(%rbx)
+ vmovdqu %ymm5,160-192(%rbx)
+
+ vpmuludq 96-128(%rsi),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm6,%ymm6
+ vpmuludq 96-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm7,%ymm7
+ vpmuludq 128-128(%r9),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm8,%ymm8
+ vpmuludq 160-128(%r9),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm0,%ymm0
+ vpmuludq 192-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm1,%ymm1
+ vpmuludq 224-128(%r9),%ymm11,%ymm2
+ vpbroadcastq 160-128(%r15),%ymm11
+ vpaddq 352-448(%r12),%ymm2,%ymm2
+
+ vmovdqu %ymm6,192-192(%rbx)
+ vmovdqu %ymm7,224-192(%rbx)
+
+ vpmuludq 128-128(%rsi),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq 128-128(%r9),%ymm10,%ymm14
+ vpaddq %ymm14,%ymm0,%ymm0
+ vpmuludq 160-128(%r9),%ymm10,%ymm13
+ vpaddq %ymm13,%ymm1,%ymm1
+ vpmuludq 192-128(%r9),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq 224-128(%r9),%ymm10,%ymm3
+ vpbroadcastq 192-128(%r15),%ymm10
+ vpaddq 384-448(%r12),%ymm3,%ymm3
+
+ vmovdqu %ymm8,256-192(%rbx)
+ vmovdqu %ymm0,288-192(%rbx)
+ leaq 8(%rbx),%rbx
+
+ vpmuludq 160-128(%rsi),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm1,%ymm1
+ vpmuludq 160-128(%r9),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq 192-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm3,%ymm3
+ vpmuludq 224-128(%r9),%ymm11,%ymm4
+ vpbroadcastq 224-128(%r15),%ymm11
+ vpaddq 416-448(%r12),%ymm4,%ymm4
+
+ vmovdqu %ymm1,320-448(%r12)
+ vmovdqu %ymm2,352-448(%r12)
+
+ vpmuludq 192-128(%rsi),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm3,%ymm3
+ vpmuludq 192-128(%r9),%ymm10,%ymm14
+ vpbroadcastq 256-128(%r15),%ymm0
+ vpaddq %ymm14,%ymm4,%ymm4
+ vpmuludq 224-128(%r9),%ymm10,%ymm5
+ vpbroadcastq 0+8-128(%r15),%ymm10
+ vpaddq 448-448(%r12),%ymm5,%ymm5
+
+ vmovdqu %ymm3,384-448(%r12)
+ vmovdqu %ymm4,416-448(%r12)
+ leaq 8(%r15),%r15
+
+ vpmuludq 224-128(%rsi),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq 224-128(%r9),%ymm11,%ymm6
+ vpaddq 480-448(%r12),%ymm6,%ymm6
+
+ vpmuludq 256-128(%rsi),%ymm0,%ymm7
+ vmovdqu %ymm5,448-448(%r12)
+ vpaddq 512-448(%r12),%ymm7,%ymm7
+ vmovdqu %ymm6,480-448(%r12)
+ vmovdqu %ymm7,512-448(%r12)
+ leaq 8(%r12),%r12
+
+ decl %r14d
+ jnz .LOOP_SQR_1024
+
+ vmovdqu 256(%rsp),%ymm8
+ vmovdqu 288(%rsp),%ymm1
+ vmovdqu 320(%rsp),%ymm2
+ leaq 192(%rsp),%rbx
+
+ vpsrlq $29,%ymm8,%ymm14
+ vpand %ymm15,%ymm8,%ymm8
+ vpsrlq $29,%ymm1,%ymm11
+ vpand %ymm15,%ymm1,%ymm1
+
+ vpermq $0x93,%ymm14,%ymm14
+ vpxor %ymm9,%ymm9,%ymm9
+ vpermq $0x93,%ymm11,%ymm11
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm8,%ymm8
+ vpblendd $3,%ymm11,%ymm9,%ymm11
+ vpaddq %ymm14,%ymm1,%ymm1
+ vpaddq %ymm11,%ymm2,%ymm2
+ vmovdqu %ymm1,288-192(%rbx)
+ vmovdqu %ymm2,320-192(%rbx)
+
+ movq (%rsp),%rax
+ movq 8(%rsp),%r10
+ movq 16(%rsp),%r11
+ movq 24(%rsp),%r12
+ vmovdqu 32(%rsp),%ymm1
+ vmovdqu 64-192(%rbx),%ymm2
+ vmovdqu 96-192(%rbx),%ymm3
+ vmovdqu 128-192(%rbx),%ymm4
+ vmovdqu 160-192(%rbx),%ymm5
+ vmovdqu 192-192(%rbx),%ymm6
+ vmovdqu 224-192(%rbx),%ymm7
+
+ movq %rax,%r9
+ imull %ecx,%eax
+ andl $0x1fffffff,%eax
+ vmovd %eax,%xmm12
+
+ movq %rax,%rdx
+ imulq -128(%r13),%rax
+ vpbroadcastq %xmm12,%ymm12
+ addq %rax,%r9
+ movq %rdx,%rax
+ imulq 8-128(%r13),%rax
+ shrq $29,%r9
+ addq %rax,%r10
+ movq %rdx,%rax
+ imulq 16-128(%r13),%rax
+ addq %r9,%r10
+ addq %rax,%r11
+ imulq 24-128(%r13),%rdx
+ addq %rdx,%r12
+
+ movq %r10,%rax
+ imull %ecx,%eax
+ andl $0x1fffffff,%eax
+
+ movl $9,%r14d
+ jmp .LOOP_REDUCE_1024
+
+.align 32
+.LOOP_REDUCE_1024:
+ vmovd %eax,%xmm13
+ vpbroadcastq %xmm13,%ymm13
+
+ vpmuludq 32-128(%r13),%ymm12,%ymm10
+ movq %rax,%rdx
+ imulq -128(%r13),%rax
+ vpaddq %ymm10,%ymm1,%ymm1
+ addq %rax,%r10
+ vpmuludq 64-128(%r13),%ymm12,%ymm14
+ movq %rdx,%rax
+ imulq 8-128(%r13),%rax
+ vpaddq %ymm14,%ymm2,%ymm2
+ vpmuludq 96-128(%r13),%ymm12,%ymm11
+.byte 0x67
+ addq %rax,%r11
+.byte 0x67
+ movq %rdx,%rax
+ imulq 16-128(%r13),%rax
+ shrq $29,%r10
+ vpaddq %ymm11,%ymm3,%ymm3
+ vpmuludq 128-128(%r13),%ymm12,%ymm10
+ addq %rax,%r12
+ addq %r10,%r11
+ vpaddq %ymm10,%ymm4,%ymm4
+ vpmuludq 160-128(%r13),%ymm12,%ymm14
+ movq %r11,%rax
+ imull %ecx,%eax
+ vpaddq %ymm14,%ymm5,%ymm5
+ vpmuludq 192-128(%r13),%ymm12,%ymm11
+ andl $0x1fffffff,%eax
+ vpaddq %ymm11,%ymm6,%ymm6
+ vpmuludq 224-128(%r13),%ymm12,%ymm10
+ vpaddq %ymm10,%ymm7,%ymm7
+ vpmuludq 256-128(%r13),%ymm12,%ymm14
+ vmovd %eax,%xmm12
+
+ vpaddq %ymm14,%ymm8,%ymm8
+
+ vpbroadcastq %xmm12,%ymm12
+
+ vpmuludq 32-8-128(%r13),%ymm13,%ymm11
+ vmovdqu 96-8-128(%r13),%ymm14
+ movq %rax,%rdx
+ imulq -128(%r13),%rax
+ vpaddq %ymm11,%ymm1,%ymm1
+ vpmuludq 64-8-128(%r13),%ymm13,%ymm10
+ vmovdqu 128-8-128(%r13),%ymm11
+ addq %rax,%r11
+ movq %rdx,%rax
+ imulq 8-128(%r13),%rax
+ vpaddq %ymm10,%ymm2,%ymm2
+ addq %r12,%rax
+ shrq $29,%r11
+ vpmuludq %ymm13,%ymm14,%ymm14
+ vmovdqu 160-8-128(%r13),%ymm10
+ addq %r11,%rax
+ vpaddq %ymm14,%ymm3,%ymm3
+ vpmuludq %ymm13,%ymm11,%ymm11
+ vmovdqu 192-8-128(%r13),%ymm14
+.byte 0x67
+ movq %rax,%r12
+ imull %ecx,%eax
+ vpaddq %ymm11,%ymm4,%ymm4
+ vpmuludq %ymm13,%ymm10,%ymm10
+.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00
+ andl $0x1fffffff,%eax
+ vpaddq %ymm10,%ymm5,%ymm5
+ vpmuludq %ymm13,%ymm14,%ymm14
+ vmovdqu 256-8-128(%r13),%ymm10
+ vpaddq %ymm14,%ymm6,%ymm6
+ vpmuludq %ymm13,%ymm11,%ymm11
+ vmovdqu 288-8-128(%r13),%ymm9
+ vmovd %eax,%xmm0
+ imulq -128(%r13),%rax
+ vpaddq %ymm11,%ymm7,%ymm7
+ vpmuludq %ymm13,%ymm10,%ymm10
+ vmovdqu 32-16-128(%r13),%ymm14
+ vpbroadcastq %xmm0,%ymm0
+ vpaddq %ymm10,%ymm8,%ymm8
+ vpmuludq %ymm13,%ymm9,%ymm9
+ vmovdqu 64-16-128(%r13),%ymm11
+ addq %rax,%r12
+
+ vmovdqu 32-24-128(%r13),%ymm13
+ vpmuludq %ymm12,%ymm14,%ymm14
+ vmovdqu 96-16-128(%r13),%ymm10
+ vpaddq %ymm14,%ymm1,%ymm1
+ vpmuludq %ymm0,%ymm13,%ymm13
+ vpmuludq %ymm12,%ymm11,%ymm11
+.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff
+ vpaddq %ymm1,%ymm13,%ymm13
+ vpaddq %ymm11,%ymm2,%ymm2
+ vpmuludq %ymm12,%ymm10,%ymm10
+ vmovdqu 160-16-128(%r13),%ymm11
+.byte 0x67
+ vmovq %xmm13,%rax
+ vmovdqu %ymm13,(%rsp)
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpmuludq %ymm12,%ymm14,%ymm14
+ vmovdqu 192-16-128(%r13),%ymm10
+ vpaddq %ymm14,%ymm4,%ymm4
+ vpmuludq %ymm12,%ymm11,%ymm11
+ vmovdqu 224-16-128(%r13),%ymm14
+ vpaddq %ymm11,%ymm5,%ymm5
+ vpmuludq %ymm12,%ymm10,%ymm10
+ vmovdqu 256-16-128(%r13),%ymm11
+ vpaddq %ymm10,%ymm6,%ymm6
+ vpmuludq %ymm12,%ymm14,%ymm14
+ shrq $29,%r12
+ vmovdqu 288-16-128(%r13),%ymm10
+ addq %r12,%rax
+ vpaddq %ymm14,%ymm7,%ymm7
+ vpmuludq %ymm12,%ymm11,%ymm11
+
+ movq %rax,%r9
+ imull %ecx,%eax
+ vpaddq %ymm11,%ymm8,%ymm8
+ vpmuludq %ymm12,%ymm10,%ymm10
+ andl $0x1fffffff,%eax
+ vmovd %eax,%xmm12
+ vmovdqu 96-24-128(%r13),%ymm11
+.byte 0x67
+ vpaddq %ymm10,%ymm9,%ymm9
+ vpbroadcastq %xmm12,%ymm12
+
+ vpmuludq 64-24-128(%r13),%ymm0,%ymm14
+ vmovdqu 128-24-128(%r13),%ymm10
+ movq %rax,%rdx
+ imulq -128(%r13),%rax
+ movq 8(%rsp),%r10
+ vpaddq %ymm14,%ymm2,%ymm1
+ vpmuludq %ymm0,%ymm11,%ymm11
+ vmovdqu 160-24-128(%r13),%ymm14
+ addq %rax,%r9
+ movq %rdx,%rax
+ imulq 8-128(%r13),%rax
+.byte 0x67
+ shrq $29,%r9
+ movq 16(%rsp),%r11
+ vpaddq %ymm11,%ymm3,%ymm2
+ vpmuludq %ymm0,%ymm10,%ymm10
+ vmovdqu 192-24-128(%r13),%ymm11
+ addq %rax,%r10
+ movq %rdx,%rax
+ imulq 16-128(%r13),%rax
+ vpaddq %ymm10,%ymm4,%ymm3
+ vpmuludq %ymm0,%ymm14,%ymm14
+ vmovdqu 224-24-128(%r13),%ymm10
+ imulq 24-128(%r13),%rdx
+ addq %rax,%r11
+ leaq (%r9,%r10,1),%rax
+ vpaddq %ymm14,%ymm5,%ymm4
+ vpmuludq %ymm0,%ymm11,%ymm11
+ vmovdqu 256-24-128(%r13),%ymm14
+ movq %rax,%r10
+ imull %ecx,%eax
+ vpmuludq %ymm0,%ymm10,%ymm10
+ vpaddq %ymm11,%ymm6,%ymm5
+ vmovdqu 288-24-128(%r13),%ymm11
+ andl $0x1fffffff,%eax
+ vpaddq %ymm10,%ymm7,%ymm6
+ vpmuludq %ymm0,%ymm14,%ymm14
+ addq 24(%rsp),%rdx
+ vpaddq %ymm14,%ymm8,%ymm7
+ vpmuludq %ymm0,%ymm11,%ymm11
+ vpaddq %ymm11,%ymm9,%ymm8
+ vmovq %r12,%xmm9
+ movq %rdx,%r12
+
+ decl %r14d
+ jnz .LOOP_REDUCE_1024
+ leaq 448(%rsp),%r12
+ vpaddq %ymm9,%ymm13,%ymm0
+ vpxor %ymm9,%ymm9,%ymm9
+
+ vpaddq 288-192(%rbx),%ymm0,%ymm0
+ vpaddq 320-448(%r12),%ymm1,%ymm1
+ vpaddq 352-448(%r12),%ymm2,%ymm2
+ vpaddq 384-448(%r12),%ymm3,%ymm3
+ vpaddq 416-448(%r12),%ymm4,%ymm4
+ vpaddq 448-448(%r12),%ymm5,%ymm5
+ vpaddq 480-448(%r12),%ymm6,%ymm6
+ vpaddq 512-448(%r12),%ymm7,%ymm7
+ vpaddq 544-448(%r12),%ymm8,%ymm8
+
+ vpsrlq $29,%ymm0,%ymm14
+ vpand %ymm15,%ymm0,%ymm0
+ vpsrlq $29,%ymm1,%ymm11
+ vpand %ymm15,%ymm1,%ymm1
+ vpsrlq $29,%ymm2,%ymm12
+ vpermq $0x93,%ymm14,%ymm14
+ vpand %ymm15,%ymm2,%ymm2
+ vpsrlq $29,%ymm3,%ymm13
+ vpermq $0x93,%ymm11,%ymm11
+ vpand %ymm15,%ymm3,%ymm3
+ vpermq $0x93,%ymm12,%ymm12
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpermq $0x93,%ymm13,%ymm13
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm0,%ymm0
+ vpblendd $3,%ymm11,%ymm12,%ymm11
+ vpaddq %ymm14,%ymm1,%ymm1
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm11,%ymm2,%ymm2
+ vpblendd $3,%ymm13,%ymm9,%ymm13
+ vpaddq %ymm12,%ymm3,%ymm3
+ vpaddq %ymm13,%ymm4,%ymm4
+
+ vpsrlq $29,%ymm0,%ymm14
+ vpand %ymm15,%ymm0,%ymm0
+ vpsrlq $29,%ymm1,%ymm11
+ vpand %ymm15,%ymm1,%ymm1
+ vpsrlq $29,%ymm2,%ymm12
+ vpermq $0x93,%ymm14,%ymm14
+ vpand %ymm15,%ymm2,%ymm2
+ vpsrlq $29,%ymm3,%ymm13
+ vpermq $0x93,%ymm11,%ymm11
+ vpand %ymm15,%ymm3,%ymm3
+ vpermq $0x93,%ymm12,%ymm12
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpermq $0x93,%ymm13,%ymm13
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm0,%ymm0
+ vpblendd $3,%ymm11,%ymm12,%ymm11
+ vpaddq %ymm14,%ymm1,%ymm1
+ vmovdqu %ymm0,0-128(%rdi)
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm11,%ymm2,%ymm2
+ vmovdqu %ymm1,32-128(%rdi)
+ vpblendd $3,%ymm13,%ymm9,%ymm13
+ vpaddq %ymm12,%ymm3,%ymm3
+ vmovdqu %ymm2,64-128(%rdi)
+ vpaddq %ymm13,%ymm4,%ymm4
+ vmovdqu %ymm3,96-128(%rdi)
+ vpsrlq $29,%ymm4,%ymm14
+ vpand %ymm15,%ymm4,%ymm4
+ vpsrlq $29,%ymm5,%ymm11
+ vpand %ymm15,%ymm5,%ymm5
+ vpsrlq $29,%ymm6,%ymm12
+ vpermq $0x93,%ymm14,%ymm14
+ vpand %ymm15,%ymm6,%ymm6
+ vpsrlq $29,%ymm7,%ymm13
+ vpermq $0x93,%ymm11,%ymm11
+ vpand %ymm15,%ymm7,%ymm7
+ vpsrlq $29,%ymm8,%ymm0
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm8,%ymm8
+ vpermq $0x93,%ymm13,%ymm13
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpermq $0x93,%ymm0,%ymm0
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm4,%ymm4
+ vpblendd $3,%ymm11,%ymm12,%ymm11
+ vpaddq %ymm14,%ymm5,%ymm5
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm11,%ymm6,%ymm6
+ vpblendd $3,%ymm13,%ymm0,%ymm13
+ vpaddq %ymm12,%ymm7,%ymm7
+ vpaddq %ymm13,%ymm8,%ymm8
+
+ vpsrlq $29,%ymm4,%ymm14
+ vpand %ymm15,%ymm4,%ymm4
+ vpsrlq $29,%ymm5,%ymm11
+ vpand %ymm15,%ymm5,%ymm5
+ vpsrlq $29,%ymm6,%ymm12
+ vpermq $0x93,%ymm14,%ymm14
+ vpand %ymm15,%ymm6,%ymm6
+ vpsrlq $29,%ymm7,%ymm13
+ vpermq $0x93,%ymm11,%ymm11
+ vpand %ymm15,%ymm7,%ymm7
+ vpsrlq $29,%ymm8,%ymm0
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm8,%ymm8
+ vpermq $0x93,%ymm13,%ymm13
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpermq $0x93,%ymm0,%ymm0
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm4,%ymm4
+ vpblendd $3,%ymm11,%ymm12,%ymm11
+ vpaddq %ymm14,%ymm5,%ymm5
+ vmovdqu %ymm4,128-128(%rdi)
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm11,%ymm6,%ymm6
+ vmovdqu %ymm5,160-128(%rdi)
+ vpblendd $3,%ymm13,%ymm0,%ymm13
+ vpaddq %ymm12,%ymm7,%ymm7
+ vmovdqu %ymm6,192-128(%rdi)
+ vpaddq %ymm13,%ymm8,%ymm8
+ vmovdqu %ymm7,224-128(%rdi)
+ vmovdqu %ymm8,256-128(%rdi)
+
+ movq %rdi,%rsi
+ decl %r8d
+ jne .LOOP_GRANDE_SQR_1024
+
+ vzeroall
+ movq %rbp,%rax
+.cfi_def_cfa_register %rax
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lsqr_1024_epilogue:
+ ret
+.cfi_endproc
+.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
+.globl rsaz_1024_mul_avx2
+.hidden rsaz_1024_mul_avx2
+.type rsaz_1024_mul_avx2,@function
+.align 64
+rsaz_1024_mul_avx2:
+.cfi_startproc
+_CET_ENDBR
+ leaq (%rsp),%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ movq %rax,%rbp
+.cfi_def_cfa_register %rbp
+ vzeroall
+ movq %rdx,%r13
+ subq $64,%rsp
+
+
+
+
+
+
+.byte 0x67,0x67
+ movq %rsi,%r15
+ andq $4095,%r15
+ addq $320,%r15
+ shrq $12,%r15
+ movq %rsi,%r15
+ cmovnzq %r13,%rsi
+ cmovnzq %r15,%r13
+
+ movq %rcx,%r15
+ subq $-128,%rsi
+ subq $-128,%rcx
+ subq $-128,%rdi
+
+ andq $4095,%r15
+ addq $320,%r15
+.byte 0x67,0x67
+ shrq $12,%r15
+ jz .Lmul_1024_no_n_copy
+
+
+
+
+
+ subq $320,%rsp
+ vmovdqu 0-128(%rcx),%ymm0
+ andq $-512,%rsp
+ vmovdqu 32-128(%rcx),%ymm1
+ vmovdqu 64-128(%rcx),%ymm2
+ vmovdqu 96-128(%rcx),%ymm3
+ vmovdqu 128-128(%rcx),%ymm4
+ vmovdqu 160-128(%rcx),%ymm5
+ vmovdqu 192-128(%rcx),%ymm6
+ vmovdqu 224-128(%rcx),%ymm7
+ vmovdqu 256-128(%rcx),%ymm8
+ leaq 64+128(%rsp),%rcx
+ vmovdqu %ymm0,0-128(%rcx)
+ vpxor %ymm0,%ymm0,%ymm0
+ vmovdqu %ymm1,32-128(%rcx)
+ vpxor %ymm1,%ymm1,%ymm1
+ vmovdqu %ymm2,64-128(%rcx)
+ vpxor %ymm2,%ymm2,%ymm2
+ vmovdqu %ymm3,96-128(%rcx)
+ vpxor %ymm3,%ymm3,%ymm3
+ vmovdqu %ymm4,128-128(%rcx)
+ vpxor %ymm4,%ymm4,%ymm4
+ vmovdqu %ymm5,160-128(%rcx)
+ vpxor %ymm5,%ymm5,%ymm5
+ vmovdqu %ymm6,192-128(%rcx)
+ vpxor %ymm6,%ymm6,%ymm6
+ vmovdqu %ymm7,224-128(%rcx)
+ vpxor %ymm7,%ymm7,%ymm7
+ vmovdqu %ymm8,256-128(%rcx)
+ vmovdqa %ymm0,%ymm8
+ vmovdqu %ymm9,288-128(%rcx)
+.Lmul_1024_no_n_copy:
+ andq $-64,%rsp
+
+ movq (%r13),%rbx
+ vpbroadcastq (%r13),%ymm10
+ vmovdqu %ymm0,(%rsp)
+ xorq %r9,%r9
+.byte 0x67
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %r12,%r12
+
+ vmovdqu .Land_mask(%rip),%ymm15
+ movl $9,%r14d
+ vmovdqu %ymm9,288-128(%rdi)
+ jmp .Loop_mul_1024
+
+.align 32
+.Loop_mul_1024:
+ vpsrlq $29,%ymm3,%ymm9
+ movq %rbx,%rax
+ imulq -128(%rsi),%rax
+ addq %r9,%rax
+ movq %rbx,%r10
+ imulq 8-128(%rsi),%r10
+ addq 8(%rsp),%r10
+
+ movq %rax,%r9
+ imull %r8d,%eax
+ andl $0x1fffffff,%eax
+
+ movq %rbx,%r11
+ imulq 16-128(%rsi),%r11
+ addq 16(%rsp),%r11
+
+ movq %rbx,%r12
+ imulq 24-128(%rsi),%r12
+ addq 24(%rsp),%r12
+ vpmuludq 32-128(%rsi),%ymm10,%ymm0
+ vmovd %eax,%xmm11
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq 64-128(%rsi),%ymm10,%ymm12
+ vpbroadcastq %xmm11,%ymm11
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq 96-128(%rsi),%ymm10,%ymm13
+ vpand %ymm15,%ymm3,%ymm3
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq 128-128(%rsi),%ymm10,%ymm0
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq 160-128(%rsi),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq 192-128(%rsi),%ymm10,%ymm13
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq 224-128(%rsi),%ymm10,%ymm0
+ vpermq $0x93,%ymm9,%ymm9
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq 256-128(%rsi),%ymm10,%ymm12
+ vpbroadcastq 8(%r13),%ymm10
+ vpaddq %ymm12,%ymm8,%ymm8
+
+ movq %rax,%rdx
+ imulq -128(%rcx),%rax
+ addq %rax,%r9
+ movq %rdx,%rax
+ imulq 8-128(%rcx),%rax
+ addq %rax,%r10
+ movq %rdx,%rax
+ imulq 16-128(%rcx),%rax
+ addq %rax,%r11
+ shrq $29,%r9
+ imulq 24-128(%rcx),%rdx
+ addq %rdx,%r12
+ addq %r9,%r10
+
+ vpmuludq 32-128(%rcx),%ymm11,%ymm13
+ vmovq %xmm10,%rbx
+ vpaddq %ymm13,%ymm1,%ymm1
+ vpmuludq 64-128(%rcx),%ymm11,%ymm0
+ vpaddq %ymm0,%ymm2,%ymm2
+ vpmuludq 96-128(%rcx),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm3,%ymm3
+ vpmuludq 128-128(%rcx),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm4,%ymm4
+ vpmuludq 160-128(%rcx),%ymm11,%ymm0
+ vpaddq %ymm0,%ymm5,%ymm5
+ vpmuludq 192-128(%rcx),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm6,%ymm6
+ vpmuludq 224-128(%rcx),%ymm11,%ymm13
+ vpblendd $3,%ymm14,%ymm9,%ymm12
+ vpaddq %ymm13,%ymm7,%ymm7
+ vpmuludq 256-128(%rcx),%ymm11,%ymm0
+ vpaddq %ymm12,%ymm3,%ymm3
+ vpaddq %ymm0,%ymm8,%ymm8
+
+ movq %rbx,%rax
+ imulq -128(%rsi),%rax
+ addq %rax,%r10
+ vmovdqu -8+32-128(%rsi),%ymm12
+ movq %rbx,%rax
+ imulq 8-128(%rsi),%rax
+ addq %rax,%r11
+ vmovdqu -8+64-128(%rsi),%ymm13
+
+ movq %r10,%rax
+ vpblendd $0xfc,%ymm14,%ymm9,%ymm9
+ imull %r8d,%eax
+ vpaddq %ymm9,%ymm4,%ymm4
+ andl $0x1fffffff,%eax
+
+ imulq 16-128(%rsi),%rbx
+ addq %rbx,%r12
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovd %eax,%xmm11
+ vmovdqu -8+96-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm1,%ymm1
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vpbroadcastq %xmm11,%ymm11
+ vmovdqu -8+128-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm2,%ymm2
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -8+160-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm3,%ymm3
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovdqu -8+192-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm4,%ymm4
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -8+224-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm5,%ymm5
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -8+256-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm6,%ymm6
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovdqu -8+288-128(%rsi),%ymm9
+ vpaddq %ymm12,%ymm7,%ymm7
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vpaddq %ymm13,%ymm8,%ymm8
+ vpmuludq %ymm10,%ymm9,%ymm9
+ vpbroadcastq 16(%r13),%ymm10
+
+ movq %rax,%rdx
+ imulq -128(%rcx),%rax
+ addq %rax,%r10
+ vmovdqu -8+32-128(%rcx),%ymm0
+ movq %rdx,%rax
+ imulq 8-128(%rcx),%rax
+ addq %rax,%r11
+ vmovdqu -8+64-128(%rcx),%ymm12
+ shrq $29,%r10
+ imulq 16-128(%rcx),%rdx
+ addq %rdx,%r12
+ addq %r10,%r11
+
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovq %xmm10,%rbx
+ vmovdqu -8+96-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -8+128-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -8+160-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -8+192-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -8+224-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -8+256-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -8+288-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vpaddq %ymm13,%ymm9,%ymm9
+
+ vmovdqu -16+32-128(%rsi),%ymm0
+ movq %rbx,%rax
+ imulq -128(%rsi),%rax
+ addq %r11,%rax
+
+ vmovdqu -16+64-128(%rsi),%ymm12
+ movq %rax,%r11
+ imull %r8d,%eax
+ andl $0x1fffffff,%eax
+
+ imulq 8-128(%rsi),%rbx
+ addq %rbx,%r12
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovd %eax,%xmm11
+ vmovdqu -16+96-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vpbroadcastq %xmm11,%ymm11
+ vmovdqu -16+128-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -16+160-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -16+192-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovdqu -16+224-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -16+256-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -16+288-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vpbroadcastq 24(%r13),%ymm10
+ vpaddq %ymm13,%ymm9,%ymm9
+
+ vmovdqu -16+32-128(%rcx),%ymm0
+ movq %rax,%rdx
+ imulq -128(%rcx),%rax
+ addq %rax,%r11
+ vmovdqu -16+64-128(%rcx),%ymm12
+ imulq 8-128(%rcx),%rdx
+ addq %rdx,%r12
+ shrq $29,%r11
+
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovq %xmm10,%rbx
+ vmovdqu -16+96-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -16+128-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -16+160-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -16+192-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -16+224-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -16+256-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -16+288-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -24+32-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -24+64-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm9,%ymm9
+
+ addq %r11,%r12
+ imulq -128(%rsi),%rbx
+ addq %rbx,%r12
+
+ movq %r12,%rax
+ imull %r8d,%eax
+ andl $0x1fffffff,%eax
+
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovd %eax,%xmm11
+ vmovdqu -24+96-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vpbroadcastq %xmm11,%ymm11
+ vmovdqu -24+128-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -24+160-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -24+192-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovdqu -24+224-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -24+256-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -24+288-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vpbroadcastq 32(%r13),%ymm10
+ vpaddq %ymm13,%ymm9,%ymm9
+ addq $32,%r13
+
+ vmovdqu -24+32-128(%rcx),%ymm0
+ imulq -128(%rcx),%rax
+ addq %rax,%r12
+ shrq $29,%r12
+
+ vmovdqu -24+64-128(%rcx),%ymm12
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovq %xmm10,%rbx
+ vmovdqu -24+96-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm0
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu %ymm0,(%rsp)
+ vpaddq %ymm12,%ymm2,%ymm1
+ vmovdqu -24+128-128(%rcx),%ymm0
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -24+160-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm2
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -24+192-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm3
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -24+224-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm4
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -24+256-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm5
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -24+288-128(%rcx),%ymm13
+ movq %r12,%r9
+ vpaddq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm11,%ymm12,%ymm12
+ addq (%rsp),%r9
+ vpaddq %ymm12,%ymm8,%ymm7
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovq %r12,%xmm12
+ vpaddq %ymm13,%ymm9,%ymm8
+
+ decl %r14d
+ jnz .Loop_mul_1024
+ vpaddq (%rsp),%ymm12,%ymm0
+
+ vpsrlq $29,%ymm0,%ymm12
+ vpand %ymm15,%ymm0,%ymm0
+ vpsrlq $29,%ymm1,%ymm13
+ vpand %ymm15,%ymm1,%ymm1
+ vpsrlq $29,%ymm2,%ymm10
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm2,%ymm2
+ vpsrlq $29,%ymm3,%ymm11
+ vpermq $0x93,%ymm13,%ymm13
+ vpand %ymm15,%ymm3,%ymm3
+
+ vpblendd $3,%ymm14,%ymm12,%ymm9
+ vpermq $0x93,%ymm10,%ymm10
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpermq $0x93,%ymm11,%ymm11
+ vpaddq %ymm9,%ymm0,%ymm0
+ vpblendd $3,%ymm13,%ymm10,%ymm13
+ vpaddq %ymm12,%ymm1,%ymm1
+ vpblendd $3,%ymm10,%ymm11,%ymm10
+ vpaddq %ymm13,%ymm2,%ymm2
+ vpblendd $3,%ymm11,%ymm14,%ymm11
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm11,%ymm4,%ymm4
+
+ vpsrlq $29,%ymm0,%ymm12
+ vpand %ymm15,%ymm0,%ymm0
+ vpsrlq $29,%ymm1,%ymm13
+ vpand %ymm15,%ymm1,%ymm1
+ vpsrlq $29,%ymm2,%ymm10
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm2,%ymm2
+ vpsrlq $29,%ymm3,%ymm11
+ vpermq $0x93,%ymm13,%ymm13
+ vpand %ymm15,%ymm3,%ymm3
+ vpermq $0x93,%ymm10,%ymm10
+
+ vpblendd $3,%ymm14,%ymm12,%ymm9
+ vpermq $0x93,%ymm11,%ymm11
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm9,%ymm0,%ymm0
+ vpblendd $3,%ymm13,%ymm10,%ymm13
+ vpaddq %ymm12,%ymm1,%ymm1
+ vpblendd $3,%ymm10,%ymm11,%ymm10
+ vpaddq %ymm13,%ymm2,%ymm2
+ vpblendd $3,%ymm11,%ymm14,%ymm11
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm11,%ymm4,%ymm4
+
+ vmovdqu %ymm0,0-128(%rdi)
+ vmovdqu %ymm1,32-128(%rdi)
+ vmovdqu %ymm2,64-128(%rdi)
+ vmovdqu %ymm3,96-128(%rdi)
+ vpsrlq $29,%ymm4,%ymm12
+ vpand %ymm15,%ymm4,%ymm4
+ vpsrlq $29,%ymm5,%ymm13
+ vpand %ymm15,%ymm5,%ymm5
+ vpsrlq $29,%ymm6,%ymm10
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm6,%ymm6
+ vpsrlq $29,%ymm7,%ymm11
+ vpermq $0x93,%ymm13,%ymm13
+ vpand %ymm15,%ymm7,%ymm7
+ vpsrlq $29,%ymm8,%ymm0
+ vpermq $0x93,%ymm10,%ymm10
+ vpand %ymm15,%ymm8,%ymm8
+ vpermq $0x93,%ymm11,%ymm11
+
+ vpblendd $3,%ymm14,%ymm12,%ymm9
+ vpermq $0x93,%ymm0,%ymm0
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm9,%ymm4,%ymm4
+ vpblendd $3,%ymm13,%ymm10,%ymm13
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpblendd $3,%ymm10,%ymm11,%ymm10
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpblendd $3,%ymm11,%ymm0,%ymm11
+ vpaddq %ymm10,%ymm7,%ymm7
+ vpaddq %ymm11,%ymm8,%ymm8
+
+ vpsrlq $29,%ymm4,%ymm12
+ vpand %ymm15,%ymm4,%ymm4
+ vpsrlq $29,%ymm5,%ymm13
+ vpand %ymm15,%ymm5,%ymm5
+ vpsrlq $29,%ymm6,%ymm10
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm6,%ymm6
+ vpsrlq $29,%ymm7,%ymm11
+ vpermq $0x93,%ymm13,%ymm13
+ vpand %ymm15,%ymm7,%ymm7
+ vpsrlq $29,%ymm8,%ymm0
+ vpermq $0x93,%ymm10,%ymm10
+ vpand %ymm15,%ymm8,%ymm8
+ vpermq $0x93,%ymm11,%ymm11
+
+ vpblendd $3,%ymm14,%ymm12,%ymm9
+ vpermq $0x93,%ymm0,%ymm0
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm9,%ymm4,%ymm4
+ vpblendd $3,%ymm13,%ymm10,%ymm13
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpblendd $3,%ymm10,%ymm11,%ymm10
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpblendd $3,%ymm11,%ymm0,%ymm11
+ vpaddq %ymm10,%ymm7,%ymm7
+ vpaddq %ymm11,%ymm8,%ymm8
+
+ vmovdqu %ymm4,128-128(%rdi)
+ vmovdqu %ymm5,160-128(%rdi)
+ vmovdqu %ymm6,192-128(%rdi)
+ vmovdqu %ymm7,224-128(%rdi)
+ vmovdqu %ymm8,256-128(%rdi)
+ vzeroupper
+
+ movq %rbp,%rax
+.cfi_def_cfa_register %rax
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lmul_1024_epilogue:
+ ret
+.cfi_endproc
+.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
+.globl rsaz_1024_red2norm_avx2
+.hidden rsaz_1024_red2norm_avx2
+.type rsaz_1024_red2norm_avx2,@function
+.align 32
+rsaz_1024_red2norm_avx2:
+.cfi_startproc
+_CET_ENDBR
+ subq $-128,%rsi
+ xorq %rax,%rax
+ movq -128(%rsi),%r8
+ movq -120(%rsi),%r9
+ movq -112(%rsi),%r10
+ shlq $0,%r8
+ shlq $29,%r9
+ movq %r10,%r11
+ shlq $58,%r10
+ shrq $6,%r11
+ addq %r8,%rax
+ addq %r9,%rax
+ addq %r10,%rax
+ adcq $0,%r11
+ movq %rax,0(%rdi)
+ movq %r11,%rax
+ movq -104(%rsi),%r8
+ movq -96(%rsi),%r9
+ shlq $23,%r8
+ movq %r9,%r10
+ shlq $52,%r9
+ shrq $12,%r10
+ addq %r8,%rax
+ addq %r9,%rax
+ adcq $0,%r10
+ movq %rax,8(%rdi)
+ movq %r10,%rax
+ movq -88(%rsi),%r11
+ movq -80(%rsi),%r8
+ shlq $17,%r11
+ movq %r8,%r9
+ shlq $46,%r8
+ shrq $18,%r9
+ addq %r11,%rax
+ addq %r8,%rax
+ adcq $0,%r9
+ movq %rax,16(%rdi)
+ movq %r9,%rax
+ movq -72(%rsi),%r10
+ movq -64(%rsi),%r11
+ shlq $11,%r10
+ movq %r11,%r8
+ shlq $40,%r11
+ shrq $24,%r8
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,24(%rdi)
+ movq %r8,%rax
+ movq -56(%rsi),%r9
+ movq -48(%rsi),%r10
+ movq -40(%rsi),%r11
+ shlq $5,%r9
+ shlq $34,%r10
+ movq %r11,%r8
+ shlq $63,%r11
+ shrq $1,%r8
+ addq %r9,%rax
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,32(%rdi)
+ movq %r8,%rax
+ movq -32(%rsi),%r9
+ movq -24(%rsi),%r10
+ shlq $28,%r9
+ movq %r10,%r11
+ shlq $57,%r10
+ shrq $7,%r11
+ addq %r9,%rax
+ addq %r10,%rax
+ adcq $0,%r11
+ movq %rax,40(%rdi)
+ movq %r11,%rax
+ movq -16(%rsi),%r8
+ movq -8(%rsi),%r9
+ shlq $22,%r8
+ movq %r9,%r10
+ shlq $51,%r9
+ shrq $13,%r10
+ addq %r8,%rax
+ addq %r9,%rax
+ adcq $0,%r10
+ movq %rax,48(%rdi)
+ movq %r10,%rax
+ movq 0(%rsi),%r11
+ movq 8(%rsi),%r8
+ shlq $16,%r11
+ movq %r8,%r9
+ shlq $45,%r8
+ shrq $19,%r9
+ addq %r11,%rax
+ addq %r8,%rax
+ adcq $0,%r9
+ movq %rax,56(%rdi)
+ movq %r9,%rax
+ movq 16(%rsi),%r10
+ movq 24(%rsi),%r11
+ shlq $10,%r10
+ movq %r11,%r8
+ shlq $39,%r11
+ shrq $25,%r8
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,64(%rdi)
+ movq %r8,%rax
+ movq 32(%rsi),%r9
+ movq 40(%rsi),%r10
+ movq 48(%rsi),%r11
+ shlq $4,%r9
+ shlq $33,%r10
+ movq %r11,%r8
+ shlq $62,%r11
+ shrq $2,%r8
+ addq %r9,%rax
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,72(%rdi)
+ movq %r8,%rax
+ movq 56(%rsi),%r9
+ movq 64(%rsi),%r10
+ shlq $27,%r9
+ movq %r10,%r11
+ shlq $56,%r10
+ shrq $8,%r11
+ addq %r9,%rax
+ addq %r10,%rax
+ adcq $0,%r11
+ movq %rax,80(%rdi)
+ movq %r11,%rax
+ movq 72(%rsi),%r8
+ movq 80(%rsi),%r9
+ shlq $21,%r8
+ movq %r9,%r10
+ shlq $50,%r9
+ shrq $14,%r10
+ addq %r8,%rax
+ addq %r9,%rax
+ adcq $0,%r10
+ movq %rax,88(%rdi)
+ movq %r10,%rax
+ movq 88(%rsi),%r11
+ movq 96(%rsi),%r8
+ shlq $15,%r11
+ movq %r8,%r9
+ shlq $44,%r8
+ shrq $20,%r9
+ addq %r11,%rax
+ addq %r8,%rax
+ adcq $0,%r9
+ movq %rax,96(%rdi)
+ movq %r9,%rax
+ movq 104(%rsi),%r10
+ movq 112(%rsi),%r11
+ shlq $9,%r10
+ movq %r11,%r8
+ shlq $38,%r11
+ shrq $26,%r8
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,104(%rdi)
+ movq %r8,%rax
+ movq 120(%rsi),%r9
+ movq 128(%rsi),%r10
+ movq 136(%rsi),%r11
+ shlq $3,%r9
+ shlq $32,%r10
+ movq %r11,%r8
+ shlq $61,%r11
+ shrq $3,%r8
+ addq %r9,%rax
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,112(%rdi)
+ movq %r8,%rax
+ movq 144(%rsi),%r9
+ movq 152(%rsi),%r10
+ shlq $26,%r9
+ movq %r10,%r11
+ shlq $55,%r10
+ shrq $9,%r11
+ addq %r9,%rax
+ addq %r10,%rax
+ adcq $0,%r11
+ movq %rax,120(%rdi)
+ movq %r11,%rax
+ ret
+.cfi_endproc
+.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
+
+.globl rsaz_1024_norm2red_avx2
+.hidden rsaz_1024_norm2red_avx2
+.type rsaz_1024_norm2red_avx2,@function
+.align 32
+rsaz_1024_norm2red_avx2:
+.cfi_startproc
+_CET_ENDBR
+ subq $-128,%rdi
+ movq (%rsi),%r8
+ movl $0x1fffffff,%eax
+ movq 8(%rsi),%r9
+ movq %r8,%r11
+ shrq $0,%r11
+ andq %rax,%r11
+ movq %r11,-128(%rdi)
+ movq %r8,%r10
+ shrq $29,%r10
+ andq %rax,%r10
+ movq %r10,-120(%rdi)
+ shrdq $58,%r9,%r8
+ andq %rax,%r8
+ movq %r8,-112(%rdi)
+ movq 16(%rsi),%r10
+ movq %r9,%r8
+ shrq $23,%r8
+ andq %rax,%r8
+ movq %r8,-104(%rdi)
+ shrdq $52,%r10,%r9
+ andq %rax,%r9
+ movq %r9,-96(%rdi)
+ movq 24(%rsi),%r11
+ movq %r10,%r9
+ shrq $17,%r9
+ andq %rax,%r9
+ movq %r9,-88(%rdi)
+ shrdq $46,%r11,%r10
+ andq %rax,%r10
+ movq %r10,-80(%rdi)
+ movq 32(%rsi),%r8
+ movq %r11,%r10
+ shrq $11,%r10
+ andq %rax,%r10
+ movq %r10,-72(%rdi)
+ shrdq $40,%r8,%r11
+ andq %rax,%r11
+ movq %r11,-64(%rdi)
+ movq 40(%rsi),%r9
+ movq %r8,%r11
+ shrq $5,%r11
+ andq %rax,%r11
+ movq %r11,-56(%rdi)
+ movq %r8,%r10
+ shrq $34,%r10
+ andq %rax,%r10
+ movq %r10,-48(%rdi)
+ shrdq $63,%r9,%r8
+ andq %rax,%r8
+ movq %r8,-40(%rdi)
+ movq 48(%rsi),%r10
+ movq %r9,%r8
+ shrq $28,%r8
+ andq %rax,%r8
+ movq %r8,-32(%rdi)
+ shrdq $57,%r10,%r9
+ andq %rax,%r9
+ movq %r9,-24(%rdi)
+ movq 56(%rsi),%r11
+ movq %r10,%r9
+ shrq $22,%r9
+ andq %rax,%r9
+ movq %r9,-16(%rdi)
+ shrdq $51,%r11,%r10
+ andq %rax,%r10
+ movq %r10,-8(%rdi)
+ movq 64(%rsi),%r8
+ movq %r11,%r10
+ shrq $16,%r10
+ andq %rax,%r10
+ movq %r10,0(%rdi)
+ shrdq $45,%r8,%r11
+ andq %rax,%r11
+ movq %r11,8(%rdi)
+ movq 72(%rsi),%r9
+ movq %r8,%r11
+ shrq $10,%r11
+ andq %rax,%r11
+ movq %r11,16(%rdi)
+ shrdq $39,%r9,%r8
+ andq %rax,%r8
+ movq %r8,24(%rdi)
+ movq 80(%rsi),%r10
+ movq %r9,%r8
+ shrq $4,%r8
+ andq %rax,%r8
+ movq %r8,32(%rdi)
+ movq %r9,%r11
+ shrq $33,%r11
+ andq %rax,%r11
+ movq %r11,40(%rdi)
+ shrdq $62,%r10,%r9
+ andq %rax,%r9
+ movq %r9,48(%rdi)
+ movq 88(%rsi),%r11
+ movq %r10,%r9
+ shrq $27,%r9
+ andq %rax,%r9
+ movq %r9,56(%rdi)
+ shrdq $56,%r11,%r10
+ andq %rax,%r10
+ movq %r10,64(%rdi)
+ movq 96(%rsi),%r8
+ movq %r11,%r10
+ shrq $21,%r10
+ andq %rax,%r10
+ movq %r10,72(%rdi)
+ shrdq $50,%r8,%r11
+ andq %rax,%r11
+ movq %r11,80(%rdi)
+ movq 104(%rsi),%r9
+ movq %r8,%r11
+ shrq $15,%r11
+ andq %rax,%r11
+ movq %r11,88(%rdi)
+ shrdq $44,%r9,%r8
+ andq %rax,%r8
+ movq %r8,96(%rdi)
+ movq 112(%rsi),%r10
+ movq %r9,%r8
+ shrq $9,%r8
+ andq %rax,%r8
+ movq %r8,104(%rdi)
+ shrdq $38,%r10,%r9
+ andq %rax,%r9
+ movq %r9,112(%rdi)
+ movq 120(%rsi),%r11
+ movq %r10,%r9
+ shrq $3,%r9
+ andq %rax,%r9
+ movq %r9,120(%rdi)
+ movq %r10,%r8
+ shrq $32,%r8
+ andq %rax,%r8
+ movq %r8,128(%rdi)
+ shrdq $61,%r11,%r10
+ andq %rax,%r10
+ movq %r10,136(%rdi)
+ xorq %r8,%r8
+ movq %r11,%r10
+ shrq $26,%r10
+ andq %rax,%r10
+ movq %r10,144(%rdi)
+ shrdq $55,%r8,%r11
+ andq %rax,%r11
+ movq %r11,152(%rdi)
+ movq %r8,160(%rdi)
+ movq %r8,168(%rdi)
+ movq %r8,176(%rdi)
+ movq %r8,184(%rdi)
+ ret
+.cfi_endproc
+.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
+.globl rsaz_1024_scatter5_avx2
+.hidden rsaz_1024_scatter5_avx2
+.type rsaz_1024_scatter5_avx2,@function
+.align 32
+rsaz_1024_scatter5_avx2:
+.cfi_startproc
+_CET_ENDBR
+ vzeroupper
+ vmovdqu .Lscatter_permd(%rip),%ymm5
+ shll $4,%edx
+ leaq (%rdi,%rdx,1),%rdi
+ movl $9,%eax
+ jmp .Loop_scatter_1024
+
+.align 32
+.Loop_scatter_1024:
+ vmovdqu (%rsi),%ymm0
+ leaq 32(%rsi),%rsi
+ vpermd %ymm0,%ymm5,%ymm0
+ vmovdqu %xmm0,(%rdi)
+ leaq 512(%rdi),%rdi
+ decl %eax
+ jnz .Loop_scatter_1024
+
+ vzeroupper
+ ret
+.cfi_endproc
+.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
+
+.globl rsaz_1024_gather5_avx2
+.hidden rsaz_1024_gather5_avx2
+.type rsaz_1024_gather5_avx2,@function
+.align 32
+rsaz_1024_gather5_avx2:
+.cfi_startproc
+_CET_ENDBR
+ vzeroupper
+ movq %rsp,%r11
+.cfi_def_cfa_register %r11
+ leaq -256(%rsp),%rsp
+ andq $-32,%rsp
+ leaq .Linc(%rip),%r10
+ leaq -128(%rsp),%rax
+
+ vmovd %edx,%xmm4
+ vmovdqa (%r10),%ymm0
+ vmovdqa 32(%r10),%ymm1
+ vmovdqa 64(%r10),%ymm5
+ vpbroadcastd %xmm4,%ymm4
+
+ vpaddd %ymm5,%ymm0,%ymm2
+ vpcmpeqd %ymm4,%ymm0,%ymm0
+ vpaddd %ymm5,%ymm1,%ymm3
+ vpcmpeqd %ymm4,%ymm1,%ymm1
+ vmovdqa %ymm0,0+128(%rax)
+ vpaddd %ymm5,%ymm2,%ymm0
+ vpcmpeqd %ymm4,%ymm2,%ymm2
+ vmovdqa %ymm1,32+128(%rax)
+ vpaddd %ymm5,%ymm3,%ymm1
+ vpcmpeqd %ymm4,%ymm3,%ymm3
+ vmovdqa %ymm2,64+128(%rax)
+ vpaddd %ymm5,%ymm0,%ymm2
+ vpcmpeqd %ymm4,%ymm0,%ymm0
+ vmovdqa %ymm3,96+128(%rax)
+ vpaddd %ymm5,%ymm1,%ymm3
+ vpcmpeqd %ymm4,%ymm1,%ymm1
+ vmovdqa %ymm0,128+128(%rax)
+ vpaddd %ymm5,%ymm2,%ymm8
+ vpcmpeqd %ymm4,%ymm2,%ymm2
+ vmovdqa %ymm1,160+128(%rax)
+ vpaddd %ymm5,%ymm3,%ymm9
+ vpcmpeqd %ymm4,%ymm3,%ymm3
+ vmovdqa %ymm2,192+128(%rax)
+ vpaddd %ymm5,%ymm8,%ymm10
+ vpcmpeqd %ymm4,%ymm8,%ymm8
+ vmovdqa %ymm3,224+128(%rax)
+ vpaddd %ymm5,%ymm9,%ymm11
+ vpcmpeqd %ymm4,%ymm9,%ymm9
+ vpaddd %ymm5,%ymm10,%ymm12
+ vpcmpeqd %ymm4,%ymm10,%ymm10
+ vpaddd %ymm5,%ymm11,%ymm13
+ vpcmpeqd %ymm4,%ymm11,%ymm11
+ vpaddd %ymm5,%ymm12,%ymm14
+ vpcmpeqd %ymm4,%ymm12,%ymm12
+ vpaddd %ymm5,%ymm13,%ymm15
+ vpcmpeqd %ymm4,%ymm13,%ymm13
+ vpcmpeqd %ymm4,%ymm14,%ymm14
+ vpcmpeqd %ymm4,%ymm15,%ymm15
+
+ vmovdqa -32(%r10),%ymm7
+ leaq 128(%rsi),%rsi
+ movl $9,%edx
+
+.Loop_gather_1024:
+ vmovdqa 0-128(%rsi),%ymm0
+ vmovdqa 32-128(%rsi),%ymm1
+ vmovdqa 64-128(%rsi),%ymm2
+ vmovdqa 96-128(%rsi),%ymm3
+ vpand 0+128(%rax),%ymm0,%ymm0
+ vpand 32+128(%rax),%ymm1,%ymm1
+ vpand 64+128(%rax),%ymm2,%ymm2
+ vpor %ymm0,%ymm1,%ymm4
+ vpand 96+128(%rax),%ymm3,%ymm3
+ vmovdqa 128-128(%rsi),%ymm0
+ vmovdqa 160-128(%rsi),%ymm1
+ vpor %ymm2,%ymm3,%ymm5
+ vmovdqa 192-128(%rsi),%ymm2
+ vmovdqa 224-128(%rsi),%ymm3
+ vpand 128+128(%rax),%ymm0,%ymm0
+ vpand 160+128(%rax),%ymm1,%ymm1
+ vpand 192+128(%rax),%ymm2,%ymm2
+ vpor %ymm0,%ymm4,%ymm4
+ vpand 224+128(%rax),%ymm3,%ymm3
+ vpand 256-128(%rsi),%ymm8,%ymm0
+ vpor %ymm1,%ymm5,%ymm5
+ vpand 288-128(%rsi),%ymm9,%ymm1
+ vpor %ymm2,%ymm4,%ymm4
+ vpand 320-128(%rsi),%ymm10,%ymm2
+ vpor %ymm3,%ymm5,%ymm5
+ vpand 352-128(%rsi),%ymm11,%ymm3
+ vpor %ymm0,%ymm4,%ymm4
+ vpand 384-128(%rsi),%ymm12,%ymm0
+ vpor %ymm1,%ymm5,%ymm5
+ vpand 416-128(%rsi),%ymm13,%ymm1
+ vpor %ymm2,%ymm4,%ymm4
+ vpand 448-128(%rsi),%ymm14,%ymm2
+ vpor %ymm3,%ymm5,%ymm5
+ vpand 480-128(%rsi),%ymm15,%ymm3
+ leaq 512(%rsi),%rsi
+ vpor %ymm0,%ymm4,%ymm4
+ vpor %ymm1,%ymm5,%ymm5
+ vpor %ymm2,%ymm4,%ymm4
+ vpor %ymm3,%ymm5,%ymm5
+
+ vpor %ymm5,%ymm4,%ymm4
+ vextracti128 $1,%ymm4,%xmm5
+ vpor %xmm4,%xmm5,%xmm5
+ vpermd %ymm5,%ymm7,%ymm5
+ vmovdqu %ymm5,(%rdi)
+ leaq 32(%rdi),%rdi
+ decl %edx
+ jnz .Loop_gather_1024
+
+ vpxor %ymm0,%ymm0,%ymm0
+ vmovdqu %ymm0,(%rdi)
+ vzeroupper
+ leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
+ ret
+.cfi_endproc
+.LSEH_end_rsaz_1024_gather5:
+.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
+.section .rodata
+.align 64
+.Land_mask:
+.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
+.Lscatter_permd:
+.long 0,2,4,6,7,7,7,7
+.Lgather_permd:
+.long 0,7,1,7,2,7,3,7
+.Linc:
+.long 0,0,0,0, 1,1,1,1
+.long 2,2,2,2, 3,3,3,3
+.long 4,4,4,4, 4,4,4,4
+.align 64
+.text
+#endif
diff --git a/gen/bcm/rsaz-avx2-win.asm b/gen/bcm/rsaz-avx2-win.asm
new file mode 100644
index 0000000..beadbdd
--- /dev/null
+++ b/gen/bcm/rsaz-avx2-win.asm
@@ -0,0 +1,1987 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .text code align=64
+
+
+global rsaz_1024_sqr_avx2
+
+ALIGN 64
+rsaz_1024_sqr_avx2:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_rsaz_1024_sqr_avx2:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+ lea rax,[rsp]
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ vzeroupper
+ lea rsp,[((-168))+rsp]
+ vmovaps XMMWORD[(-216)+rax],xmm6
+ vmovaps XMMWORD[(-200)+rax],xmm7
+ vmovaps XMMWORD[(-184)+rax],xmm8
+ vmovaps XMMWORD[(-168)+rax],xmm9
+ vmovaps XMMWORD[(-152)+rax],xmm10
+ vmovaps XMMWORD[(-136)+rax],xmm11
+ vmovaps XMMWORD[(-120)+rax],xmm12
+ vmovaps XMMWORD[(-104)+rax],xmm13
+ vmovaps XMMWORD[(-88)+rax],xmm14
+ vmovaps XMMWORD[(-72)+rax],xmm15
+$L$sqr_1024_body:
+ mov rbp,rax
+
+ mov r13,rdx
+ sub rsp,832
+ mov r15,r13
+ sub rdi,-128
+ sub rsi,-128
+ sub r13,-128
+
+ and r15,4095
+ add r15,32*10
+ shr r15,12
+ vpxor ymm9,ymm9,ymm9
+ jz NEAR $L$sqr_1024_no_n_copy
+
+
+
+
+
+ sub rsp,32*10
+ vmovdqu ymm0,YMMWORD[((0-128))+r13]
+ and rsp,-2048
+ vmovdqu ymm1,YMMWORD[((32-128))+r13]
+ vmovdqu ymm2,YMMWORD[((64-128))+r13]
+ vmovdqu ymm3,YMMWORD[((96-128))+r13]
+ vmovdqu ymm4,YMMWORD[((128-128))+r13]
+ vmovdqu ymm5,YMMWORD[((160-128))+r13]
+ vmovdqu ymm6,YMMWORD[((192-128))+r13]
+ vmovdqu ymm7,YMMWORD[((224-128))+r13]
+ vmovdqu ymm8,YMMWORD[((256-128))+r13]
+ lea r13,[((832+128))+rsp]
+ vmovdqu YMMWORD[(0-128)+r13],ymm0
+ vmovdqu YMMWORD[(32-128)+r13],ymm1
+ vmovdqu YMMWORD[(64-128)+r13],ymm2
+ vmovdqu YMMWORD[(96-128)+r13],ymm3
+ vmovdqu YMMWORD[(128-128)+r13],ymm4
+ vmovdqu YMMWORD[(160-128)+r13],ymm5
+ vmovdqu YMMWORD[(192-128)+r13],ymm6
+ vmovdqu YMMWORD[(224-128)+r13],ymm7
+ vmovdqu YMMWORD[(256-128)+r13],ymm8
+ vmovdqu YMMWORD[(288-128)+r13],ymm9
+
+$L$sqr_1024_no_n_copy:
+ and rsp,-1024
+
+ vmovdqu ymm1,YMMWORD[((32-128))+rsi]
+ vmovdqu ymm2,YMMWORD[((64-128))+rsi]
+ vmovdqu ymm3,YMMWORD[((96-128))+rsi]
+ vmovdqu ymm4,YMMWORD[((128-128))+rsi]
+ vmovdqu ymm5,YMMWORD[((160-128))+rsi]
+ vmovdqu ymm6,YMMWORD[((192-128))+rsi]
+ vmovdqu ymm7,YMMWORD[((224-128))+rsi]
+ vmovdqu ymm8,YMMWORD[((256-128))+rsi]
+
+ lea rbx,[192+rsp]
+ vmovdqu ymm15,YMMWORD[$L$and_mask]
+ jmp NEAR $L$OOP_GRANDE_SQR_1024
+
+ALIGN 32
+$L$OOP_GRANDE_SQR_1024:
+ lea r9,[((576+128))+rsp]
+ lea r12,[448+rsp]
+
+
+
+
+ vpaddq ymm1,ymm1,ymm1
+ vpbroadcastq ymm10,QWORD[((0-128))+rsi]
+ vpaddq ymm2,ymm2,ymm2
+ vmovdqa YMMWORD[(0-128)+r9],ymm1
+ vpaddq ymm3,ymm3,ymm3
+ vmovdqa YMMWORD[(32-128)+r9],ymm2
+ vpaddq ymm4,ymm4,ymm4
+ vmovdqa YMMWORD[(64-128)+r9],ymm3
+ vpaddq ymm5,ymm5,ymm5
+ vmovdqa YMMWORD[(96-128)+r9],ymm4
+ vpaddq ymm6,ymm6,ymm6
+ vmovdqa YMMWORD[(128-128)+r9],ymm5
+ vpaddq ymm7,ymm7,ymm7
+ vmovdqa YMMWORD[(160-128)+r9],ymm6
+ vpaddq ymm8,ymm8,ymm8
+ vmovdqa YMMWORD[(192-128)+r9],ymm7
+ vpxor ymm9,ymm9,ymm9
+ vmovdqa YMMWORD[(224-128)+r9],ymm8
+
+ vpmuludq ymm0,ymm10,YMMWORD[((0-128))+rsi]
+ vpbroadcastq ymm11,QWORD[((32-128))+rsi]
+ vmovdqu YMMWORD[(288-192)+rbx],ymm9
+ vpmuludq ymm1,ymm1,ymm10
+ vmovdqu YMMWORD[(320-448)+r12],ymm9
+ vpmuludq ymm2,ymm2,ymm10
+ vmovdqu YMMWORD[(352-448)+r12],ymm9
+ vpmuludq ymm3,ymm3,ymm10
+ vmovdqu YMMWORD[(384-448)+r12],ymm9
+ vpmuludq ymm4,ymm4,ymm10
+ vmovdqu YMMWORD[(416-448)+r12],ymm9
+ vpmuludq ymm5,ymm5,ymm10
+ vmovdqu YMMWORD[(448-448)+r12],ymm9
+ vpmuludq ymm6,ymm6,ymm10
+ vmovdqu YMMWORD[(480-448)+r12],ymm9
+ vpmuludq ymm7,ymm7,ymm10
+ vmovdqu YMMWORD[(512-448)+r12],ymm9
+ vpmuludq ymm8,ymm8,ymm10
+ vpbroadcastq ymm10,QWORD[((64-128))+rsi]
+ vmovdqu YMMWORD[(544-448)+r12],ymm9
+
+ mov r15,rsi
+ mov r14d,4
+ jmp NEAR $L$sqr_entry_1024
+ALIGN 32
+$L$OOP_SQR_1024:
+ vpbroadcastq ymm11,QWORD[((32-128))+r15]
+ vpmuludq ymm0,ymm10,YMMWORD[((0-128))+rsi]
+ vpaddq ymm0,ymm0,YMMWORD[((0-192))+rbx]
+ vpmuludq ymm1,ymm10,YMMWORD[((0-128))+r9]
+ vpaddq ymm1,ymm1,YMMWORD[((32-192))+rbx]
+ vpmuludq ymm2,ymm10,YMMWORD[((32-128))+r9]
+ vpaddq ymm2,ymm2,YMMWORD[((64-192))+rbx]
+ vpmuludq ymm3,ymm10,YMMWORD[((64-128))+r9]
+ vpaddq ymm3,ymm3,YMMWORD[((96-192))+rbx]
+ vpmuludq ymm4,ymm10,YMMWORD[((96-128))+r9]
+ vpaddq ymm4,ymm4,YMMWORD[((128-192))+rbx]
+ vpmuludq ymm5,ymm10,YMMWORD[((128-128))+r9]
+ vpaddq ymm5,ymm5,YMMWORD[((160-192))+rbx]
+ vpmuludq ymm6,ymm10,YMMWORD[((160-128))+r9]
+ vpaddq ymm6,ymm6,YMMWORD[((192-192))+rbx]
+ vpmuludq ymm7,ymm10,YMMWORD[((192-128))+r9]
+ vpaddq ymm7,ymm7,YMMWORD[((224-192))+rbx]
+ vpmuludq ymm8,ymm10,YMMWORD[((224-128))+r9]
+ vpbroadcastq ymm10,QWORD[((64-128))+r15]
+ vpaddq ymm8,ymm8,YMMWORD[((256-192))+rbx]
+$L$sqr_entry_1024:
+ vmovdqu YMMWORD[(0-192)+rbx],ymm0
+ vmovdqu YMMWORD[(32-192)+rbx],ymm1
+
+ vpmuludq ymm12,ymm11,YMMWORD[((32-128))+rsi]
+ vpaddq ymm2,ymm2,ymm12
+ vpmuludq ymm14,ymm11,YMMWORD[((32-128))+r9]
+ vpaddq ymm3,ymm3,ymm14
+ vpmuludq ymm13,ymm11,YMMWORD[((64-128))+r9]
+ vpaddq ymm4,ymm4,ymm13
+ vpmuludq ymm12,ymm11,YMMWORD[((96-128))+r9]
+ vpaddq ymm5,ymm5,ymm12
+ vpmuludq ymm14,ymm11,YMMWORD[((128-128))+r9]
+ vpaddq ymm6,ymm6,ymm14
+ vpmuludq ymm13,ymm11,YMMWORD[((160-128))+r9]
+ vpaddq ymm7,ymm7,ymm13
+ vpmuludq ymm12,ymm11,YMMWORD[((192-128))+r9]
+ vpaddq ymm8,ymm8,ymm12
+ vpmuludq ymm0,ymm11,YMMWORD[((224-128))+r9]
+ vpbroadcastq ymm11,QWORD[((96-128))+r15]
+ vpaddq ymm0,ymm0,YMMWORD[((288-192))+rbx]
+
+ vmovdqu YMMWORD[(64-192)+rbx],ymm2
+ vmovdqu YMMWORD[(96-192)+rbx],ymm3
+
+ vpmuludq ymm13,ymm10,YMMWORD[((64-128))+rsi]
+ vpaddq ymm4,ymm4,ymm13
+ vpmuludq ymm12,ymm10,YMMWORD[((64-128))+r9]
+ vpaddq ymm5,ymm5,ymm12
+ vpmuludq ymm14,ymm10,YMMWORD[((96-128))+r9]
+ vpaddq ymm6,ymm6,ymm14
+ vpmuludq ymm13,ymm10,YMMWORD[((128-128))+r9]
+ vpaddq ymm7,ymm7,ymm13
+ vpmuludq ymm12,ymm10,YMMWORD[((160-128))+r9]
+ vpaddq ymm8,ymm8,ymm12
+ vpmuludq ymm14,ymm10,YMMWORD[((192-128))+r9]
+ vpaddq ymm0,ymm0,ymm14
+ vpmuludq ymm1,ymm10,YMMWORD[((224-128))+r9]
+ vpbroadcastq ymm10,QWORD[((128-128))+r15]
+ vpaddq ymm1,ymm1,YMMWORD[((320-448))+r12]
+
+ vmovdqu YMMWORD[(128-192)+rbx],ymm4
+ vmovdqu YMMWORD[(160-192)+rbx],ymm5
+
+ vpmuludq ymm12,ymm11,YMMWORD[((96-128))+rsi]
+ vpaddq ymm6,ymm6,ymm12
+ vpmuludq ymm14,ymm11,YMMWORD[((96-128))+r9]
+ vpaddq ymm7,ymm7,ymm14
+ vpmuludq ymm13,ymm11,YMMWORD[((128-128))+r9]
+ vpaddq ymm8,ymm8,ymm13
+ vpmuludq ymm12,ymm11,YMMWORD[((160-128))+r9]
+ vpaddq ymm0,ymm0,ymm12
+ vpmuludq ymm14,ymm11,YMMWORD[((192-128))+r9]
+ vpaddq ymm1,ymm1,ymm14
+ vpmuludq ymm2,ymm11,YMMWORD[((224-128))+r9]
+ vpbroadcastq ymm11,QWORD[((160-128))+r15]
+ vpaddq ymm2,ymm2,YMMWORD[((352-448))+r12]
+
+ vmovdqu YMMWORD[(192-192)+rbx],ymm6
+ vmovdqu YMMWORD[(224-192)+rbx],ymm7
+
+ vpmuludq ymm12,ymm10,YMMWORD[((128-128))+rsi]
+ vpaddq ymm8,ymm8,ymm12
+ vpmuludq ymm14,ymm10,YMMWORD[((128-128))+r9]
+ vpaddq ymm0,ymm0,ymm14
+ vpmuludq ymm13,ymm10,YMMWORD[((160-128))+r9]
+ vpaddq ymm1,ymm1,ymm13
+ vpmuludq ymm12,ymm10,YMMWORD[((192-128))+r9]
+ vpaddq ymm2,ymm2,ymm12
+ vpmuludq ymm3,ymm10,YMMWORD[((224-128))+r9]
+ vpbroadcastq ymm10,QWORD[((192-128))+r15]
+ vpaddq ymm3,ymm3,YMMWORD[((384-448))+r12]
+
+ vmovdqu YMMWORD[(256-192)+rbx],ymm8
+ vmovdqu YMMWORD[(288-192)+rbx],ymm0
+ lea rbx,[8+rbx]
+
+ vpmuludq ymm13,ymm11,YMMWORD[((160-128))+rsi]
+ vpaddq ymm1,ymm1,ymm13
+ vpmuludq ymm12,ymm11,YMMWORD[((160-128))+r9]
+ vpaddq ymm2,ymm2,ymm12
+ vpmuludq ymm14,ymm11,YMMWORD[((192-128))+r9]
+ vpaddq ymm3,ymm3,ymm14
+ vpmuludq ymm4,ymm11,YMMWORD[((224-128))+r9]
+ vpbroadcastq ymm11,QWORD[((224-128))+r15]
+ vpaddq ymm4,ymm4,YMMWORD[((416-448))+r12]
+
+ vmovdqu YMMWORD[(320-448)+r12],ymm1
+ vmovdqu YMMWORD[(352-448)+r12],ymm2
+
+ vpmuludq ymm12,ymm10,YMMWORD[((192-128))+rsi]
+ vpaddq ymm3,ymm3,ymm12
+ vpmuludq ymm14,ymm10,YMMWORD[((192-128))+r9]
+ vpbroadcastq ymm0,QWORD[((256-128))+r15]
+ vpaddq ymm4,ymm4,ymm14
+ vpmuludq ymm5,ymm10,YMMWORD[((224-128))+r9]
+ vpbroadcastq ymm10,QWORD[((0+8-128))+r15]
+ vpaddq ymm5,ymm5,YMMWORD[((448-448))+r12]
+
+ vmovdqu YMMWORD[(384-448)+r12],ymm3
+ vmovdqu YMMWORD[(416-448)+r12],ymm4
+ lea r15,[8+r15]
+
+ vpmuludq ymm12,ymm11,YMMWORD[((224-128))+rsi]
+ vpaddq ymm5,ymm5,ymm12
+ vpmuludq ymm6,ymm11,YMMWORD[((224-128))+r9]
+ vpaddq ymm6,ymm6,YMMWORD[((480-448))+r12]
+
+ vpmuludq ymm7,ymm0,YMMWORD[((256-128))+rsi]
+ vmovdqu YMMWORD[(448-448)+r12],ymm5
+ vpaddq ymm7,ymm7,YMMWORD[((512-448))+r12]
+ vmovdqu YMMWORD[(480-448)+r12],ymm6
+ vmovdqu YMMWORD[(512-448)+r12],ymm7
+ lea r12,[8+r12]
+
+ dec r14d
+ jnz NEAR $L$OOP_SQR_1024
+
+ vmovdqu ymm8,YMMWORD[256+rsp]
+ vmovdqu ymm1,YMMWORD[288+rsp]
+ vmovdqu ymm2,YMMWORD[320+rsp]
+ lea rbx,[192+rsp]
+
+ vpsrlq ymm14,ymm8,29
+ vpand ymm8,ymm8,ymm15
+ vpsrlq ymm11,ymm1,29
+ vpand ymm1,ymm1,ymm15
+
+ vpermq ymm14,ymm14,0x93
+ vpxor ymm9,ymm9,ymm9
+ vpermq ymm11,ymm11,0x93
+
+ vpblendd ymm10,ymm14,ymm9,3
+ vpblendd ymm14,ymm11,ymm14,3
+ vpaddq ymm8,ymm8,ymm10
+ vpblendd ymm11,ymm9,ymm11,3
+ vpaddq ymm1,ymm1,ymm14
+ vpaddq ymm2,ymm2,ymm11
+ vmovdqu YMMWORD[(288-192)+rbx],ymm1
+ vmovdqu YMMWORD[(320-192)+rbx],ymm2
+
+ mov rax,QWORD[rsp]
+ mov r10,QWORD[8+rsp]
+ mov r11,QWORD[16+rsp]
+ mov r12,QWORD[24+rsp]
+ vmovdqu ymm1,YMMWORD[32+rsp]
+ vmovdqu ymm2,YMMWORD[((64-192))+rbx]
+ vmovdqu ymm3,YMMWORD[((96-192))+rbx]
+ vmovdqu ymm4,YMMWORD[((128-192))+rbx]
+ vmovdqu ymm5,YMMWORD[((160-192))+rbx]
+ vmovdqu ymm6,YMMWORD[((192-192))+rbx]
+ vmovdqu ymm7,YMMWORD[((224-192))+rbx]
+
+ mov r9,rax
+ imul eax,ecx
+ and eax,0x1fffffff
+ vmovd xmm12,eax
+
+ mov rdx,rax
+ imul rax,QWORD[((-128))+r13]
+ vpbroadcastq ymm12,xmm12
+ add r9,rax
+ mov rax,rdx
+ imul rax,QWORD[((8-128))+r13]
+ shr r9,29
+ add r10,rax
+ mov rax,rdx
+ imul rax,QWORD[((16-128))+r13]
+ add r10,r9
+ add r11,rax
+ imul rdx,QWORD[((24-128))+r13]
+ add r12,rdx
+
+ mov rax,r10
+ imul eax,ecx
+ and eax,0x1fffffff
+
+ mov r14d,9
+ jmp NEAR $L$OOP_REDUCE_1024
+
+ALIGN 32
+$L$OOP_REDUCE_1024:
+ vmovd xmm13,eax
+ vpbroadcastq ymm13,xmm13
+
+ vpmuludq ymm10,ymm12,YMMWORD[((32-128))+r13]
+ mov rdx,rax
+ imul rax,QWORD[((-128))+r13]
+ vpaddq ymm1,ymm1,ymm10
+ add r10,rax
+ vpmuludq ymm14,ymm12,YMMWORD[((64-128))+r13]
+ mov rax,rdx
+ imul rax,QWORD[((8-128))+r13]
+ vpaddq ymm2,ymm2,ymm14
+ vpmuludq ymm11,ymm12,YMMWORD[((96-128))+r13]
+ DB 0x67
+ add r11,rax
+ DB 0x67
+ mov rax,rdx
+ imul rax,QWORD[((16-128))+r13]
+ shr r10,29
+ vpaddq ymm3,ymm3,ymm11
+ vpmuludq ymm10,ymm12,YMMWORD[((128-128))+r13]
+ add r12,rax
+ add r11,r10
+ vpaddq ymm4,ymm4,ymm10
+ vpmuludq ymm14,ymm12,YMMWORD[((160-128))+r13]
+ mov rax,r11
+ imul eax,ecx
+ vpaddq ymm5,ymm5,ymm14
+ vpmuludq ymm11,ymm12,YMMWORD[((192-128))+r13]
+ and eax,0x1fffffff
+ vpaddq ymm6,ymm6,ymm11
+ vpmuludq ymm10,ymm12,YMMWORD[((224-128))+r13]
+ vpaddq ymm7,ymm7,ymm10
+ vpmuludq ymm14,ymm12,YMMWORD[((256-128))+r13]
+ vmovd xmm12,eax
+
+ vpaddq ymm8,ymm8,ymm14
+
+ vpbroadcastq ymm12,xmm12
+
+ vpmuludq ymm11,ymm13,YMMWORD[((32-8-128))+r13]
+ vmovdqu ymm14,YMMWORD[((96-8-128))+r13]
+ mov rdx,rax
+ imul rax,QWORD[((-128))+r13]
+ vpaddq ymm1,ymm1,ymm11
+ vpmuludq ymm10,ymm13,YMMWORD[((64-8-128))+r13]
+ vmovdqu ymm11,YMMWORD[((128-8-128))+r13]
+ add r11,rax
+ mov rax,rdx
+ imul rax,QWORD[((8-128))+r13]
+ vpaddq ymm2,ymm2,ymm10
+ add rax,r12
+ shr r11,29
+ vpmuludq ymm14,ymm14,ymm13
+ vmovdqu ymm10,YMMWORD[((160-8-128))+r13]
+ add rax,r11
+ vpaddq ymm3,ymm3,ymm14
+ vpmuludq ymm11,ymm11,ymm13
+ vmovdqu ymm14,YMMWORD[((192-8-128))+r13]
+ DB 0x67
+ mov r12,rax
+ imul eax,ecx
+ vpaddq ymm4,ymm4,ymm11
+ vpmuludq ymm10,ymm10,ymm13
+ DB 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00
+ and eax,0x1fffffff
+ vpaddq ymm5,ymm5,ymm10
+ vpmuludq ymm14,ymm14,ymm13
+ vmovdqu ymm10,YMMWORD[((256-8-128))+r13]
+ vpaddq ymm6,ymm6,ymm14
+ vpmuludq ymm11,ymm11,ymm13
+ vmovdqu ymm9,YMMWORD[((288-8-128))+r13]
+ vmovd xmm0,eax
+ imul rax,QWORD[((-128))+r13]
+ vpaddq ymm7,ymm7,ymm11
+ vpmuludq ymm10,ymm10,ymm13
+ vmovdqu ymm14,YMMWORD[((32-16-128))+r13]
+ vpbroadcastq ymm0,xmm0
+ vpaddq ymm8,ymm8,ymm10
+ vpmuludq ymm9,ymm9,ymm13
+ vmovdqu ymm11,YMMWORD[((64-16-128))+r13]
+ add r12,rax
+
+ vmovdqu ymm13,YMMWORD[((32-24-128))+r13]
+ vpmuludq ymm14,ymm14,ymm12
+ vmovdqu ymm10,YMMWORD[((96-16-128))+r13]
+ vpaddq ymm1,ymm1,ymm14
+ vpmuludq ymm13,ymm13,ymm0
+ vpmuludq ymm11,ymm11,ymm12
+ DB 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff
+ vpaddq ymm13,ymm13,ymm1
+ vpaddq ymm2,ymm2,ymm11
+ vpmuludq ymm10,ymm10,ymm12
+ vmovdqu ymm11,YMMWORD[((160-16-128))+r13]
+ DB 0x67
+ vmovq rax,xmm13
+ vmovdqu YMMWORD[rsp],ymm13
+ vpaddq ymm3,ymm3,ymm10
+ vpmuludq ymm14,ymm14,ymm12
+ vmovdqu ymm10,YMMWORD[((192-16-128))+r13]
+ vpaddq ymm4,ymm4,ymm14
+ vpmuludq ymm11,ymm11,ymm12
+ vmovdqu ymm14,YMMWORD[((224-16-128))+r13]
+ vpaddq ymm5,ymm5,ymm11
+ vpmuludq ymm10,ymm10,ymm12
+ vmovdqu ymm11,YMMWORD[((256-16-128))+r13]
+ vpaddq ymm6,ymm6,ymm10
+ vpmuludq ymm14,ymm14,ymm12
+ shr r12,29
+ vmovdqu ymm10,YMMWORD[((288-16-128))+r13]
+ add rax,r12
+ vpaddq ymm7,ymm7,ymm14
+ vpmuludq ymm11,ymm11,ymm12
+
+ mov r9,rax
+ imul eax,ecx
+ vpaddq ymm8,ymm8,ymm11
+ vpmuludq ymm10,ymm10,ymm12
+ and eax,0x1fffffff
+ vmovd xmm12,eax
+ vmovdqu ymm11,YMMWORD[((96-24-128))+r13]
+ DB 0x67
+ vpaddq ymm9,ymm9,ymm10
+ vpbroadcastq ymm12,xmm12
+
+ vpmuludq ymm14,ymm0,YMMWORD[((64-24-128))+r13]
+ vmovdqu ymm10,YMMWORD[((128-24-128))+r13]
+ mov rdx,rax
+ imul rax,QWORD[((-128))+r13]
+ mov r10,QWORD[8+rsp]
+ vpaddq ymm1,ymm2,ymm14
+ vpmuludq ymm11,ymm11,ymm0
+ vmovdqu ymm14,YMMWORD[((160-24-128))+r13]
+ add r9,rax
+ mov rax,rdx
+ imul rax,QWORD[((8-128))+r13]
+ DB 0x67
+ shr r9,29
+ mov r11,QWORD[16+rsp]
+ vpaddq ymm2,ymm3,ymm11
+ vpmuludq ymm10,ymm10,ymm0
+ vmovdqu ymm11,YMMWORD[((192-24-128))+r13]
+ add r10,rax
+ mov rax,rdx
+ imul rax,QWORD[((16-128))+r13]
+ vpaddq ymm3,ymm4,ymm10
+ vpmuludq ymm14,ymm14,ymm0
+ vmovdqu ymm10,YMMWORD[((224-24-128))+r13]
+ imul rdx,QWORD[((24-128))+r13]
+ add r11,rax
+ lea rax,[r10*1+r9]
+ vpaddq ymm4,ymm5,ymm14
+ vpmuludq ymm11,ymm11,ymm0
+ vmovdqu ymm14,YMMWORD[((256-24-128))+r13]
+ mov r10,rax
+ imul eax,ecx
+ vpmuludq ymm10,ymm10,ymm0
+ vpaddq ymm5,ymm6,ymm11
+ vmovdqu ymm11,YMMWORD[((288-24-128))+r13]
+ and eax,0x1fffffff
+ vpaddq ymm6,ymm7,ymm10
+ vpmuludq ymm14,ymm14,ymm0
+ add rdx,QWORD[24+rsp]
+ vpaddq ymm7,ymm8,ymm14
+ vpmuludq ymm11,ymm11,ymm0
+ vpaddq ymm8,ymm9,ymm11
+ vmovq xmm9,r12
+ mov r12,rdx
+
+ dec r14d
+ jnz NEAR $L$OOP_REDUCE_1024
+ lea r12,[448+rsp]
+ vpaddq ymm0,ymm13,ymm9
+ vpxor ymm9,ymm9,ymm9
+
+ vpaddq ymm0,ymm0,YMMWORD[((288-192))+rbx]
+ vpaddq ymm1,ymm1,YMMWORD[((320-448))+r12]
+ vpaddq ymm2,ymm2,YMMWORD[((352-448))+r12]
+ vpaddq ymm3,ymm3,YMMWORD[((384-448))+r12]
+ vpaddq ymm4,ymm4,YMMWORD[((416-448))+r12]
+ vpaddq ymm5,ymm5,YMMWORD[((448-448))+r12]
+ vpaddq ymm6,ymm6,YMMWORD[((480-448))+r12]
+ vpaddq ymm7,ymm7,YMMWORD[((512-448))+r12]
+ vpaddq ymm8,ymm8,YMMWORD[((544-448))+r12]
+
+ vpsrlq ymm14,ymm0,29
+ vpand ymm0,ymm0,ymm15
+ vpsrlq ymm11,ymm1,29
+ vpand ymm1,ymm1,ymm15
+ vpsrlq ymm12,ymm2,29
+ vpermq ymm14,ymm14,0x93
+ vpand ymm2,ymm2,ymm15
+ vpsrlq ymm13,ymm3,29
+ vpermq ymm11,ymm11,0x93
+ vpand ymm3,ymm3,ymm15
+ vpermq ymm12,ymm12,0x93
+
+ vpblendd ymm10,ymm14,ymm9,3
+ vpermq ymm13,ymm13,0x93
+ vpblendd ymm14,ymm11,ymm14,3
+ vpaddq ymm0,ymm0,ymm10
+ vpblendd ymm11,ymm12,ymm11,3
+ vpaddq ymm1,ymm1,ymm14
+ vpblendd ymm12,ymm13,ymm12,3
+ vpaddq ymm2,ymm2,ymm11
+ vpblendd ymm13,ymm9,ymm13,3
+ vpaddq ymm3,ymm3,ymm12
+ vpaddq ymm4,ymm4,ymm13
+
+ vpsrlq ymm14,ymm0,29
+ vpand ymm0,ymm0,ymm15
+ vpsrlq ymm11,ymm1,29
+ vpand ymm1,ymm1,ymm15
+ vpsrlq ymm12,ymm2,29
+ vpermq ymm14,ymm14,0x93
+ vpand ymm2,ymm2,ymm15
+ vpsrlq ymm13,ymm3,29
+ vpermq ymm11,ymm11,0x93
+ vpand ymm3,ymm3,ymm15
+ vpermq ymm12,ymm12,0x93
+
+ vpblendd ymm10,ymm14,ymm9,3
+ vpermq ymm13,ymm13,0x93
+ vpblendd ymm14,ymm11,ymm14,3
+ vpaddq ymm0,ymm0,ymm10
+ vpblendd ymm11,ymm12,ymm11,3
+ vpaddq ymm1,ymm1,ymm14
+ vmovdqu YMMWORD[(0-128)+rdi],ymm0
+ vpblendd ymm12,ymm13,ymm12,3
+ vpaddq ymm2,ymm2,ymm11
+ vmovdqu YMMWORD[(32-128)+rdi],ymm1
+ vpblendd ymm13,ymm9,ymm13,3
+ vpaddq ymm3,ymm3,ymm12
+ vmovdqu YMMWORD[(64-128)+rdi],ymm2
+ vpaddq ymm4,ymm4,ymm13
+ vmovdqu YMMWORD[(96-128)+rdi],ymm3
+ vpsrlq ymm14,ymm4,29
+ vpand ymm4,ymm4,ymm15
+ vpsrlq ymm11,ymm5,29
+ vpand ymm5,ymm5,ymm15
+ vpsrlq ymm12,ymm6,29
+ vpermq ymm14,ymm14,0x93
+ vpand ymm6,ymm6,ymm15
+ vpsrlq ymm13,ymm7,29
+ vpermq ymm11,ymm11,0x93
+ vpand ymm7,ymm7,ymm15
+ vpsrlq ymm0,ymm8,29
+ vpermq ymm12,ymm12,0x93
+ vpand ymm8,ymm8,ymm15
+ vpermq ymm13,ymm13,0x93
+
+ vpblendd ymm10,ymm14,ymm9,3
+ vpermq ymm0,ymm0,0x93
+ vpblendd ymm14,ymm11,ymm14,3
+ vpaddq ymm4,ymm4,ymm10
+ vpblendd ymm11,ymm12,ymm11,3
+ vpaddq ymm5,ymm5,ymm14
+ vpblendd ymm12,ymm13,ymm12,3
+ vpaddq ymm6,ymm6,ymm11
+ vpblendd ymm13,ymm0,ymm13,3
+ vpaddq ymm7,ymm7,ymm12
+ vpaddq ymm8,ymm8,ymm13
+
+ vpsrlq ymm14,ymm4,29
+ vpand ymm4,ymm4,ymm15
+ vpsrlq ymm11,ymm5,29
+ vpand ymm5,ymm5,ymm15
+ vpsrlq ymm12,ymm6,29
+ vpermq ymm14,ymm14,0x93
+ vpand ymm6,ymm6,ymm15
+ vpsrlq ymm13,ymm7,29
+ vpermq ymm11,ymm11,0x93
+ vpand ymm7,ymm7,ymm15
+ vpsrlq ymm0,ymm8,29
+ vpermq ymm12,ymm12,0x93
+ vpand ymm8,ymm8,ymm15
+ vpermq ymm13,ymm13,0x93
+
+ vpblendd ymm10,ymm14,ymm9,3
+ vpermq ymm0,ymm0,0x93
+ vpblendd ymm14,ymm11,ymm14,3
+ vpaddq ymm4,ymm4,ymm10
+ vpblendd ymm11,ymm12,ymm11,3
+ vpaddq ymm5,ymm5,ymm14
+ vmovdqu YMMWORD[(128-128)+rdi],ymm4
+ vpblendd ymm12,ymm13,ymm12,3
+ vpaddq ymm6,ymm6,ymm11
+ vmovdqu YMMWORD[(160-128)+rdi],ymm5
+ vpblendd ymm13,ymm0,ymm13,3
+ vpaddq ymm7,ymm7,ymm12
+ vmovdqu YMMWORD[(192-128)+rdi],ymm6
+ vpaddq ymm8,ymm8,ymm13
+ vmovdqu YMMWORD[(224-128)+rdi],ymm7
+ vmovdqu YMMWORD[(256-128)+rdi],ymm8
+
+ mov rsi,rdi
+ dec r8d
+ jne NEAR $L$OOP_GRANDE_SQR_1024
+
+ vzeroall
+ mov rax,rbp
+
+$L$sqr_1024_in_tail:
+ movaps xmm6,XMMWORD[((-216))+rax]
+ movaps xmm7,XMMWORD[((-200))+rax]
+ movaps xmm8,XMMWORD[((-184))+rax]
+ movaps xmm9,XMMWORD[((-168))+rax]
+ movaps xmm10,XMMWORD[((-152))+rax]
+ movaps xmm11,XMMWORD[((-136))+rax]
+ movaps xmm12,XMMWORD[((-120))+rax]
+ movaps xmm13,XMMWORD[((-104))+rax]
+ movaps xmm14,XMMWORD[((-88))+rax]
+ movaps xmm15,XMMWORD[((-72))+rax]
+ mov r15,QWORD[((-48))+rax]
+
+ mov r14,QWORD[((-40))+rax]
+
+ mov r13,QWORD[((-32))+rax]
+
+ mov r12,QWORD[((-24))+rax]
+
+ mov rbp,QWORD[((-16))+rax]
+
+ mov rbx,QWORD[((-8))+rax]
+
+ lea rsp,[rax]
+
+$L$sqr_1024_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_rsaz_1024_sqr_avx2:
+global rsaz_1024_mul_avx2
+
+ALIGN 64
+rsaz_1024_mul_avx2:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_rsaz_1024_mul_avx2:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+ lea rax,[rsp]
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ vzeroupper
+ lea rsp,[((-168))+rsp]
+ vmovaps XMMWORD[(-216)+rax],xmm6
+ vmovaps XMMWORD[(-200)+rax],xmm7
+ vmovaps XMMWORD[(-184)+rax],xmm8
+ vmovaps XMMWORD[(-168)+rax],xmm9
+ vmovaps XMMWORD[(-152)+rax],xmm10
+ vmovaps XMMWORD[(-136)+rax],xmm11
+ vmovaps XMMWORD[(-120)+rax],xmm12
+ vmovaps XMMWORD[(-104)+rax],xmm13
+ vmovaps XMMWORD[(-88)+rax],xmm14
+ vmovaps XMMWORD[(-72)+rax],xmm15
+$L$mul_1024_body:
+ mov rbp,rax
+
+ vzeroall
+ mov r13,rdx
+ sub rsp,64
+
+
+
+
+
+
+ DB 0x67,0x67
+ mov r15,rsi
+ and r15,4095
+ add r15,32*10
+ shr r15,12
+ mov r15,rsi
+ cmovnz rsi,r13
+ cmovnz r13,r15
+
+ mov r15,rcx
+ sub rsi,-128
+ sub rcx,-128
+ sub rdi,-128
+
+ and r15,4095
+ add r15,32*10
+ DB 0x67,0x67
+ shr r15,12
+ jz NEAR $L$mul_1024_no_n_copy
+
+
+
+
+
+ sub rsp,32*10
+ vmovdqu ymm0,YMMWORD[((0-128))+rcx]
+ and rsp,-512
+ vmovdqu ymm1,YMMWORD[((32-128))+rcx]
+ vmovdqu ymm2,YMMWORD[((64-128))+rcx]
+ vmovdqu ymm3,YMMWORD[((96-128))+rcx]
+ vmovdqu ymm4,YMMWORD[((128-128))+rcx]
+ vmovdqu ymm5,YMMWORD[((160-128))+rcx]
+ vmovdqu ymm6,YMMWORD[((192-128))+rcx]
+ vmovdqu ymm7,YMMWORD[((224-128))+rcx]
+ vmovdqu ymm8,YMMWORD[((256-128))+rcx]
+ lea rcx,[((64+128))+rsp]
+ vmovdqu YMMWORD[(0-128)+rcx],ymm0
+ vpxor ymm0,ymm0,ymm0
+ vmovdqu YMMWORD[(32-128)+rcx],ymm1
+ vpxor ymm1,ymm1,ymm1
+ vmovdqu YMMWORD[(64-128)+rcx],ymm2
+ vpxor ymm2,ymm2,ymm2
+ vmovdqu YMMWORD[(96-128)+rcx],ymm3
+ vpxor ymm3,ymm3,ymm3
+ vmovdqu YMMWORD[(128-128)+rcx],ymm4
+ vpxor ymm4,ymm4,ymm4
+ vmovdqu YMMWORD[(160-128)+rcx],ymm5
+ vpxor ymm5,ymm5,ymm5
+ vmovdqu YMMWORD[(192-128)+rcx],ymm6
+ vpxor ymm6,ymm6,ymm6
+ vmovdqu YMMWORD[(224-128)+rcx],ymm7
+ vpxor ymm7,ymm7,ymm7
+ vmovdqu YMMWORD[(256-128)+rcx],ymm8
+ vmovdqa ymm8,ymm0
+ vmovdqu YMMWORD[(288-128)+rcx],ymm9
+$L$mul_1024_no_n_copy:
+ and rsp,-64
+
+ mov rbx,QWORD[r13]
+ vpbroadcastq ymm10,QWORD[r13]
+ vmovdqu YMMWORD[rsp],ymm0
+ xor r9,r9
+ DB 0x67
+ xor r10,r10
+ xor r11,r11
+ xor r12,r12
+
+ vmovdqu ymm15,YMMWORD[$L$and_mask]
+ mov r14d,9
+ vmovdqu YMMWORD[(288-128)+rdi],ymm9
+ jmp NEAR $L$oop_mul_1024
+
+ALIGN 32
+$L$oop_mul_1024:
+ vpsrlq ymm9,ymm3,29
+ mov rax,rbx
+ imul rax,QWORD[((-128))+rsi]
+ add rax,r9
+ mov r10,rbx
+ imul r10,QWORD[((8-128))+rsi]
+ add r10,QWORD[8+rsp]
+
+ mov r9,rax
+ imul eax,r8d
+ and eax,0x1fffffff
+
+ mov r11,rbx
+ imul r11,QWORD[((16-128))+rsi]
+ add r11,QWORD[16+rsp]
+
+ mov r12,rbx
+ imul r12,QWORD[((24-128))+rsi]
+ add r12,QWORD[24+rsp]
+ vpmuludq ymm0,ymm10,YMMWORD[((32-128))+rsi]
+ vmovd xmm11,eax
+ vpaddq ymm1,ymm1,ymm0
+ vpmuludq ymm12,ymm10,YMMWORD[((64-128))+rsi]
+ vpbroadcastq ymm11,xmm11
+ vpaddq ymm2,ymm2,ymm12
+ vpmuludq ymm13,ymm10,YMMWORD[((96-128))+rsi]
+ vpand ymm3,ymm3,ymm15
+ vpaddq ymm3,ymm3,ymm13
+ vpmuludq ymm0,ymm10,YMMWORD[((128-128))+rsi]
+ vpaddq ymm4,ymm4,ymm0
+ vpmuludq ymm12,ymm10,YMMWORD[((160-128))+rsi]
+ vpaddq ymm5,ymm5,ymm12
+ vpmuludq ymm13,ymm10,YMMWORD[((192-128))+rsi]
+ vpaddq ymm6,ymm6,ymm13
+ vpmuludq ymm0,ymm10,YMMWORD[((224-128))+rsi]
+ vpermq ymm9,ymm9,0x93
+ vpaddq ymm7,ymm7,ymm0
+ vpmuludq ymm12,ymm10,YMMWORD[((256-128))+rsi]
+ vpbroadcastq ymm10,QWORD[8+r13]
+ vpaddq ymm8,ymm8,ymm12
+
+ mov rdx,rax
+ imul rax,QWORD[((-128))+rcx]
+ add r9,rax
+ mov rax,rdx
+ imul rax,QWORD[((8-128))+rcx]
+ add r10,rax
+ mov rax,rdx
+ imul rax,QWORD[((16-128))+rcx]
+ add r11,rax
+ shr r9,29
+ imul rdx,QWORD[((24-128))+rcx]
+ add r12,rdx
+ add r10,r9
+
+ vpmuludq ymm13,ymm11,YMMWORD[((32-128))+rcx]
+ vmovq rbx,xmm10
+ vpaddq ymm1,ymm1,ymm13
+ vpmuludq ymm0,ymm11,YMMWORD[((64-128))+rcx]
+ vpaddq ymm2,ymm2,ymm0
+ vpmuludq ymm12,ymm11,YMMWORD[((96-128))+rcx]
+ vpaddq ymm3,ymm3,ymm12
+ vpmuludq ymm13,ymm11,YMMWORD[((128-128))+rcx]
+ vpaddq ymm4,ymm4,ymm13
+ vpmuludq ymm0,ymm11,YMMWORD[((160-128))+rcx]
+ vpaddq ymm5,ymm5,ymm0
+ vpmuludq ymm12,ymm11,YMMWORD[((192-128))+rcx]
+ vpaddq ymm6,ymm6,ymm12
+ vpmuludq ymm13,ymm11,YMMWORD[((224-128))+rcx]
+ vpblendd ymm12,ymm9,ymm14,3
+ vpaddq ymm7,ymm7,ymm13
+ vpmuludq ymm0,ymm11,YMMWORD[((256-128))+rcx]
+ vpaddq ymm3,ymm3,ymm12
+ vpaddq ymm8,ymm8,ymm0
+
+ mov rax,rbx
+ imul rax,QWORD[((-128))+rsi]
+ add r10,rax
+ vmovdqu ymm12,YMMWORD[((-8+32-128))+rsi]
+ mov rax,rbx
+ imul rax,QWORD[((8-128))+rsi]
+ add r11,rax
+ vmovdqu ymm13,YMMWORD[((-8+64-128))+rsi]
+
+ mov rax,r10
+ vpblendd ymm9,ymm9,ymm14,0xfc
+ imul eax,r8d
+ vpaddq ymm4,ymm4,ymm9
+ and eax,0x1fffffff
+
+ imul rbx,QWORD[((16-128))+rsi]
+ add r12,rbx
+ vpmuludq ymm12,ymm12,ymm10
+ vmovd xmm11,eax
+ vmovdqu ymm0,YMMWORD[((-8+96-128))+rsi]
+ vpaddq ymm1,ymm1,ymm12
+ vpmuludq ymm13,ymm13,ymm10
+ vpbroadcastq ymm11,xmm11
+ vmovdqu ymm12,YMMWORD[((-8+128-128))+rsi]
+ vpaddq ymm2,ymm2,ymm13
+ vpmuludq ymm0,ymm0,ymm10
+ vmovdqu ymm13,YMMWORD[((-8+160-128))+rsi]
+ vpaddq ymm3,ymm3,ymm0
+ vpmuludq ymm12,ymm12,ymm10
+ vmovdqu ymm0,YMMWORD[((-8+192-128))+rsi]
+ vpaddq ymm4,ymm4,ymm12
+ vpmuludq ymm13,ymm13,ymm10
+ vmovdqu ymm12,YMMWORD[((-8+224-128))+rsi]
+ vpaddq ymm5,ymm5,ymm13
+ vpmuludq ymm0,ymm0,ymm10
+ vmovdqu ymm13,YMMWORD[((-8+256-128))+rsi]
+ vpaddq ymm6,ymm6,ymm0
+ vpmuludq ymm12,ymm12,ymm10
+ vmovdqu ymm9,YMMWORD[((-8+288-128))+rsi]
+ vpaddq ymm7,ymm7,ymm12
+ vpmuludq ymm13,ymm13,ymm10
+ vpaddq ymm8,ymm8,ymm13
+ vpmuludq ymm9,ymm9,ymm10
+ vpbroadcastq ymm10,QWORD[16+r13]
+
+ mov rdx,rax
+ imul rax,QWORD[((-128))+rcx]
+ add r10,rax
+ vmovdqu ymm0,YMMWORD[((-8+32-128))+rcx]
+ mov rax,rdx
+ imul rax,QWORD[((8-128))+rcx]
+ add r11,rax
+ vmovdqu ymm12,YMMWORD[((-8+64-128))+rcx]
+ shr r10,29
+ imul rdx,QWORD[((16-128))+rcx]
+ add r12,rdx
+ add r11,r10
+
+ vpmuludq ymm0,ymm0,ymm11
+ vmovq rbx,xmm10
+ vmovdqu ymm13,YMMWORD[((-8+96-128))+rcx]
+ vpaddq ymm1,ymm1,ymm0
+ vpmuludq ymm12,ymm12,ymm11
+ vmovdqu ymm0,YMMWORD[((-8+128-128))+rcx]
+ vpaddq ymm2,ymm2,ymm12
+ vpmuludq ymm13,ymm13,ymm11
+ vmovdqu ymm12,YMMWORD[((-8+160-128))+rcx]
+ vpaddq ymm3,ymm3,ymm13
+ vpmuludq ymm0,ymm0,ymm11
+ vmovdqu ymm13,YMMWORD[((-8+192-128))+rcx]
+ vpaddq ymm4,ymm4,ymm0
+ vpmuludq ymm12,ymm12,ymm11
+ vmovdqu ymm0,YMMWORD[((-8+224-128))+rcx]
+ vpaddq ymm5,ymm5,ymm12
+ vpmuludq ymm13,ymm13,ymm11
+ vmovdqu ymm12,YMMWORD[((-8+256-128))+rcx]
+ vpaddq ymm6,ymm6,ymm13
+ vpmuludq ymm0,ymm0,ymm11
+ vmovdqu ymm13,YMMWORD[((-8+288-128))+rcx]
+ vpaddq ymm7,ymm7,ymm0
+ vpmuludq ymm12,ymm12,ymm11
+ vpaddq ymm8,ymm8,ymm12
+ vpmuludq ymm13,ymm13,ymm11
+ vpaddq ymm9,ymm9,ymm13
+
+ vmovdqu ymm0,YMMWORD[((-16+32-128))+rsi]
+ mov rax,rbx
+ imul rax,QWORD[((-128))+rsi]
+ add rax,r11
+
+ vmovdqu ymm12,YMMWORD[((-16+64-128))+rsi]
+ mov r11,rax
+ imul eax,r8d
+ and eax,0x1fffffff
+
+ imul rbx,QWORD[((8-128))+rsi]
+ add r12,rbx
+ vpmuludq ymm0,ymm0,ymm10
+ vmovd xmm11,eax
+ vmovdqu ymm13,YMMWORD[((-16+96-128))+rsi]
+ vpaddq ymm1,ymm1,ymm0
+ vpmuludq ymm12,ymm12,ymm10
+ vpbroadcastq ymm11,xmm11
+ vmovdqu ymm0,YMMWORD[((-16+128-128))+rsi]
+ vpaddq ymm2,ymm2,ymm12
+ vpmuludq ymm13,ymm13,ymm10
+ vmovdqu ymm12,YMMWORD[((-16+160-128))+rsi]
+ vpaddq ymm3,ymm3,ymm13
+ vpmuludq ymm0,ymm0,ymm10
+ vmovdqu ymm13,YMMWORD[((-16+192-128))+rsi]
+ vpaddq ymm4,ymm4,ymm0
+ vpmuludq ymm12,ymm12,ymm10
+ vmovdqu ymm0,YMMWORD[((-16+224-128))+rsi]
+ vpaddq ymm5,ymm5,ymm12
+ vpmuludq ymm13,ymm13,ymm10
+ vmovdqu ymm12,YMMWORD[((-16+256-128))+rsi]
+ vpaddq ymm6,ymm6,ymm13
+ vpmuludq ymm0,ymm0,ymm10
+ vmovdqu ymm13,YMMWORD[((-16+288-128))+rsi]
+ vpaddq ymm7,ymm7,ymm0
+ vpmuludq ymm12,ymm12,ymm10
+ vpaddq ymm8,ymm8,ymm12
+ vpmuludq ymm13,ymm13,ymm10
+ vpbroadcastq ymm10,QWORD[24+r13]
+ vpaddq ymm9,ymm9,ymm13
+
+ vmovdqu ymm0,YMMWORD[((-16+32-128))+rcx]
+ mov rdx,rax
+ imul rax,QWORD[((-128))+rcx]
+ add r11,rax
+ vmovdqu ymm12,YMMWORD[((-16+64-128))+rcx]
+ imul rdx,QWORD[((8-128))+rcx]
+ add r12,rdx
+ shr r11,29
+
+ vpmuludq ymm0,ymm0,ymm11
+ vmovq rbx,xmm10
+ vmovdqu ymm13,YMMWORD[((-16+96-128))+rcx]
+ vpaddq ymm1,ymm1,ymm0
+ vpmuludq ymm12,ymm12,ymm11
+ vmovdqu ymm0,YMMWORD[((-16+128-128))+rcx]
+ vpaddq ymm2,ymm2,ymm12
+ vpmuludq ymm13,ymm13,ymm11
+ vmovdqu ymm12,YMMWORD[((-16+160-128))+rcx]
+ vpaddq ymm3,ymm3,ymm13
+ vpmuludq ymm0,ymm0,ymm11
+ vmovdqu ymm13,YMMWORD[((-16+192-128))+rcx]
+ vpaddq ymm4,ymm4,ymm0
+ vpmuludq ymm12,ymm12,ymm11
+ vmovdqu ymm0,YMMWORD[((-16+224-128))+rcx]
+ vpaddq ymm5,ymm5,ymm12
+ vpmuludq ymm13,ymm13,ymm11
+ vmovdqu ymm12,YMMWORD[((-16+256-128))+rcx]
+ vpaddq ymm6,ymm6,ymm13
+ vpmuludq ymm0,ymm0,ymm11
+ vmovdqu ymm13,YMMWORD[((-16+288-128))+rcx]
+ vpaddq ymm7,ymm7,ymm0
+ vpmuludq ymm12,ymm12,ymm11
+ vmovdqu ymm0,YMMWORD[((-24+32-128))+rsi]
+ vpaddq ymm8,ymm8,ymm12
+ vpmuludq ymm13,ymm13,ymm11
+ vmovdqu ymm12,YMMWORD[((-24+64-128))+rsi]
+ vpaddq ymm9,ymm9,ymm13
+
+ add r12,r11
+ imul rbx,QWORD[((-128))+rsi]
+ add r12,rbx
+
+ mov rax,r12
+ imul eax,r8d
+ and eax,0x1fffffff
+
+ vpmuludq ymm0,ymm0,ymm10
+ vmovd xmm11,eax
+ vmovdqu ymm13,YMMWORD[((-24+96-128))+rsi]
+ vpaddq ymm1,ymm1,ymm0
+ vpmuludq ymm12,ymm12,ymm10
+ vpbroadcastq ymm11,xmm11
+ vmovdqu ymm0,YMMWORD[((-24+128-128))+rsi]
+ vpaddq ymm2,ymm2,ymm12
+ vpmuludq ymm13,ymm13,ymm10
+ vmovdqu ymm12,YMMWORD[((-24+160-128))+rsi]
+ vpaddq ymm3,ymm3,ymm13
+ vpmuludq ymm0,ymm0,ymm10
+ vmovdqu ymm13,YMMWORD[((-24+192-128))+rsi]
+ vpaddq ymm4,ymm4,ymm0
+ vpmuludq ymm12,ymm12,ymm10
+ vmovdqu ymm0,YMMWORD[((-24+224-128))+rsi]
+ vpaddq ymm5,ymm5,ymm12
+ vpmuludq ymm13,ymm13,ymm10
+ vmovdqu ymm12,YMMWORD[((-24+256-128))+rsi]
+ vpaddq ymm6,ymm6,ymm13
+ vpmuludq ymm0,ymm0,ymm10
+ vmovdqu ymm13,YMMWORD[((-24+288-128))+rsi]
+ vpaddq ymm7,ymm7,ymm0
+ vpmuludq ymm12,ymm12,ymm10
+ vpaddq ymm8,ymm8,ymm12
+ vpmuludq ymm13,ymm13,ymm10
+ vpbroadcastq ymm10,QWORD[32+r13]
+ vpaddq ymm9,ymm9,ymm13
+ add r13,32
+
+ vmovdqu ymm0,YMMWORD[((-24+32-128))+rcx]
+ imul rax,QWORD[((-128))+rcx]
+ add r12,rax
+ shr r12,29
+
+ vmovdqu ymm12,YMMWORD[((-24+64-128))+rcx]
+ vpmuludq ymm0,ymm0,ymm11
+ vmovq rbx,xmm10
+ vmovdqu ymm13,YMMWORD[((-24+96-128))+rcx]
+ vpaddq ymm0,ymm1,ymm0
+ vpmuludq ymm12,ymm12,ymm11
+ vmovdqu YMMWORD[rsp],ymm0
+ vpaddq ymm1,ymm2,ymm12
+ vmovdqu ymm0,YMMWORD[((-24+128-128))+rcx]
+ vpmuludq ymm13,ymm13,ymm11
+ vmovdqu ymm12,YMMWORD[((-24+160-128))+rcx]
+ vpaddq ymm2,ymm3,ymm13
+ vpmuludq ymm0,ymm0,ymm11
+ vmovdqu ymm13,YMMWORD[((-24+192-128))+rcx]
+ vpaddq ymm3,ymm4,ymm0
+ vpmuludq ymm12,ymm12,ymm11
+ vmovdqu ymm0,YMMWORD[((-24+224-128))+rcx]
+ vpaddq ymm4,ymm5,ymm12
+ vpmuludq ymm13,ymm13,ymm11
+ vmovdqu ymm12,YMMWORD[((-24+256-128))+rcx]
+ vpaddq ymm5,ymm6,ymm13
+ vpmuludq ymm0,ymm0,ymm11
+ vmovdqu ymm13,YMMWORD[((-24+288-128))+rcx]
+ mov r9,r12
+ vpaddq ymm6,ymm7,ymm0
+ vpmuludq ymm12,ymm12,ymm11
+ add r9,QWORD[rsp]
+ vpaddq ymm7,ymm8,ymm12
+ vpmuludq ymm13,ymm13,ymm11
+ vmovq xmm12,r12
+ vpaddq ymm8,ymm9,ymm13
+
+ dec r14d
+ jnz NEAR $L$oop_mul_1024
+ vpaddq ymm0,ymm12,YMMWORD[rsp]
+
+ vpsrlq ymm12,ymm0,29
+ vpand ymm0,ymm0,ymm15
+ vpsrlq ymm13,ymm1,29
+ vpand ymm1,ymm1,ymm15
+ vpsrlq ymm10,ymm2,29
+ vpermq ymm12,ymm12,0x93
+ vpand ymm2,ymm2,ymm15
+ vpsrlq ymm11,ymm3,29
+ vpermq ymm13,ymm13,0x93
+ vpand ymm3,ymm3,ymm15
+
+ vpblendd ymm9,ymm12,ymm14,3
+ vpermq ymm10,ymm10,0x93
+ vpblendd ymm12,ymm13,ymm12,3
+ vpermq ymm11,ymm11,0x93
+ vpaddq ymm0,ymm0,ymm9
+ vpblendd ymm13,ymm10,ymm13,3
+ vpaddq ymm1,ymm1,ymm12
+ vpblendd ymm10,ymm11,ymm10,3
+ vpaddq ymm2,ymm2,ymm13
+ vpblendd ymm11,ymm14,ymm11,3
+ vpaddq ymm3,ymm3,ymm10
+ vpaddq ymm4,ymm4,ymm11
+
+ vpsrlq ymm12,ymm0,29
+ vpand ymm0,ymm0,ymm15
+ vpsrlq ymm13,ymm1,29
+ vpand ymm1,ymm1,ymm15
+ vpsrlq ymm10,ymm2,29
+ vpermq ymm12,ymm12,0x93
+ vpand ymm2,ymm2,ymm15
+ vpsrlq ymm11,ymm3,29
+ vpermq ymm13,ymm13,0x93
+ vpand ymm3,ymm3,ymm15
+ vpermq ymm10,ymm10,0x93
+
+ vpblendd ymm9,ymm12,ymm14,3
+ vpermq ymm11,ymm11,0x93
+ vpblendd ymm12,ymm13,ymm12,3
+ vpaddq ymm0,ymm0,ymm9
+ vpblendd ymm13,ymm10,ymm13,3
+ vpaddq ymm1,ymm1,ymm12
+ vpblendd ymm10,ymm11,ymm10,3
+ vpaddq ymm2,ymm2,ymm13
+ vpblendd ymm11,ymm14,ymm11,3
+ vpaddq ymm3,ymm3,ymm10
+ vpaddq ymm4,ymm4,ymm11
+
+ vmovdqu YMMWORD[(0-128)+rdi],ymm0
+ vmovdqu YMMWORD[(32-128)+rdi],ymm1
+ vmovdqu YMMWORD[(64-128)+rdi],ymm2
+ vmovdqu YMMWORD[(96-128)+rdi],ymm3
+ vpsrlq ymm12,ymm4,29
+ vpand ymm4,ymm4,ymm15
+ vpsrlq ymm13,ymm5,29
+ vpand ymm5,ymm5,ymm15
+ vpsrlq ymm10,ymm6,29
+ vpermq ymm12,ymm12,0x93
+ vpand ymm6,ymm6,ymm15
+ vpsrlq ymm11,ymm7,29
+ vpermq ymm13,ymm13,0x93
+ vpand ymm7,ymm7,ymm15
+ vpsrlq ymm0,ymm8,29
+ vpermq ymm10,ymm10,0x93
+ vpand ymm8,ymm8,ymm15
+ vpermq ymm11,ymm11,0x93
+
+ vpblendd ymm9,ymm12,ymm14,3
+ vpermq ymm0,ymm0,0x93
+ vpblendd ymm12,ymm13,ymm12,3
+ vpaddq ymm4,ymm4,ymm9
+ vpblendd ymm13,ymm10,ymm13,3
+ vpaddq ymm5,ymm5,ymm12
+ vpblendd ymm10,ymm11,ymm10,3
+ vpaddq ymm6,ymm6,ymm13
+ vpblendd ymm11,ymm0,ymm11,3
+ vpaddq ymm7,ymm7,ymm10
+ vpaddq ymm8,ymm8,ymm11
+
+ vpsrlq ymm12,ymm4,29
+ vpand ymm4,ymm4,ymm15
+ vpsrlq ymm13,ymm5,29
+ vpand ymm5,ymm5,ymm15
+ vpsrlq ymm10,ymm6,29
+ vpermq ymm12,ymm12,0x93
+ vpand ymm6,ymm6,ymm15
+ vpsrlq ymm11,ymm7,29
+ vpermq ymm13,ymm13,0x93
+ vpand ymm7,ymm7,ymm15
+ vpsrlq ymm0,ymm8,29
+ vpermq ymm10,ymm10,0x93
+ vpand ymm8,ymm8,ymm15
+ vpermq ymm11,ymm11,0x93
+
+ vpblendd ymm9,ymm12,ymm14,3
+ vpermq ymm0,ymm0,0x93
+ vpblendd ymm12,ymm13,ymm12,3
+ vpaddq ymm4,ymm4,ymm9
+ vpblendd ymm13,ymm10,ymm13,3
+ vpaddq ymm5,ymm5,ymm12
+ vpblendd ymm10,ymm11,ymm10,3
+ vpaddq ymm6,ymm6,ymm13
+ vpblendd ymm11,ymm0,ymm11,3
+ vpaddq ymm7,ymm7,ymm10
+ vpaddq ymm8,ymm8,ymm11
+
+ vmovdqu YMMWORD[(128-128)+rdi],ymm4
+ vmovdqu YMMWORD[(160-128)+rdi],ymm5
+ vmovdqu YMMWORD[(192-128)+rdi],ymm6
+ vmovdqu YMMWORD[(224-128)+rdi],ymm7
+ vmovdqu YMMWORD[(256-128)+rdi],ymm8
+ vzeroupper
+
+ mov rax,rbp
+
+$L$mul_1024_in_tail:
+ movaps xmm6,XMMWORD[((-216))+rax]
+ movaps xmm7,XMMWORD[((-200))+rax]
+ movaps xmm8,XMMWORD[((-184))+rax]
+ movaps xmm9,XMMWORD[((-168))+rax]
+ movaps xmm10,XMMWORD[((-152))+rax]
+ movaps xmm11,XMMWORD[((-136))+rax]
+ movaps xmm12,XMMWORD[((-120))+rax]
+ movaps xmm13,XMMWORD[((-104))+rax]
+ movaps xmm14,XMMWORD[((-88))+rax]
+ movaps xmm15,XMMWORD[((-72))+rax]
+ mov r15,QWORD[((-48))+rax]
+
+ mov r14,QWORD[((-40))+rax]
+
+ mov r13,QWORD[((-32))+rax]
+
+ mov r12,QWORD[((-24))+rax]
+
+ mov rbp,QWORD[((-16))+rax]
+
+ mov rbx,QWORD[((-8))+rax]
+
+ lea rsp,[rax]
+
+$L$mul_1024_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_rsaz_1024_mul_avx2:
+global rsaz_1024_red2norm_avx2
+
+ALIGN 32
+rsaz_1024_red2norm_avx2:
+
+_CET_ENDBR
+ sub rdx,-128
+ xor rax,rax
+ mov r8,QWORD[((-128))+rdx]
+ mov r9,QWORD[((-120))+rdx]
+ mov r10,QWORD[((-112))+rdx]
+ shl r8,0
+ shl r9,29
+ mov r11,r10
+ shl r10,58
+ shr r11,6
+ add rax,r8
+ add rax,r9
+ add rax,r10
+ adc r11,0
+ mov QWORD[rcx],rax
+ mov rax,r11
+ mov r8,QWORD[((-104))+rdx]
+ mov r9,QWORD[((-96))+rdx]
+ shl r8,23
+ mov r10,r9
+ shl r9,52
+ shr r10,12
+ add rax,r8
+ add rax,r9
+ adc r10,0
+ mov QWORD[8+rcx],rax
+ mov rax,r10
+ mov r11,QWORD[((-88))+rdx]
+ mov r8,QWORD[((-80))+rdx]
+ shl r11,17
+ mov r9,r8
+ shl r8,46
+ shr r9,18
+ add rax,r11
+ add rax,r8
+ adc r9,0
+ mov QWORD[16+rcx],rax
+ mov rax,r9
+ mov r10,QWORD[((-72))+rdx]
+ mov r11,QWORD[((-64))+rdx]
+ shl r10,11
+ mov r8,r11
+ shl r11,40
+ shr r8,24
+ add rax,r10
+ add rax,r11
+ adc r8,0
+ mov QWORD[24+rcx],rax
+ mov rax,r8
+ mov r9,QWORD[((-56))+rdx]
+ mov r10,QWORD[((-48))+rdx]
+ mov r11,QWORD[((-40))+rdx]
+ shl r9,5
+ shl r10,34
+ mov r8,r11
+ shl r11,63
+ shr r8,1
+ add rax,r9
+ add rax,r10
+ add rax,r11
+ adc r8,0
+ mov QWORD[32+rcx],rax
+ mov rax,r8
+ mov r9,QWORD[((-32))+rdx]
+ mov r10,QWORD[((-24))+rdx]
+ shl r9,28
+ mov r11,r10
+ shl r10,57
+ shr r11,7
+ add rax,r9
+ add rax,r10
+ adc r11,0
+ mov QWORD[40+rcx],rax
+ mov rax,r11
+ mov r8,QWORD[((-16))+rdx]
+ mov r9,QWORD[((-8))+rdx]
+ shl r8,22
+ mov r10,r9
+ shl r9,51
+ shr r10,13
+ add rax,r8
+ add rax,r9
+ adc r10,0
+ mov QWORD[48+rcx],rax
+ mov rax,r10
+ mov r11,QWORD[rdx]
+ mov r8,QWORD[8+rdx]
+ shl r11,16
+ mov r9,r8
+ shl r8,45
+ shr r9,19
+ add rax,r11
+ add rax,r8
+ adc r9,0
+ mov QWORD[56+rcx],rax
+ mov rax,r9
+ mov r10,QWORD[16+rdx]
+ mov r11,QWORD[24+rdx]
+ shl r10,10
+ mov r8,r11
+ shl r11,39
+ shr r8,25
+ add rax,r10
+ add rax,r11
+ adc r8,0
+ mov QWORD[64+rcx],rax
+ mov rax,r8
+ mov r9,QWORD[32+rdx]
+ mov r10,QWORD[40+rdx]
+ mov r11,QWORD[48+rdx]
+ shl r9,4
+ shl r10,33
+ mov r8,r11
+ shl r11,62
+ shr r8,2
+ add rax,r9
+ add rax,r10
+ add rax,r11
+ adc r8,0
+ mov QWORD[72+rcx],rax
+ mov rax,r8
+ mov r9,QWORD[56+rdx]
+ mov r10,QWORD[64+rdx]
+ shl r9,27
+ mov r11,r10
+ shl r10,56
+ shr r11,8
+ add rax,r9
+ add rax,r10
+ adc r11,0
+ mov QWORD[80+rcx],rax
+ mov rax,r11
+ mov r8,QWORD[72+rdx]
+ mov r9,QWORD[80+rdx]
+ shl r8,21
+ mov r10,r9
+ shl r9,50
+ shr r10,14
+ add rax,r8
+ add rax,r9
+ adc r10,0
+ mov QWORD[88+rcx],rax
+ mov rax,r10
+ mov r11,QWORD[88+rdx]
+ mov r8,QWORD[96+rdx]
+ shl r11,15
+ mov r9,r8
+ shl r8,44
+ shr r9,20
+ add rax,r11
+ add rax,r8
+ adc r9,0
+ mov QWORD[96+rcx],rax
+ mov rax,r9
+ mov r10,QWORD[104+rdx]
+ mov r11,QWORD[112+rdx]
+ shl r10,9
+ mov r8,r11
+ shl r11,38
+ shr r8,26
+ add rax,r10
+ add rax,r11
+ adc r8,0
+ mov QWORD[104+rcx],rax
+ mov rax,r8
+ mov r9,QWORD[120+rdx]
+ mov r10,QWORD[128+rdx]
+ mov r11,QWORD[136+rdx]
+ shl r9,3
+ shl r10,32
+ mov r8,r11
+ shl r11,61
+ shr r8,3
+ add rax,r9
+ add rax,r10
+ add rax,r11
+ adc r8,0
+ mov QWORD[112+rcx],rax
+ mov rax,r8
+ mov r9,QWORD[144+rdx]
+ mov r10,QWORD[152+rdx]
+ shl r9,26
+ mov r11,r10
+ shl r10,55
+ shr r11,9
+ add rax,r9
+ add rax,r10
+ adc r11,0
+ mov QWORD[120+rcx],rax
+ mov rax,r11
+ ret
+
+
+
+global rsaz_1024_norm2red_avx2
+
+ALIGN 32
+rsaz_1024_norm2red_avx2:
+
+_CET_ENDBR
+ sub rcx,-128
+ mov r8,QWORD[rdx]
+ mov eax,0x1fffffff
+ mov r9,QWORD[8+rdx]
+ mov r11,r8
+ shr r11,0
+ and r11,rax
+ mov QWORD[((-128))+rcx],r11
+ mov r10,r8
+ shr r10,29
+ and r10,rax
+ mov QWORD[((-120))+rcx],r10
+ shrd r8,r9,58
+ and r8,rax
+ mov QWORD[((-112))+rcx],r8
+ mov r10,QWORD[16+rdx]
+ mov r8,r9
+ shr r8,23
+ and r8,rax
+ mov QWORD[((-104))+rcx],r8
+ shrd r9,r10,52
+ and r9,rax
+ mov QWORD[((-96))+rcx],r9
+ mov r11,QWORD[24+rdx]
+ mov r9,r10
+ shr r9,17
+ and r9,rax
+ mov QWORD[((-88))+rcx],r9
+ shrd r10,r11,46
+ and r10,rax
+ mov QWORD[((-80))+rcx],r10
+ mov r8,QWORD[32+rdx]
+ mov r10,r11
+ shr r10,11
+ and r10,rax
+ mov QWORD[((-72))+rcx],r10
+ shrd r11,r8,40
+ and r11,rax
+ mov QWORD[((-64))+rcx],r11
+ mov r9,QWORD[40+rdx]
+ mov r11,r8
+ shr r11,5
+ and r11,rax
+ mov QWORD[((-56))+rcx],r11
+ mov r10,r8
+ shr r10,34
+ and r10,rax
+ mov QWORD[((-48))+rcx],r10
+ shrd r8,r9,63
+ and r8,rax
+ mov QWORD[((-40))+rcx],r8
+ mov r10,QWORD[48+rdx]
+ mov r8,r9
+ shr r8,28
+ and r8,rax
+ mov QWORD[((-32))+rcx],r8
+ shrd r9,r10,57
+ and r9,rax
+ mov QWORD[((-24))+rcx],r9
+ mov r11,QWORD[56+rdx]
+ mov r9,r10
+ shr r9,22
+ and r9,rax
+ mov QWORD[((-16))+rcx],r9
+ shrd r10,r11,51
+ and r10,rax
+ mov QWORD[((-8))+rcx],r10
+ mov r8,QWORD[64+rdx]
+ mov r10,r11
+ shr r10,16
+ and r10,rax
+ mov QWORD[rcx],r10
+ shrd r11,r8,45
+ and r11,rax
+ mov QWORD[8+rcx],r11
+ mov r9,QWORD[72+rdx]
+ mov r11,r8
+ shr r11,10
+ and r11,rax
+ mov QWORD[16+rcx],r11
+ shrd r8,r9,39
+ and r8,rax
+ mov QWORD[24+rcx],r8
+ mov r10,QWORD[80+rdx]
+ mov r8,r9
+ shr r8,4
+ and r8,rax
+ mov QWORD[32+rcx],r8
+ mov r11,r9
+ shr r11,33
+ and r11,rax
+ mov QWORD[40+rcx],r11
+ shrd r9,r10,62
+ and r9,rax
+ mov QWORD[48+rcx],r9
+ mov r11,QWORD[88+rdx]
+ mov r9,r10
+ shr r9,27
+ and r9,rax
+ mov QWORD[56+rcx],r9
+ shrd r10,r11,56
+ and r10,rax
+ mov QWORD[64+rcx],r10
+ mov r8,QWORD[96+rdx]
+ mov r10,r11
+ shr r10,21
+ and r10,rax
+ mov QWORD[72+rcx],r10
+ shrd r11,r8,50
+ and r11,rax
+ mov QWORD[80+rcx],r11
+ mov r9,QWORD[104+rdx]
+ mov r11,r8
+ shr r11,15
+ and r11,rax
+ mov QWORD[88+rcx],r11
+ shrd r8,r9,44
+ and r8,rax
+ mov QWORD[96+rcx],r8
+ mov r10,QWORD[112+rdx]
+ mov r8,r9
+ shr r8,9
+ and r8,rax
+ mov QWORD[104+rcx],r8
+ shrd r9,r10,38
+ and r9,rax
+ mov QWORD[112+rcx],r9
+ mov r11,QWORD[120+rdx]
+ mov r9,r10
+ shr r9,3
+ and r9,rax
+ mov QWORD[120+rcx],r9
+ mov r8,r10
+ shr r8,32
+ and r8,rax
+ mov QWORD[128+rcx],r8
+ shrd r10,r11,61
+ and r10,rax
+ mov QWORD[136+rcx],r10
+ xor r8,r8
+ mov r10,r11
+ shr r10,26
+ and r10,rax
+ mov QWORD[144+rcx],r10
+ shrd r11,r8,55
+ and r11,rax
+ mov QWORD[152+rcx],r11
+ mov QWORD[160+rcx],r8
+ mov QWORD[168+rcx],r8
+ mov QWORD[176+rcx],r8
+ mov QWORD[184+rcx],r8
+ ret
+
+
+global rsaz_1024_scatter5_avx2
+
+ALIGN 32
+rsaz_1024_scatter5_avx2:
+
+_CET_ENDBR
+ vzeroupper
+ vmovdqu ymm5,YMMWORD[$L$scatter_permd]
+ shl r8d,4
+ lea rcx,[r8*1+rcx]
+ mov eax,9
+ jmp NEAR $L$oop_scatter_1024
+
+ALIGN 32
+$L$oop_scatter_1024:
+ vmovdqu ymm0,YMMWORD[rdx]
+ lea rdx,[32+rdx]
+ vpermd ymm0,ymm5,ymm0
+ vmovdqu XMMWORD[rcx],xmm0
+ lea rcx,[512+rcx]
+ dec eax
+ jnz NEAR $L$oop_scatter_1024
+
+ vzeroupper
+ ret
+
+
+
+global rsaz_1024_gather5_avx2
+
+ALIGN 32
+rsaz_1024_gather5_avx2:
+
+_CET_ENDBR
+ vzeroupper
+ mov r11,rsp
+
+ lea rax,[((-136))+rsp]
+$L$SEH_begin_rsaz_1024_gather5:
+
+ DB 0x48,0x8d,0x60,0xe0
+ DB 0xc5,0xf8,0x29,0x70,0xe0
+ DB 0xc5,0xf8,0x29,0x78,0xf0
+ DB 0xc5,0x78,0x29,0x40,0x00
+ DB 0xc5,0x78,0x29,0x48,0x10
+ DB 0xc5,0x78,0x29,0x50,0x20
+ DB 0xc5,0x78,0x29,0x58,0x30
+ DB 0xc5,0x78,0x29,0x60,0x40
+ DB 0xc5,0x78,0x29,0x68,0x50
+ DB 0xc5,0x78,0x29,0x70,0x60
+ DB 0xc5,0x78,0x29,0x78,0x70
+ lea rsp,[((-256))+rsp]
+ and rsp,-32
+ lea r10,[$L$inc]
+ lea rax,[((-128))+rsp]
+
+ vmovd xmm4,r8d
+ vmovdqa ymm0,YMMWORD[r10]
+ vmovdqa ymm1,YMMWORD[32+r10]
+ vmovdqa ymm5,YMMWORD[64+r10]
+ vpbroadcastd ymm4,xmm4
+
+ vpaddd ymm2,ymm0,ymm5
+ vpcmpeqd ymm0,ymm0,ymm4
+ vpaddd ymm3,ymm1,ymm5
+ vpcmpeqd ymm1,ymm1,ymm4
+ vmovdqa YMMWORD[(0+128)+rax],ymm0
+ vpaddd ymm0,ymm2,ymm5
+ vpcmpeqd ymm2,ymm2,ymm4
+ vmovdqa YMMWORD[(32+128)+rax],ymm1
+ vpaddd ymm1,ymm3,ymm5
+ vpcmpeqd ymm3,ymm3,ymm4
+ vmovdqa YMMWORD[(64+128)+rax],ymm2
+ vpaddd ymm2,ymm0,ymm5
+ vpcmpeqd ymm0,ymm0,ymm4
+ vmovdqa YMMWORD[(96+128)+rax],ymm3
+ vpaddd ymm3,ymm1,ymm5
+ vpcmpeqd ymm1,ymm1,ymm4
+ vmovdqa YMMWORD[(128+128)+rax],ymm0
+ vpaddd ymm8,ymm2,ymm5
+ vpcmpeqd ymm2,ymm2,ymm4
+ vmovdqa YMMWORD[(160+128)+rax],ymm1
+ vpaddd ymm9,ymm3,ymm5
+ vpcmpeqd ymm3,ymm3,ymm4
+ vmovdqa YMMWORD[(192+128)+rax],ymm2
+ vpaddd ymm10,ymm8,ymm5
+ vpcmpeqd ymm8,ymm8,ymm4
+ vmovdqa YMMWORD[(224+128)+rax],ymm3
+ vpaddd ymm11,ymm9,ymm5
+ vpcmpeqd ymm9,ymm9,ymm4
+ vpaddd ymm12,ymm10,ymm5
+ vpcmpeqd ymm10,ymm10,ymm4
+ vpaddd ymm13,ymm11,ymm5
+ vpcmpeqd ymm11,ymm11,ymm4
+ vpaddd ymm14,ymm12,ymm5
+ vpcmpeqd ymm12,ymm12,ymm4
+ vpaddd ymm15,ymm13,ymm5
+ vpcmpeqd ymm13,ymm13,ymm4
+ vpcmpeqd ymm14,ymm14,ymm4
+ vpcmpeqd ymm15,ymm15,ymm4
+
+ vmovdqa ymm7,YMMWORD[((-32))+r10]
+ lea rdx,[128+rdx]
+ mov r8d,9
+
+$L$oop_gather_1024:
+ vmovdqa ymm0,YMMWORD[((0-128))+rdx]
+ vmovdqa ymm1,YMMWORD[((32-128))+rdx]
+ vmovdqa ymm2,YMMWORD[((64-128))+rdx]
+ vmovdqa ymm3,YMMWORD[((96-128))+rdx]
+ vpand ymm0,ymm0,YMMWORD[((0+128))+rax]
+ vpand ymm1,ymm1,YMMWORD[((32+128))+rax]
+ vpand ymm2,ymm2,YMMWORD[((64+128))+rax]
+ vpor ymm4,ymm1,ymm0
+ vpand ymm3,ymm3,YMMWORD[((96+128))+rax]
+ vmovdqa ymm0,YMMWORD[((128-128))+rdx]
+ vmovdqa ymm1,YMMWORD[((160-128))+rdx]
+ vpor ymm5,ymm3,ymm2
+ vmovdqa ymm2,YMMWORD[((192-128))+rdx]
+ vmovdqa ymm3,YMMWORD[((224-128))+rdx]
+ vpand ymm0,ymm0,YMMWORD[((128+128))+rax]
+ vpand ymm1,ymm1,YMMWORD[((160+128))+rax]
+ vpand ymm2,ymm2,YMMWORD[((192+128))+rax]
+ vpor ymm4,ymm4,ymm0
+ vpand ymm3,ymm3,YMMWORD[((224+128))+rax]
+ vpand ymm0,ymm8,YMMWORD[((256-128))+rdx]
+ vpor ymm5,ymm5,ymm1
+ vpand ymm1,ymm9,YMMWORD[((288-128))+rdx]
+ vpor ymm4,ymm4,ymm2
+ vpand ymm2,ymm10,YMMWORD[((320-128))+rdx]
+ vpor ymm5,ymm5,ymm3
+ vpand ymm3,ymm11,YMMWORD[((352-128))+rdx]
+ vpor ymm4,ymm4,ymm0
+ vpand ymm0,ymm12,YMMWORD[((384-128))+rdx]
+ vpor ymm5,ymm5,ymm1
+ vpand ymm1,ymm13,YMMWORD[((416-128))+rdx]
+ vpor ymm4,ymm4,ymm2
+ vpand ymm2,ymm14,YMMWORD[((448-128))+rdx]
+ vpor ymm5,ymm5,ymm3
+ vpand ymm3,ymm15,YMMWORD[((480-128))+rdx]
+ lea rdx,[512+rdx]
+ vpor ymm4,ymm4,ymm0
+ vpor ymm5,ymm5,ymm1
+ vpor ymm4,ymm4,ymm2
+ vpor ymm5,ymm5,ymm3
+
+ vpor ymm4,ymm4,ymm5
+ vextracti128 xmm5,ymm4,1
+ vpor xmm5,xmm5,xmm4
+ vpermd ymm5,ymm7,ymm5
+ vmovdqu YMMWORD[rcx],ymm5
+ lea rcx,[32+rcx]
+ dec r8d
+ jnz NEAR $L$oop_gather_1024
+
+ vpxor ymm0,ymm0,ymm0
+ vmovdqu YMMWORD[rcx],ymm0
+ vzeroupper
+ movaps xmm6,XMMWORD[((-168))+r11]
+ movaps xmm7,XMMWORD[((-152))+r11]
+ movaps xmm8,XMMWORD[((-136))+r11]
+ movaps xmm9,XMMWORD[((-120))+r11]
+ movaps xmm10,XMMWORD[((-104))+r11]
+ movaps xmm11,XMMWORD[((-88))+r11]
+ movaps xmm12,XMMWORD[((-72))+r11]
+ movaps xmm13,XMMWORD[((-56))+r11]
+ movaps xmm14,XMMWORD[((-40))+r11]
+ movaps xmm15,XMMWORD[((-24))+r11]
+ lea rsp,[r11]
+
+ ret
+
+$L$SEH_end_rsaz_1024_gather5:
+
+section .rdata rdata align=8
+ALIGN 64
+$L$and_mask:
+ DQ 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
+$L$scatter_permd:
+ DD 0,2,4,6,7,7,7,7
+$L$gather_permd:
+ DD 0,7,1,7,2,7,3,7
+$L$inc:
+ DD 0,0,0,0,1,1,1,1
+ DD 2,2,2,2,3,3,3,3
+ DD 4,4,4,4,4,4,4,4
+ALIGN 64
+section .text
+
+EXTERN __imp_RtlVirtualUnwind
+
+ALIGN 16
+rsaz_se_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ mov rsi,QWORD[8+r9]
+ mov r11,QWORD[56+r9]
+
+ mov r10d,DWORD[r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$common_seh_tail
+
+ mov r10d,DWORD[4+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$common_seh_tail
+
+ mov rbp,QWORD[160+r8]
+
+ mov r10d,DWORD[8+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ cmovc rax,rbp
+
+ mov r15,QWORD[((-48))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov rbp,QWORD[((-16))+rax]
+ mov rbx,QWORD[((-8))+rax]
+ mov QWORD[240+r8],r15
+ mov QWORD[232+r8],r14
+ mov QWORD[224+r8],r13
+ mov QWORD[216+r8],r12
+ mov QWORD[160+r8],rbp
+ mov QWORD[144+r8],rbx
+
+ lea rsi,[((-216))+rax]
+ lea rdi,[512+r8]
+ mov ecx,20
+ DD 0xa548f3fc
+
+$L$common_seh_tail:
+ mov rdi,QWORD[8+rax]
+ mov rsi,QWORD[16+rax]
+ mov QWORD[152+r8],rax
+ mov QWORD[168+r8],rsi
+ mov QWORD[176+r8],rdi
+
+ mov rdi,QWORD[40+r9]
+ mov rsi,r8
+ mov ecx,154
+ DD 0xa548f3fc
+
+ mov rsi,r9
+ xor rcx,rcx
+ mov rdx,QWORD[8+rsi]
+ mov r8,QWORD[rsi]
+ mov r9,QWORD[16+rsi]
+ mov r10,QWORD[40+rsi]
+ lea r11,[56+rsi]
+ lea r12,[24+rsi]
+ mov QWORD[32+rsp],r10
+ mov QWORD[40+rsp],r11
+ mov QWORD[48+rsp],r12
+ mov QWORD[56+rsp],rcx
+ call QWORD[__imp_RtlVirtualUnwind]
+
+ mov eax,1
+ add rsp,64
+ popfq
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ ret
+
+
+section .pdata rdata align=4
+ALIGN 4
+ DD $L$SEH_begin_rsaz_1024_sqr_avx2 wrt ..imagebase
+ DD $L$SEH_end_rsaz_1024_sqr_avx2 wrt ..imagebase
+ DD $L$SEH_info_rsaz_1024_sqr_avx2 wrt ..imagebase
+
+ DD $L$SEH_begin_rsaz_1024_mul_avx2 wrt ..imagebase
+ DD $L$SEH_end_rsaz_1024_mul_avx2 wrt ..imagebase
+ DD $L$SEH_info_rsaz_1024_mul_avx2 wrt ..imagebase
+
+ DD $L$SEH_begin_rsaz_1024_gather5 wrt ..imagebase
+ DD $L$SEH_end_rsaz_1024_gather5 wrt ..imagebase
+ DD $L$SEH_info_rsaz_1024_gather5 wrt ..imagebase
+section .xdata rdata align=8
+ALIGN 8
+$L$SEH_info_rsaz_1024_sqr_avx2:
+ DB 9,0,0,0
+ DD rsaz_se_handler wrt ..imagebase
+ DD $L$sqr_1024_body wrt ..imagebase,$L$sqr_1024_epilogue wrt ..imagebase,$L$sqr_1024_in_tail wrt ..imagebase
+ DD 0
+$L$SEH_info_rsaz_1024_mul_avx2:
+ DB 9,0,0,0
+ DD rsaz_se_handler wrt ..imagebase
+ DD $L$mul_1024_body wrt ..imagebase,$L$mul_1024_epilogue wrt ..imagebase,$L$mul_1024_in_tail wrt ..imagebase
+ DD 0
+$L$SEH_info_rsaz_1024_gather5:
+ DB 0x01,0x36,0x17,0x0b
+ DB 0x36,0xf8,0x09,0x00
+ DB 0x31,0xe8,0x08,0x00
+ DB 0x2c,0xd8,0x07,0x00
+ DB 0x27,0xc8,0x06,0x00
+ DB 0x22,0xb8,0x05,0x00
+ DB 0x1d,0xa8,0x04,0x00
+ DB 0x18,0x98,0x03,0x00
+ DB 0x13,0x88,0x02,0x00
+ DB 0x0e,0x78,0x01,0x00
+ DB 0x09,0x68,0x00,0x00
+ DB 0x04,0x01,0x15,0x00
+ DB 0x00,0xb3,0x00,0x00
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/sha1-586-apple.S b/gen/bcm/sha1-586-apple.S
new file mode 100644
index 0000000..f0ab02b
--- /dev/null
+++ b/gen/bcm/sha1-586-apple.S
@@ -0,0 +1,3782 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl _sha1_block_data_order_nohw
+.private_extern _sha1_block_data_order_nohw
+.align 4
+_sha1_block_data_order_nohw:
+L_sha1_block_data_order_nohw_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%ebp
+ movl 24(%esp),%esi
+ movl 28(%esp),%eax
+ subl $76,%esp
+ shll $6,%eax
+ addl %esi,%eax
+ movl %eax,104(%esp)
+ movl 16(%ebp),%edi
+ jmp L000loop
+.align 4,0x90
+L000loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ movl %ecx,8(%esp)
+ movl %edx,12(%esp)
+ movl 16(%esi),%eax
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ movl %ecx,24(%esp)
+ movl %edx,28(%esp)
+ movl 32(%esi),%eax
+ movl 36(%esi),%ebx
+ movl 40(%esi),%ecx
+ movl 44(%esi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,40(%esp)
+ movl %edx,44(%esp)
+ movl 48(%esi),%eax
+ movl 52(%esi),%ebx
+ movl 56(%esi),%ecx
+ movl 60(%esi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ movl %eax,48(%esp)
+ movl %ebx,52(%esp)
+ movl %ecx,56(%esp)
+ movl %edx,60(%esp)
+ movl %esi,100(%esp)
+ movl (%ebp),%eax
+ movl 4(%ebp),%ebx
+ movl 8(%ebp),%ecx
+ movl 12(%ebp),%edx
+ # 00_15 0
+ movl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%ebp
+ xorl %edx,%esi
+ addl %edi,%ebp
+ movl (%esp),%edi
+ andl %ebx,%esi
+ rorl $2,%ebx
+ xorl %edx,%esi
+ leal 1518500249(%ebp,%edi,1),%ebp
+ addl %esi,%ebp
+ # 00_15 1
+ movl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ecx,%edi
+ addl %edx,%ebp
+ movl 4(%esp),%edx
+ andl %eax,%edi
+ rorl $2,%eax
+ xorl %ecx,%edi
+ leal 1518500249(%ebp,%edx,1),%ebp
+ addl %edi,%ebp
+ # 00_15 2
+ movl %eax,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%edx
+ addl %ecx,%ebp
+ movl 8(%esp),%ecx
+ andl %esi,%edx
+ rorl $2,%esi
+ xorl %ebx,%edx
+ leal 1518500249(%ebp,%ecx,1),%ebp
+ addl %edx,%ebp
+ # 00_15 3
+ movl %esi,%ecx
+ movl %ebp,%edx
+ roll $5,%ebp
+ xorl %eax,%ecx
+ addl %ebx,%ebp
+ movl 12(%esp),%ebx
+ andl %edi,%ecx
+ rorl $2,%edi
+ xorl %eax,%ecx
+ leal 1518500249(%ebp,%ebx,1),%ebp
+ addl %ecx,%ebp
+ # 00_15 4
+ movl %edi,%ebx
+ movl %ebp,%ecx
+ roll $5,%ebp
+ xorl %esi,%ebx
+ addl %eax,%ebp
+ movl 16(%esp),%eax
+ andl %edx,%ebx
+ rorl $2,%edx
+ xorl %esi,%ebx
+ leal 1518500249(%ebp,%eax,1),%ebp
+ addl %ebx,%ebp
+ # 00_15 5
+ movl %edx,%eax
+ movl %ebp,%ebx
+ roll $5,%ebp
+ xorl %edi,%eax
+ addl %esi,%ebp
+ movl 20(%esp),%esi
+ andl %ecx,%eax
+ rorl $2,%ecx
+ xorl %edi,%eax
+ leal 1518500249(%ebp,%esi,1),%ebp
+ addl %eax,%ebp
+ # 00_15 6
+ movl %ecx,%esi
+ movl %ebp,%eax
+ roll $5,%ebp
+ xorl %edx,%esi
+ addl %edi,%ebp
+ movl 24(%esp),%edi
+ andl %ebx,%esi
+ rorl $2,%ebx
+ xorl %edx,%esi
+ leal 1518500249(%ebp,%edi,1),%ebp
+ addl %esi,%ebp
+ # 00_15 7
+ movl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ecx,%edi
+ addl %edx,%ebp
+ movl 28(%esp),%edx
+ andl %eax,%edi
+ rorl $2,%eax
+ xorl %ecx,%edi
+ leal 1518500249(%ebp,%edx,1),%ebp
+ addl %edi,%ebp
+ # 00_15 8
+ movl %eax,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%edx
+ addl %ecx,%ebp
+ movl 32(%esp),%ecx
+ andl %esi,%edx
+ rorl $2,%esi
+ xorl %ebx,%edx
+ leal 1518500249(%ebp,%ecx,1),%ebp
+ addl %edx,%ebp
+ # 00_15 9
+ movl %esi,%ecx
+ movl %ebp,%edx
+ roll $5,%ebp
+ xorl %eax,%ecx
+ addl %ebx,%ebp
+ movl 36(%esp),%ebx
+ andl %edi,%ecx
+ rorl $2,%edi
+ xorl %eax,%ecx
+ leal 1518500249(%ebp,%ebx,1),%ebp
+ addl %ecx,%ebp
+ # 00_15 10
+ movl %edi,%ebx
+ movl %ebp,%ecx
+ roll $5,%ebp
+ xorl %esi,%ebx
+ addl %eax,%ebp
+ movl 40(%esp),%eax
+ andl %edx,%ebx
+ rorl $2,%edx
+ xorl %esi,%ebx
+ leal 1518500249(%ebp,%eax,1),%ebp
+ addl %ebx,%ebp
+ # 00_15 11
+ movl %edx,%eax
+ movl %ebp,%ebx
+ roll $5,%ebp
+ xorl %edi,%eax
+ addl %esi,%ebp
+ movl 44(%esp),%esi
+ andl %ecx,%eax
+ rorl $2,%ecx
+ xorl %edi,%eax
+ leal 1518500249(%ebp,%esi,1),%ebp
+ addl %eax,%ebp
+ # 00_15 12
+ movl %ecx,%esi
+ movl %ebp,%eax
+ roll $5,%ebp
+ xorl %edx,%esi
+ addl %edi,%ebp
+ movl 48(%esp),%edi
+ andl %ebx,%esi
+ rorl $2,%ebx
+ xorl %edx,%esi
+ leal 1518500249(%ebp,%edi,1),%ebp
+ addl %esi,%ebp
+ # 00_15 13
+ movl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ecx,%edi
+ addl %edx,%ebp
+ movl 52(%esp),%edx
+ andl %eax,%edi
+ rorl $2,%eax
+ xorl %ecx,%edi
+ leal 1518500249(%ebp,%edx,1),%ebp
+ addl %edi,%ebp
+ # 00_15 14
+ movl %eax,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%edx
+ addl %ecx,%ebp
+ movl 56(%esp),%ecx
+ andl %esi,%edx
+ rorl $2,%esi
+ xorl %ebx,%edx
+ leal 1518500249(%ebp,%ecx,1),%ebp
+ addl %edx,%ebp
+ # 00_15 15
+ movl %esi,%ecx
+ movl %ebp,%edx
+ roll $5,%ebp
+ xorl %eax,%ecx
+ addl %ebx,%ebp
+ movl 60(%esp),%ebx
+ andl %edi,%ecx
+ rorl $2,%edi
+ xorl %eax,%ecx
+ leal 1518500249(%ebp,%ebx,1),%ebp
+ movl (%esp),%ebx
+ addl %ebp,%ecx
+ # 16_19 16
+ movl %edi,%ebp
+ xorl 8(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 32(%esp),%ebx
+ andl %edx,%ebp
+ xorl 52(%esp),%ebx
+ roll $1,%ebx
+ xorl %esi,%ebp
+ addl %ebp,%eax
+ movl %ecx,%ebp
+ rorl $2,%edx
+ movl %ebx,(%esp)
+ roll $5,%ebp
+ leal 1518500249(%ebx,%eax,1),%ebx
+ movl 4(%esp),%eax
+ addl %ebp,%ebx
+ # 16_19 17
+ movl %edx,%ebp
+ xorl 12(%esp),%eax
+ xorl %edi,%ebp
+ xorl 36(%esp),%eax
+ andl %ecx,%ebp
+ xorl 56(%esp),%eax
+ roll $1,%eax
+ xorl %edi,%ebp
+ addl %ebp,%esi
+ movl %ebx,%ebp
+ rorl $2,%ecx
+ movl %eax,4(%esp)
+ roll $5,%ebp
+ leal 1518500249(%eax,%esi,1),%eax
+ movl 8(%esp),%esi
+ addl %ebp,%eax
+ # 16_19 18
+ movl %ecx,%ebp
+ xorl 16(%esp),%esi
+ xorl %edx,%ebp
+ xorl 40(%esp),%esi
+ andl %ebx,%ebp
+ xorl 60(%esp),%esi
+ roll $1,%esi
+ xorl %edx,%ebp
+ addl %ebp,%edi
+ movl %eax,%ebp
+ rorl $2,%ebx
+ movl %esi,8(%esp)
+ roll $5,%ebp
+ leal 1518500249(%esi,%edi,1),%esi
+ movl 12(%esp),%edi
+ addl %ebp,%esi
+ # 16_19 19
+ movl %ebx,%ebp
+ xorl 20(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 44(%esp),%edi
+ andl %eax,%ebp
+ xorl (%esp),%edi
+ roll $1,%edi
+ xorl %ecx,%ebp
+ addl %ebp,%edx
+ movl %esi,%ebp
+ rorl $2,%eax
+ movl %edi,12(%esp)
+ roll $5,%ebp
+ leal 1518500249(%edi,%edx,1),%edi
+ movl 16(%esp),%edx
+ addl %ebp,%edi
+ # 20_39 20
+ movl %esi,%ebp
+ xorl 24(%esp),%edx
+ xorl %eax,%ebp
+ xorl 48(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 4(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,16(%esp)
+ leal 1859775393(%edx,%ecx,1),%edx
+ movl 20(%esp),%ecx
+ addl %ebp,%edx
+ # 20_39 21
+ movl %edi,%ebp
+ xorl 28(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 52(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 8(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,20(%esp)
+ leal 1859775393(%ecx,%ebx,1),%ecx
+ movl 24(%esp),%ebx
+ addl %ebp,%ecx
+ # 20_39 22
+ movl %edx,%ebp
+ xorl 32(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 56(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 12(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,24(%esp)
+ leal 1859775393(%ebx,%eax,1),%ebx
+ movl 28(%esp),%eax
+ addl %ebp,%ebx
+ # 20_39 23
+ movl %ecx,%ebp
+ xorl 36(%esp),%eax
+ xorl %edx,%ebp
+ xorl 60(%esp),%eax
+ xorl %edi,%ebp
+ xorl 16(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,28(%esp)
+ leal 1859775393(%eax,%esi,1),%eax
+ movl 32(%esp),%esi
+ addl %ebp,%eax
+ # 20_39 24
+ movl %ebx,%ebp
+ xorl 40(%esp),%esi
+ xorl %ecx,%ebp
+ xorl (%esp),%esi
+ xorl %edx,%ebp
+ xorl 20(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,32(%esp)
+ leal 1859775393(%esi,%edi,1),%esi
+ movl 36(%esp),%edi
+ addl %ebp,%esi
+ # 20_39 25
+ movl %eax,%ebp
+ xorl 44(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 4(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 24(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,36(%esp)
+ leal 1859775393(%edi,%edx,1),%edi
+ movl 40(%esp),%edx
+ addl %ebp,%edi
+ # 20_39 26
+ movl %esi,%ebp
+ xorl 48(%esp),%edx
+ xorl %eax,%ebp
+ xorl 8(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 28(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,40(%esp)
+ leal 1859775393(%edx,%ecx,1),%edx
+ movl 44(%esp),%ecx
+ addl %ebp,%edx
+ # 20_39 27
+ movl %edi,%ebp
+ xorl 52(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 12(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 32(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,44(%esp)
+ leal 1859775393(%ecx,%ebx,1),%ecx
+ movl 48(%esp),%ebx
+ addl %ebp,%ecx
+ # 20_39 28
+ movl %edx,%ebp
+ xorl 56(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 16(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 36(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,48(%esp)
+ leal 1859775393(%ebx,%eax,1),%ebx
+ movl 52(%esp),%eax
+ addl %ebp,%ebx
+ # 20_39 29
+ movl %ecx,%ebp
+ xorl 60(%esp),%eax
+ xorl %edx,%ebp
+ xorl 20(%esp),%eax
+ xorl %edi,%ebp
+ xorl 40(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,52(%esp)
+ leal 1859775393(%eax,%esi,1),%eax
+ movl 56(%esp),%esi
+ addl %ebp,%eax
+ # 20_39 30
+ movl %ebx,%ebp
+ xorl (%esp),%esi
+ xorl %ecx,%ebp
+ xorl 24(%esp),%esi
+ xorl %edx,%ebp
+ xorl 44(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,56(%esp)
+ leal 1859775393(%esi,%edi,1),%esi
+ movl 60(%esp),%edi
+ addl %ebp,%esi
+ # 20_39 31
+ movl %eax,%ebp
+ xorl 4(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 28(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 48(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,60(%esp)
+ leal 1859775393(%edi,%edx,1),%edi
+ movl (%esp),%edx
+ addl %ebp,%edi
+ # 20_39 32
+ movl %esi,%ebp
+ xorl 8(%esp),%edx
+ xorl %eax,%ebp
+ xorl 32(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 52(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,(%esp)
+ leal 1859775393(%edx,%ecx,1),%edx
+ movl 4(%esp),%ecx
+ addl %ebp,%edx
+ # 20_39 33
+ movl %edi,%ebp
+ xorl 12(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 36(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 56(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,4(%esp)
+ leal 1859775393(%ecx,%ebx,1),%ecx
+ movl 8(%esp),%ebx
+ addl %ebp,%ecx
+ # 20_39 34
+ movl %edx,%ebp
+ xorl 16(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 40(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 60(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,8(%esp)
+ leal 1859775393(%ebx,%eax,1),%ebx
+ movl 12(%esp),%eax
+ addl %ebp,%ebx
+ # 20_39 35
+ movl %ecx,%ebp
+ xorl 20(%esp),%eax
+ xorl %edx,%ebp
+ xorl 44(%esp),%eax
+ xorl %edi,%ebp
+ xorl (%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,12(%esp)
+ leal 1859775393(%eax,%esi,1),%eax
+ movl 16(%esp),%esi
+ addl %ebp,%eax
+ # 20_39 36
+ movl %ebx,%ebp
+ xorl 24(%esp),%esi
+ xorl %ecx,%ebp
+ xorl 48(%esp),%esi
+ xorl %edx,%ebp
+ xorl 4(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,16(%esp)
+ leal 1859775393(%esi,%edi,1),%esi
+ movl 20(%esp),%edi
+ addl %ebp,%esi
+ # 20_39 37
+ movl %eax,%ebp
+ xorl 28(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 52(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 8(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,20(%esp)
+ leal 1859775393(%edi,%edx,1),%edi
+ movl 24(%esp),%edx
+ addl %ebp,%edi
+ # 20_39 38
+ movl %esi,%ebp
+ xorl 32(%esp),%edx
+ xorl %eax,%ebp
+ xorl 56(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 12(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,24(%esp)
+ leal 1859775393(%edx,%ecx,1),%edx
+ movl 28(%esp),%ecx
+ addl %ebp,%edx
+ # 20_39 39
+ movl %edi,%ebp
+ xorl 36(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 60(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 16(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,28(%esp)
+ leal 1859775393(%ecx,%ebx,1),%ecx
+ movl 32(%esp),%ebx
+ addl %ebp,%ecx
+ # 40_59 40
+ movl %edi,%ebp
+ xorl 40(%esp),%ebx
+ xorl %esi,%ebp
+ xorl (%esp),%ebx
+ andl %edx,%ebp
+ xorl 20(%esp),%ebx
+ roll $1,%ebx
+ addl %eax,%ebp
+ rorl $2,%edx
+ movl %ecx,%eax
+ roll $5,%eax
+ movl %ebx,32(%esp)
+ leal 2400959708(%ebx,%ebp,1),%ebx
+ movl %edi,%ebp
+ addl %eax,%ebx
+ andl %esi,%ebp
+ movl 36(%esp),%eax
+ addl %ebp,%ebx
+ # 40_59 41
+ movl %edx,%ebp
+ xorl 44(%esp),%eax
+ xorl %edi,%ebp
+ xorl 4(%esp),%eax
+ andl %ecx,%ebp
+ xorl 24(%esp),%eax
+ roll $1,%eax
+ addl %esi,%ebp
+ rorl $2,%ecx
+ movl %ebx,%esi
+ roll $5,%esi
+ movl %eax,36(%esp)
+ leal 2400959708(%eax,%ebp,1),%eax
+ movl %edx,%ebp
+ addl %esi,%eax
+ andl %edi,%ebp
+ movl 40(%esp),%esi
+ addl %ebp,%eax
+ # 40_59 42
+ movl %ecx,%ebp
+ xorl 48(%esp),%esi
+ xorl %edx,%ebp
+ xorl 8(%esp),%esi
+ andl %ebx,%ebp
+ xorl 28(%esp),%esi
+ roll $1,%esi
+ addl %edi,%ebp
+ rorl $2,%ebx
+ movl %eax,%edi
+ roll $5,%edi
+ movl %esi,40(%esp)
+ leal 2400959708(%esi,%ebp,1),%esi
+ movl %ecx,%ebp
+ addl %edi,%esi
+ andl %edx,%ebp
+ movl 44(%esp),%edi
+ addl %ebp,%esi
+ # 40_59 43
+ movl %ebx,%ebp
+ xorl 52(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 12(%esp),%edi
+ andl %eax,%ebp
+ xorl 32(%esp),%edi
+ roll $1,%edi
+ addl %edx,%ebp
+ rorl $2,%eax
+ movl %esi,%edx
+ roll $5,%edx
+ movl %edi,44(%esp)
+ leal 2400959708(%edi,%ebp,1),%edi
+ movl %ebx,%ebp
+ addl %edx,%edi
+ andl %ecx,%ebp
+ movl 48(%esp),%edx
+ addl %ebp,%edi
+ # 40_59 44
+ movl %eax,%ebp
+ xorl 56(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 16(%esp),%edx
+ andl %esi,%ebp
+ xorl 36(%esp),%edx
+ roll $1,%edx
+ addl %ecx,%ebp
+ rorl $2,%esi
+ movl %edi,%ecx
+ roll $5,%ecx
+ movl %edx,48(%esp)
+ leal 2400959708(%edx,%ebp,1),%edx
+ movl %eax,%ebp
+ addl %ecx,%edx
+ andl %ebx,%ebp
+ movl 52(%esp),%ecx
+ addl %ebp,%edx
+ # 40_59 45
+ movl %esi,%ebp
+ xorl 60(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 20(%esp),%ecx
+ andl %edi,%ebp
+ xorl 40(%esp),%ecx
+ roll $1,%ecx
+ addl %ebx,%ebp
+ rorl $2,%edi
+ movl %edx,%ebx
+ roll $5,%ebx
+ movl %ecx,52(%esp)
+ leal 2400959708(%ecx,%ebp,1),%ecx
+ movl %esi,%ebp
+ addl %ebx,%ecx
+ andl %eax,%ebp
+ movl 56(%esp),%ebx
+ addl %ebp,%ecx
+ # 40_59 46
+ movl %edi,%ebp
+ xorl (%esp),%ebx
+ xorl %esi,%ebp
+ xorl 24(%esp),%ebx
+ andl %edx,%ebp
+ xorl 44(%esp),%ebx
+ roll $1,%ebx
+ addl %eax,%ebp
+ rorl $2,%edx
+ movl %ecx,%eax
+ roll $5,%eax
+ movl %ebx,56(%esp)
+ leal 2400959708(%ebx,%ebp,1),%ebx
+ movl %edi,%ebp
+ addl %eax,%ebx
+ andl %esi,%ebp
+ movl 60(%esp),%eax
+ addl %ebp,%ebx
+ # 40_59 47
+ movl %edx,%ebp
+ xorl 4(%esp),%eax
+ xorl %edi,%ebp
+ xorl 28(%esp),%eax
+ andl %ecx,%ebp
+ xorl 48(%esp),%eax
+ roll $1,%eax
+ addl %esi,%ebp
+ rorl $2,%ecx
+ movl %ebx,%esi
+ roll $5,%esi
+ movl %eax,60(%esp)
+ leal 2400959708(%eax,%ebp,1),%eax
+ movl %edx,%ebp
+ addl %esi,%eax
+ andl %edi,%ebp
+ movl (%esp),%esi
+ addl %ebp,%eax
+ # 40_59 48
+ movl %ecx,%ebp
+ xorl 8(%esp),%esi
+ xorl %edx,%ebp
+ xorl 32(%esp),%esi
+ andl %ebx,%ebp
+ xorl 52(%esp),%esi
+ roll $1,%esi
+ addl %edi,%ebp
+ rorl $2,%ebx
+ movl %eax,%edi
+ roll $5,%edi
+ movl %esi,(%esp)
+ leal 2400959708(%esi,%ebp,1),%esi
+ movl %ecx,%ebp
+ addl %edi,%esi
+ andl %edx,%ebp
+ movl 4(%esp),%edi
+ addl %ebp,%esi
+ # 40_59 49
+ movl %ebx,%ebp
+ xorl 12(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 36(%esp),%edi
+ andl %eax,%ebp
+ xorl 56(%esp),%edi
+ roll $1,%edi
+ addl %edx,%ebp
+ rorl $2,%eax
+ movl %esi,%edx
+ roll $5,%edx
+ movl %edi,4(%esp)
+ leal 2400959708(%edi,%ebp,1),%edi
+ movl %ebx,%ebp
+ addl %edx,%edi
+ andl %ecx,%ebp
+ movl 8(%esp),%edx
+ addl %ebp,%edi
+ # 40_59 50
+ movl %eax,%ebp
+ xorl 16(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 40(%esp),%edx
+ andl %esi,%ebp
+ xorl 60(%esp),%edx
+ roll $1,%edx
+ addl %ecx,%ebp
+ rorl $2,%esi
+ movl %edi,%ecx
+ roll $5,%ecx
+ movl %edx,8(%esp)
+ leal 2400959708(%edx,%ebp,1),%edx
+ movl %eax,%ebp
+ addl %ecx,%edx
+ andl %ebx,%ebp
+ movl 12(%esp),%ecx
+ addl %ebp,%edx
+ # 40_59 51
+ movl %esi,%ebp
+ xorl 20(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 44(%esp),%ecx
+ andl %edi,%ebp
+ xorl (%esp),%ecx
+ roll $1,%ecx
+ addl %ebx,%ebp
+ rorl $2,%edi
+ movl %edx,%ebx
+ roll $5,%ebx
+ movl %ecx,12(%esp)
+ leal 2400959708(%ecx,%ebp,1),%ecx
+ movl %esi,%ebp
+ addl %ebx,%ecx
+ andl %eax,%ebp
+ movl 16(%esp),%ebx
+ addl %ebp,%ecx
+ # 40_59 52
+ movl %edi,%ebp
+ xorl 24(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 48(%esp),%ebx
+ andl %edx,%ebp
+ xorl 4(%esp),%ebx
+ roll $1,%ebx
+ addl %eax,%ebp
+ rorl $2,%edx
+ movl %ecx,%eax
+ roll $5,%eax
+ movl %ebx,16(%esp)
+ leal 2400959708(%ebx,%ebp,1),%ebx
+ movl %edi,%ebp
+ addl %eax,%ebx
+ andl %esi,%ebp
+ movl 20(%esp),%eax
+ addl %ebp,%ebx
+ # 40_59 53
+ movl %edx,%ebp
+ xorl 28(%esp),%eax
+ xorl %edi,%ebp
+ xorl 52(%esp),%eax
+ andl %ecx,%ebp
+ xorl 8(%esp),%eax
+ roll $1,%eax
+ addl %esi,%ebp
+ rorl $2,%ecx
+ movl %ebx,%esi
+ roll $5,%esi
+ movl %eax,20(%esp)
+ leal 2400959708(%eax,%ebp,1),%eax
+ movl %edx,%ebp
+ addl %esi,%eax
+ andl %edi,%ebp
+ movl 24(%esp),%esi
+ addl %ebp,%eax
+ # 40_59 54
+ movl %ecx,%ebp
+ xorl 32(%esp),%esi
+ xorl %edx,%ebp
+ xorl 56(%esp),%esi
+ andl %ebx,%ebp
+ xorl 12(%esp),%esi
+ roll $1,%esi
+ addl %edi,%ebp
+ rorl $2,%ebx
+ movl %eax,%edi
+ roll $5,%edi
+ movl %esi,24(%esp)
+ leal 2400959708(%esi,%ebp,1),%esi
+ movl %ecx,%ebp
+ addl %edi,%esi
+ andl %edx,%ebp
+ movl 28(%esp),%edi
+ addl %ebp,%esi
+ # 40_59 55
+ movl %ebx,%ebp
+ xorl 36(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 60(%esp),%edi
+ andl %eax,%ebp
+ xorl 16(%esp),%edi
+ roll $1,%edi
+ addl %edx,%ebp
+ rorl $2,%eax
+ movl %esi,%edx
+ roll $5,%edx
+ movl %edi,28(%esp)
+ leal 2400959708(%edi,%ebp,1),%edi
+ movl %ebx,%ebp
+ addl %edx,%edi
+ andl %ecx,%ebp
+ movl 32(%esp),%edx
+ addl %ebp,%edi
+ # 40_59 56
+ movl %eax,%ebp
+ xorl 40(%esp),%edx
+ xorl %ebx,%ebp
+ xorl (%esp),%edx
+ andl %esi,%ebp
+ xorl 20(%esp),%edx
+ roll $1,%edx
+ addl %ecx,%ebp
+ rorl $2,%esi
+ movl %edi,%ecx
+ roll $5,%ecx
+ movl %edx,32(%esp)
+ leal 2400959708(%edx,%ebp,1),%edx
+ movl %eax,%ebp
+ addl %ecx,%edx
+ andl %ebx,%ebp
+ movl 36(%esp),%ecx
+ addl %ebp,%edx
+ # 40_59 57
+ movl %esi,%ebp
+ xorl 44(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 4(%esp),%ecx
+ andl %edi,%ebp
+ xorl 24(%esp),%ecx
+ roll $1,%ecx
+ addl %ebx,%ebp
+ rorl $2,%edi
+ movl %edx,%ebx
+ roll $5,%ebx
+ movl %ecx,36(%esp)
+ leal 2400959708(%ecx,%ebp,1),%ecx
+ movl %esi,%ebp
+ addl %ebx,%ecx
+ andl %eax,%ebp
+ movl 40(%esp),%ebx
+ addl %ebp,%ecx
+ # 40_59 58
+ movl %edi,%ebp
+ xorl 48(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 8(%esp),%ebx
+ andl %edx,%ebp
+ xorl 28(%esp),%ebx
+ roll $1,%ebx
+ addl %eax,%ebp
+ rorl $2,%edx
+ movl %ecx,%eax
+ roll $5,%eax
+ movl %ebx,40(%esp)
+ leal 2400959708(%ebx,%ebp,1),%ebx
+ movl %edi,%ebp
+ addl %eax,%ebx
+ andl %esi,%ebp
+ movl 44(%esp),%eax
+ addl %ebp,%ebx
+ # 40_59 59
+ movl %edx,%ebp
+ xorl 52(%esp),%eax
+ xorl %edi,%ebp
+ xorl 12(%esp),%eax
+ andl %ecx,%ebp
+ xorl 32(%esp),%eax
+ roll $1,%eax
+ addl %esi,%ebp
+ rorl $2,%ecx
+ movl %ebx,%esi
+ roll $5,%esi
+ movl %eax,44(%esp)
+ leal 2400959708(%eax,%ebp,1),%eax
+ movl %edx,%ebp
+ addl %esi,%eax
+ andl %edi,%ebp
+ movl 48(%esp),%esi
+ addl %ebp,%eax
+ # 20_39 60
+ movl %ebx,%ebp
+ xorl 56(%esp),%esi
+ xorl %ecx,%ebp
+ xorl 16(%esp),%esi
+ xorl %edx,%ebp
+ xorl 36(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,48(%esp)
+ leal 3395469782(%esi,%edi,1),%esi
+ movl 52(%esp),%edi
+ addl %ebp,%esi
+ # 20_39 61
+ movl %eax,%ebp
+ xorl 60(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 20(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 40(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,52(%esp)
+ leal 3395469782(%edi,%edx,1),%edi
+ movl 56(%esp),%edx
+ addl %ebp,%edi
+ # 20_39 62
+ movl %esi,%ebp
+ xorl (%esp),%edx
+ xorl %eax,%ebp
+ xorl 24(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 44(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,56(%esp)
+ leal 3395469782(%edx,%ecx,1),%edx
+ movl 60(%esp),%ecx
+ addl %ebp,%edx
+ # 20_39 63
+ movl %edi,%ebp
+ xorl 4(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 28(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 48(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,60(%esp)
+ leal 3395469782(%ecx,%ebx,1),%ecx
+ movl (%esp),%ebx
+ addl %ebp,%ecx
+ # 20_39 64
+ movl %edx,%ebp
+ xorl 8(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 32(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 52(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,(%esp)
+ leal 3395469782(%ebx,%eax,1),%ebx
+ movl 4(%esp),%eax
+ addl %ebp,%ebx
+ # 20_39 65
+ movl %ecx,%ebp
+ xorl 12(%esp),%eax
+ xorl %edx,%ebp
+ xorl 36(%esp),%eax
+ xorl %edi,%ebp
+ xorl 56(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,4(%esp)
+ leal 3395469782(%eax,%esi,1),%eax
+ movl 8(%esp),%esi
+ addl %ebp,%eax
+ # 20_39 66
+ movl %ebx,%ebp
+ xorl 16(%esp),%esi
+ xorl %ecx,%ebp
+ xorl 40(%esp),%esi
+ xorl %edx,%ebp
+ xorl 60(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,8(%esp)
+ leal 3395469782(%esi,%edi,1),%esi
+ movl 12(%esp),%edi
+ addl %ebp,%esi
+ # 20_39 67
+ movl %eax,%ebp
+ xorl 20(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 44(%esp),%edi
+ xorl %ecx,%ebp
+ xorl (%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,12(%esp)
+ leal 3395469782(%edi,%edx,1),%edi
+ movl 16(%esp),%edx
+ addl %ebp,%edi
+ # 20_39 68
+ movl %esi,%ebp
+ xorl 24(%esp),%edx
+ xorl %eax,%ebp
+ xorl 48(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 4(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,16(%esp)
+ leal 3395469782(%edx,%ecx,1),%edx
+ movl 20(%esp),%ecx
+ addl %ebp,%edx
+ # 20_39 69
+ movl %edi,%ebp
+ xorl 28(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 52(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 8(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,20(%esp)
+ leal 3395469782(%ecx,%ebx,1),%ecx
+ movl 24(%esp),%ebx
+ addl %ebp,%ecx
+ # 20_39 70
+ movl %edx,%ebp
+ xorl 32(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 56(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 12(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,24(%esp)
+ leal 3395469782(%ebx,%eax,1),%ebx
+ movl 28(%esp),%eax
+ addl %ebp,%ebx
+ # 20_39 71
+ movl %ecx,%ebp
+ xorl 36(%esp),%eax
+ xorl %edx,%ebp
+ xorl 60(%esp),%eax
+ xorl %edi,%ebp
+ xorl 16(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,28(%esp)
+ leal 3395469782(%eax,%esi,1),%eax
+ movl 32(%esp),%esi
+ addl %ebp,%eax
+ # 20_39 72
+ movl %ebx,%ebp
+ xorl 40(%esp),%esi
+ xorl %ecx,%ebp
+ xorl (%esp),%esi
+ xorl %edx,%ebp
+ xorl 20(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,32(%esp)
+ leal 3395469782(%esi,%edi,1),%esi
+ movl 36(%esp),%edi
+ addl %ebp,%esi
+ # 20_39 73
+ movl %eax,%ebp
+ xorl 44(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 4(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 24(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,36(%esp)
+ leal 3395469782(%edi,%edx,1),%edi
+ movl 40(%esp),%edx
+ addl %ebp,%edi
+ # 20_39 74
+ movl %esi,%ebp
+ xorl 48(%esp),%edx
+ xorl %eax,%ebp
+ xorl 8(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 28(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,40(%esp)
+ leal 3395469782(%edx,%ecx,1),%edx
+ movl 44(%esp),%ecx
+ addl %ebp,%edx
+ # 20_39 75
+ movl %edi,%ebp
+ xorl 52(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 12(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 32(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,44(%esp)
+ leal 3395469782(%ecx,%ebx,1),%ecx
+ movl 48(%esp),%ebx
+ addl %ebp,%ecx
+ # 20_39 76
+ movl %edx,%ebp
+ xorl 56(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 16(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 36(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,48(%esp)
+ leal 3395469782(%ebx,%eax,1),%ebx
+ movl 52(%esp),%eax
+ addl %ebp,%ebx
+ # 20_39 77
+ movl %ecx,%ebp
+ xorl 60(%esp),%eax
+ xorl %edx,%ebp
+ xorl 20(%esp),%eax
+ xorl %edi,%ebp
+ xorl 40(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ leal 3395469782(%eax,%esi,1),%eax
+ movl 56(%esp),%esi
+ addl %ebp,%eax
+ # 20_39 78
+ movl %ebx,%ebp
+ xorl (%esp),%esi
+ xorl %ecx,%ebp
+ xorl 24(%esp),%esi
+ xorl %edx,%ebp
+ xorl 44(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ leal 3395469782(%esi,%edi,1),%esi
+ movl 60(%esp),%edi
+ addl %ebp,%esi
+ # 20_39 79
+ movl %eax,%ebp
+ xorl 4(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 28(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 48(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ leal 3395469782(%edi,%edx,1),%edi
+ addl %ebp,%edi
+ movl 96(%esp),%ebp
+ movl 100(%esp),%edx
+ addl (%ebp),%edi
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%eax
+ addl 12(%ebp),%ebx
+ addl 16(%ebp),%ecx
+ movl %edi,(%ebp)
+ addl $64,%edx
+ movl %esi,4(%ebp)
+ cmpl 104(%esp),%edx
+ movl %eax,8(%ebp)
+ movl %ecx,%edi
+ movl %ebx,12(%ebp)
+ movl %edx,%esi
+ movl %ecx,16(%ebp)
+ jb L000loop
+ addl $76,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _sha1_block_data_order_ssse3
+.private_extern _sha1_block_data_order_ssse3
+.align 4
+_sha1_block_data_order_ssse3:
+L_sha1_block_data_order_ssse3_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call L001pic_point
+L001pic_point:
+ popl %ebp
+ leal LK_XX_XX-L001pic_point(%ebp),%ebp
+ movdqa (%ebp),%xmm7
+ movdqa 16(%ebp),%xmm0
+ movdqa 32(%ebp),%xmm1
+ movdqa 48(%ebp),%xmm2
+ movdqa 64(%ebp),%xmm6
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebp
+ movl 28(%esp),%edx
+ movl %esp,%esi
+ subl $208,%esp
+ andl $-64,%esp
+ movdqa %xmm0,112(%esp)
+ movdqa %xmm1,128(%esp)
+ movdqa %xmm2,144(%esp)
+ shll $6,%edx
+ movdqa %xmm7,160(%esp)
+ addl %ebp,%edx
+ movdqa %xmm6,176(%esp)
+ addl $64,%ebp
+ movl %edi,192(%esp)
+ movl %ebp,196(%esp)
+ movl %edx,200(%esp)
+ movl %esi,204(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl 16(%edi),%edi
+ movl %ebx,%esi
+ movdqu -64(%ebp),%xmm0
+ movdqu -48(%ebp),%xmm1
+ movdqu -32(%ebp),%xmm2
+ movdqu -16(%ebp),%xmm3
+.byte 102,15,56,0,198
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+ movdqa %xmm7,96(%esp)
+.byte 102,15,56,0,222
+ paddd %xmm7,%xmm0
+ paddd %xmm7,%xmm1
+ paddd %xmm7,%xmm2
+ movdqa %xmm0,(%esp)
+ psubd %xmm7,%xmm0
+ movdqa %xmm1,16(%esp)
+ psubd %xmm7,%xmm1
+ movdqa %xmm2,32(%esp)
+ movl %ecx,%ebp
+ psubd %xmm7,%xmm2
+ xorl %edx,%ebp
+ pshufd $238,%xmm0,%xmm4
+ andl %ebp,%esi
+ jmp L002loop
+.align 4,0x90
+L002loop:
+ rorl $2,%ebx
+ xorl %edx,%esi
+ movl %eax,%ebp
+ punpcklqdq %xmm1,%xmm4
+ movdqa %xmm3,%xmm6
+ addl (%esp),%edi
+ xorl %ecx,%ebx
+ paddd %xmm3,%xmm7
+ movdqa %xmm0,64(%esp)
+ roll $5,%eax
+ addl %esi,%edi
+ psrldq $4,%xmm6
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ pxor %xmm0,%xmm4
+ addl %eax,%edi
+ rorl $7,%eax
+ pxor %xmm2,%xmm6
+ xorl %ecx,%ebp
+ movl %edi,%esi
+ addl 4(%esp),%edx
+ pxor %xmm6,%xmm4
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm7,48(%esp)
+ addl %ebp,%edx
+ andl %eax,%esi
+ movdqa %xmm4,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ movdqa %xmm4,%xmm6
+ xorl %ebx,%esi
+ pslldq $12,%xmm0
+ paddd %xmm4,%xmm4
+ movl %edx,%ebp
+ addl 8(%esp),%ecx
+ psrld $31,%xmm6
+ xorl %eax,%edi
+ roll $5,%edx
+ movdqa %xmm0,%xmm7
+ addl %esi,%ecx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ psrld $30,%xmm0
+ addl %edx,%ecx
+ rorl $7,%edx
+ por %xmm6,%xmm4
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ addl 12(%esp),%ebx
+ pslld $2,%xmm7
+ xorl %edi,%edx
+ roll $5,%ecx
+ pxor %xmm0,%xmm4
+ movdqa 96(%esp),%xmm0
+ addl %ebp,%ebx
+ andl %edx,%esi
+ pxor %xmm7,%xmm4
+ pshufd $238,%xmm1,%xmm5
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ movl %ebx,%ebp
+ punpcklqdq %xmm2,%xmm5
+ movdqa %xmm4,%xmm7
+ addl 16(%esp),%eax
+ xorl %edx,%ecx
+ paddd %xmm4,%xmm0
+ movdqa %xmm1,80(%esp)
+ roll $5,%ebx
+ addl %esi,%eax
+ psrldq $4,%xmm7
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ pxor %xmm1,%xmm5
+ addl %ebx,%eax
+ rorl $7,%ebx
+ pxor %xmm3,%xmm7
+ xorl %edx,%ebp
+ movl %eax,%esi
+ addl 20(%esp),%edi
+ pxor %xmm7,%xmm5
+ xorl %ecx,%ebx
+ roll $5,%eax
+ movdqa %xmm0,(%esp)
+ addl %ebp,%edi
+ andl %ebx,%esi
+ movdqa %xmm5,%xmm1
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ rorl $7,%eax
+ movdqa %xmm5,%xmm7
+ xorl %ecx,%esi
+ pslldq $12,%xmm1
+ paddd %xmm5,%xmm5
+ movl %edi,%ebp
+ addl 24(%esp),%edx
+ psrld $31,%xmm7
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm1,%xmm0
+ addl %esi,%edx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ psrld $30,%xmm1
+ addl %edi,%edx
+ rorl $7,%edi
+ por %xmm7,%xmm5
+ xorl %ebx,%ebp
+ movl %edx,%esi
+ addl 28(%esp),%ecx
+ pslld $2,%xmm0
+ xorl %eax,%edi
+ roll $5,%edx
+ pxor %xmm1,%xmm5
+ movdqa 112(%esp),%xmm1
+ addl %ebp,%ecx
+ andl %edi,%esi
+ pxor %xmm0,%xmm5
+ pshufd $238,%xmm2,%xmm6
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ punpcklqdq %xmm3,%xmm6
+ movdqa %xmm5,%xmm0
+ addl 32(%esp),%ebx
+ xorl %edi,%edx
+ paddd %xmm5,%xmm1
+ movdqa %xmm2,96(%esp)
+ roll $5,%ecx
+ addl %esi,%ebx
+ psrldq $4,%xmm0
+ andl %edx,%ebp
+ xorl %edi,%edx
+ pxor %xmm2,%xmm6
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ pxor %xmm4,%xmm0
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ addl 36(%esp),%eax
+ pxor %xmm0,%xmm6
+ xorl %edx,%ecx
+ roll $5,%ebx
+ movdqa %xmm1,16(%esp)
+ addl %ebp,%eax
+ andl %ecx,%esi
+ movdqa %xmm6,%xmm2
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ rorl $7,%ebx
+ movdqa %xmm6,%xmm0
+ xorl %edx,%esi
+ pslldq $12,%xmm2
+ paddd %xmm6,%xmm6
+ movl %eax,%ebp
+ addl 40(%esp),%edi
+ psrld $31,%xmm0
+ xorl %ecx,%ebx
+ roll $5,%eax
+ movdqa %xmm2,%xmm1
+ addl %esi,%edi
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ psrld $30,%xmm2
+ addl %eax,%edi
+ rorl $7,%eax
+ por %xmm0,%xmm6
+ xorl %ecx,%ebp
+ movdqa 64(%esp),%xmm0
+ movl %edi,%esi
+ addl 44(%esp),%edx
+ pslld $2,%xmm1
+ xorl %ebx,%eax
+ roll $5,%edi
+ pxor %xmm2,%xmm6
+ movdqa 112(%esp),%xmm2
+ addl %ebp,%edx
+ andl %eax,%esi
+ pxor %xmm1,%xmm6
+ pshufd $238,%xmm3,%xmm7
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ xorl %ebx,%esi
+ movl %edx,%ebp
+ punpcklqdq %xmm4,%xmm7
+ movdqa %xmm6,%xmm1
+ addl 48(%esp),%ecx
+ xorl %eax,%edi
+ paddd %xmm6,%xmm2
+ movdqa %xmm3,64(%esp)
+ roll $5,%edx
+ addl %esi,%ecx
+ psrldq $4,%xmm1
+ andl %edi,%ebp
+ xorl %eax,%edi
+ pxor %xmm3,%xmm7
+ addl %edx,%ecx
+ rorl $7,%edx
+ pxor %xmm5,%xmm1
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ addl 52(%esp),%ebx
+ pxor %xmm1,%xmm7
+ xorl %edi,%edx
+ roll $5,%ecx
+ movdqa %xmm2,32(%esp)
+ addl %ebp,%ebx
+ andl %edx,%esi
+ movdqa %xmm7,%xmm3
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ movdqa %xmm7,%xmm1
+ xorl %edi,%esi
+ pslldq $12,%xmm3
+ paddd %xmm7,%xmm7
+ movl %ebx,%ebp
+ addl 56(%esp),%eax
+ psrld $31,%xmm1
+ xorl %edx,%ecx
+ roll $5,%ebx
+ movdqa %xmm3,%xmm2
+ addl %esi,%eax
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ psrld $30,%xmm3
+ addl %ebx,%eax
+ rorl $7,%ebx
+ por %xmm1,%xmm7
+ xorl %edx,%ebp
+ movdqa 80(%esp),%xmm1
+ movl %eax,%esi
+ addl 60(%esp),%edi
+ pslld $2,%xmm2
+ xorl %ecx,%ebx
+ roll $5,%eax
+ pxor %xmm3,%xmm7
+ movdqa 112(%esp),%xmm3
+ addl %ebp,%edi
+ andl %ebx,%esi
+ pxor %xmm2,%xmm7
+ pshufd $238,%xmm6,%xmm2
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ rorl $7,%eax
+ pxor %xmm4,%xmm0
+ punpcklqdq %xmm7,%xmm2
+ xorl %ecx,%esi
+ movl %edi,%ebp
+ addl (%esp),%edx
+ pxor %xmm1,%xmm0
+ movdqa %xmm4,80(%esp)
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm3,%xmm4
+ addl %esi,%edx
+ paddd %xmm7,%xmm3
+ andl %eax,%ebp
+ pxor %xmm2,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ xorl %ebx,%ebp
+ movdqa %xmm0,%xmm2
+ movdqa %xmm3,48(%esp)
+ movl %edx,%esi
+ addl 4(%esp),%ecx
+ xorl %eax,%edi
+ roll $5,%edx
+ pslld $2,%xmm0
+ addl %ebp,%ecx
+ andl %edi,%esi
+ psrld $30,%xmm2
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ addl 8(%esp),%ebx
+ xorl %edi,%edx
+ roll $5,%ecx
+ por %xmm2,%xmm0
+ addl %esi,%ebx
+ andl %edx,%ebp
+ movdqa 96(%esp),%xmm2
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 12(%esp),%eax
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ pshufd $238,%xmm7,%xmm3
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 16(%esp),%edi
+ pxor %xmm5,%xmm1
+ punpcklqdq %xmm0,%xmm3
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,96(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ movdqa %xmm4,%xmm5
+ rorl $7,%ebx
+ paddd %xmm0,%xmm4
+ addl %eax,%edi
+ pxor %xmm3,%xmm1
+ addl 20(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ movdqa %xmm1,%xmm3
+ movdqa %xmm4,(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm1
+ addl 24(%esp),%ecx
+ xorl %eax,%esi
+ psrld $30,%xmm3
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ por %xmm3,%xmm1
+ addl 28(%esp),%ebx
+ xorl %edi,%ebp
+ movdqa 64(%esp),%xmm3
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ pshufd $238,%xmm0,%xmm4
+ addl %ecx,%ebx
+ addl 32(%esp),%eax
+ pxor %xmm6,%xmm2
+ punpcklqdq %xmm1,%xmm4
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ pxor %xmm3,%xmm2
+ movdqa %xmm6,64(%esp)
+ addl %esi,%eax
+ xorl %edx,%ebp
+ movdqa 128(%esp),%xmm6
+ rorl $7,%ecx
+ paddd %xmm1,%xmm5
+ addl %ebx,%eax
+ pxor %xmm4,%xmm2
+ addl 36(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ movdqa %xmm2,%xmm4
+ movdqa %xmm5,16(%esp)
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ pslld $2,%xmm2
+ addl 40(%esp),%edx
+ xorl %ebx,%esi
+ psrld $30,%xmm4
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ por %xmm4,%xmm2
+ addl 44(%esp),%ecx
+ xorl %eax,%ebp
+ movdqa 80(%esp),%xmm4
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ pshufd $238,%xmm1,%xmm5
+ addl %edx,%ecx
+ addl 48(%esp),%ebx
+ pxor %xmm7,%xmm3
+ punpcklqdq %xmm2,%xmm5
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ pxor %xmm4,%xmm3
+ movdqa %xmm7,80(%esp)
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ movdqa %xmm6,%xmm7
+ rorl $7,%edx
+ paddd %xmm2,%xmm6
+ addl %ecx,%ebx
+ pxor %xmm5,%xmm3
+ addl 52(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ movdqa %xmm3,%xmm5
+ movdqa %xmm6,32(%esp)
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ pslld $2,%xmm3
+ addl 56(%esp),%edi
+ xorl %ecx,%esi
+ psrld $30,%xmm5
+ movl %eax,%ebp
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ por %xmm5,%xmm3
+ addl 60(%esp),%edx
+ xorl %ebx,%ebp
+ movdqa 96(%esp),%xmm5
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ pshufd $238,%xmm2,%xmm6
+ addl %edi,%edx
+ addl (%esp),%ecx
+ pxor %xmm0,%xmm4
+ punpcklqdq %xmm3,%xmm6
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ pxor %xmm5,%xmm4
+ movdqa %xmm0,96(%esp)
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ movdqa %xmm7,%xmm0
+ rorl $7,%edi
+ paddd %xmm3,%xmm7
+ addl %edx,%ecx
+ pxor %xmm6,%xmm4
+ addl 4(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ movdqa %xmm4,%xmm6
+ movdqa %xmm7,48(%esp)
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ pslld $2,%xmm4
+ addl 8(%esp),%eax
+ xorl %edx,%esi
+ psrld $30,%xmm6
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ por %xmm6,%xmm4
+ addl 12(%esp),%edi
+ xorl %ecx,%ebp
+ movdqa 64(%esp),%xmm6
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ pshufd $238,%xmm3,%xmm7
+ addl %eax,%edi
+ addl 16(%esp),%edx
+ pxor %xmm1,%xmm5
+ punpcklqdq %xmm4,%xmm7
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ pxor %xmm6,%xmm5
+ movdqa %xmm1,64(%esp)
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ movdqa %xmm0,%xmm1
+ rorl $7,%eax
+ paddd %xmm4,%xmm0
+ addl %edi,%edx
+ pxor %xmm7,%xmm5
+ addl 20(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ movdqa %xmm5,%xmm7
+ movdqa %xmm0,(%esp)
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ pslld $2,%xmm5
+ addl 24(%esp),%ebx
+ xorl %edi,%esi
+ psrld $30,%xmm7
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ por %xmm7,%xmm5
+ addl 28(%esp),%eax
+ movdqa 80(%esp),%xmm7
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ pshufd $238,%xmm4,%xmm0
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 32(%esp),%edi
+ pxor %xmm2,%xmm6
+ punpcklqdq %xmm5,%xmm0
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ pxor %xmm7,%xmm6
+ movdqa %xmm2,80(%esp)
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ roll $5,%eax
+ movdqa %xmm1,%xmm2
+ addl %esi,%edi
+ paddd %xmm5,%xmm1
+ xorl %ebx,%ebp
+ pxor %xmm0,%xmm6
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 36(%esp),%edx
+ andl %ebx,%ebp
+ movdqa %xmm6,%xmm0
+ movdqa %xmm1,16(%esp)
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ roll $5,%edi
+ pslld $2,%xmm6
+ addl %ebp,%edx
+ xorl %eax,%esi
+ psrld $30,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 40(%esp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ rorl $7,%edi
+ por %xmm0,%xmm6
+ movl %edx,%ebp
+ xorl %eax,%esi
+ movdqa 96(%esp),%xmm0
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ pshufd $238,%xmm5,%xmm1
+ addl 44(%esp),%ebx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ rorl $7,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 48(%esp),%eax
+ pxor %xmm3,%xmm7
+ punpcklqdq %xmm6,%xmm1
+ andl %edx,%esi
+ xorl %edi,%edx
+ rorl $7,%ecx
+ pxor %xmm0,%xmm7
+ movdqa %xmm3,96(%esp)
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ roll $5,%ebx
+ movdqa 144(%esp),%xmm3
+ addl %esi,%eax
+ paddd %xmm6,%xmm2
+ xorl %ecx,%ebp
+ pxor %xmm1,%xmm7
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%esp),%edi
+ andl %ecx,%ebp
+ movdqa %xmm7,%xmm1
+ movdqa %xmm2,32(%esp)
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ roll $5,%eax
+ pslld $2,%xmm7
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ psrld $30,%xmm1
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 56(%esp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ por %xmm1,%xmm7
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ movdqa 64(%esp),%xmm1
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ pshufd $238,%xmm6,%xmm2
+ addl 60(%esp),%ecx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ rorl $7,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl (%esp),%ebx
+ pxor %xmm4,%xmm0
+ punpcklqdq %xmm7,%xmm2
+ andl %edi,%esi
+ xorl %eax,%edi
+ rorl $7,%edx
+ pxor %xmm1,%xmm0
+ movdqa %xmm4,64(%esp)
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ roll $5,%ecx
+ movdqa %xmm3,%xmm4
+ addl %esi,%ebx
+ paddd %xmm7,%xmm3
+ xorl %edx,%ebp
+ pxor %xmm2,%xmm0
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 4(%esp),%eax
+ andl %edx,%ebp
+ movdqa %xmm0,%xmm2
+ movdqa %xmm3,48(%esp)
+ xorl %edi,%edx
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ pslld $2,%xmm0
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ psrld $30,%xmm2
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%esp),%edi
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ por %xmm2,%xmm0
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ movdqa 80(%esp),%xmm2
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ pshufd $238,%xmm7,%xmm3
+ addl 12(%esp),%edx
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 16(%esp),%ecx
+ pxor %xmm5,%xmm1
+ punpcklqdq %xmm0,%xmm3
+ andl %eax,%esi
+ xorl %ebx,%eax
+ rorl $7,%edi
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,80(%esp)
+ movl %edx,%ebp
+ xorl %eax,%esi
+ roll $5,%edx
+ movdqa %xmm4,%xmm5
+ addl %esi,%ecx
+ paddd %xmm0,%xmm4
+ xorl %edi,%ebp
+ pxor %xmm3,%xmm1
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 20(%esp),%ebx
+ andl %edi,%ebp
+ movdqa %xmm1,%xmm3
+ movdqa %xmm4,(%esp)
+ xorl %eax,%edi
+ rorl $7,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ roll $5,%ecx
+ pslld $2,%xmm1
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ psrld $30,%xmm3
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 24(%esp),%eax
+ andl %edx,%esi
+ xorl %edi,%edx
+ rorl $7,%ecx
+ por %xmm3,%xmm1
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ movdqa 96(%esp),%xmm3
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ pshufd $238,%xmm0,%xmm4
+ addl 28(%esp),%edi
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 32(%esp),%edx
+ pxor %xmm6,%xmm2
+ punpcklqdq %xmm1,%xmm4
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ pxor %xmm3,%xmm2
+ movdqa %xmm6,96(%esp)
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ roll $5,%edi
+ movdqa %xmm5,%xmm6
+ addl %esi,%edx
+ paddd %xmm1,%xmm5
+ xorl %eax,%ebp
+ pxor %xmm4,%xmm2
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 36(%esp),%ecx
+ andl %eax,%ebp
+ movdqa %xmm2,%xmm4
+ movdqa %xmm5,16(%esp)
+ xorl %ebx,%eax
+ rorl $7,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ roll $5,%edx
+ pslld $2,%xmm2
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ psrld $30,%xmm4
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 40(%esp),%ebx
+ andl %edi,%esi
+ xorl %eax,%edi
+ rorl $7,%edx
+ por %xmm4,%xmm2
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ movdqa 64(%esp),%xmm4
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ pshufd $238,%xmm1,%xmm5
+ addl 44(%esp),%eax
+ andl %edx,%ebp
+ xorl %edi,%edx
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ addl 48(%esp),%edi
+ pxor %xmm7,%xmm3
+ punpcklqdq %xmm2,%xmm5
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ pxor %xmm4,%xmm3
+ movdqa %xmm7,64(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ movdqa %xmm6,%xmm7
+ rorl $7,%ebx
+ paddd %xmm2,%xmm6
+ addl %eax,%edi
+ pxor %xmm5,%xmm3
+ addl 52(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ movdqa %xmm3,%xmm5
+ movdqa %xmm6,32(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm3
+ addl 56(%esp),%ecx
+ xorl %eax,%esi
+ psrld $30,%xmm5
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ por %xmm5,%xmm3
+ addl 60(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl (%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ paddd %xmm3,%xmm7
+ addl %ebx,%eax
+ addl 4(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ movdqa %xmm7,48(%esp)
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 8(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 12(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ movl 196(%esp),%ebp
+ cmpl 200(%esp),%ebp
+ je L003done
+ movdqa 160(%esp),%xmm7
+ movdqa 176(%esp),%xmm6
+ movdqu (%ebp),%xmm0
+ movdqu 16(%ebp),%xmm1
+ movdqu 32(%ebp),%xmm2
+ movdqu 48(%ebp),%xmm3
+ addl $64,%ebp
+.byte 102,15,56,0,198
+ movl %ebp,196(%esp)
+ movdqa %xmm7,96(%esp)
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+.byte 102,15,56,0,206
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ paddd %xmm7,%xmm0
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ movdqa %xmm0,(%esp)
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ psubd %xmm7,%xmm0
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+.byte 102,15,56,0,214
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ paddd %xmm7,%xmm1
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ movdqa %xmm1,16(%esp)
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ psubd %xmm7,%xmm1
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+.byte 102,15,56,0,222
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ paddd %xmm7,%xmm2
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ movdqa %xmm2,32(%esp)
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ psubd %xmm7,%xmm2
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %ecx,%ebx
+ movl %edx,12(%ebp)
+ xorl %edx,%ebx
+ movl %edi,16(%ebp)
+ movl %esi,%ebp
+ pshufd $238,%xmm0,%xmm4
+ andl %ebx,%esi
+ movl %ebp,%ebx
+ jmp L002loop
+.align 4,0x90
+L003done:
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ movl 204(%esp),%esp
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _sha1_block_data_order_avx
+.private_extern _sha1_block_data_order_avx
+.align 4
+_sha1_block_data_order_avx:
+L_sha1_block_data_order_avx_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call L004pic_point
+L004pic_point:
+ popl %ebp
+ leal LK_XX_XX-L004pic_point(%ebp),%ebp
+ vzeroall
+ vmovdqa (%ebp),%xmm7
+ vmovdqa 16(%ebp),%xmm0
+ vmovdqa 32(%ebp),%xmm1
+ vmovdqa 48(%ebp),%xmm2
+ vmovdqa 64(%ebp),%xmm6
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebp
+ movl 28(%esp),%edx
+ movl %esp,%esi
+ subl $208,%esp
+ andl $-64,%esp
+ vmovdqa %xmm0,112(%esp)
+ vmovdqa %xmm1,128(%esp)
+ vmovdqa %xmm2,144(%esp)
+ shll $6,%edx
+ vmovdqa %xmm7,160(%esp)
+ addl %ebp,%edx
+ vmovdqa %xmm6,176(%esp)
+ addl $64,%ebp
+ movl %edi,192(%esp)
+ movl %ebp,196(%esp)
+ movl %edx,200(%esp)
+ movl %esi,204(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl 16(%edi),%edi
+ movl %ebx,%esi
+ vmovdqu -64(%ebp),%xmm0
+ vmovdqu -48(%ebp),%xmm1
+ vmovdqu -32(%ebp),%xmm2
+ vmovdqu -16(%ebp),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vmovdqa %xmm7,96(%esp)
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm7,%xmm0,%xmm4
+ vpaddd %xmm7,%xmm1,%xmm5
+ vpaddd %xmm7,%xmm2,%xmm6
+ vmovdqa %xmm4,(%esp)
+ movl %ecx,%ebp
+ vmovdqa %xmm5,16(%esp)
+ xorl %edx,%ebp
+ vmovdqa %xmm6,32(%esp)
+ andl %ebp,%esi
+ jmp L005loop
+.align 4,0x90
+L005loop:
+ shrdl $2,%ebx,%ebx
+ xorl %edx,%esi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ movl %eax,%ebp
+ addl (%esp),%edi
+ vpaddd %xmm3,%xmm7,%xmm7
+ vmovdqa %xmm0,64(%esp)
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrldq $4,%xmm3,%xmm6
+ addl %esi,%edi
+ andl %ebx,%ebp
+ vpxor %xmm0,%xmm4,%xmm4
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ shrdl $7,%eax,%eax
+ xorl %ecx,%ebp
+ vmovdqa %xmm7,48(%esp)
+ movl %edi,%esi
+ addl 4(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ andl %eax,%esi
+ vpsrld $31,%xmm4,%xmm6
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%esi
+ vpslldq $12,%xmm4,%xmm0
+ vpaddd %xmm4,%xmm4,%xmm4
+ movl %edx,%ebp
+ addl 8(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm0,%xmm7
+ vpor %xmm6,%xmm4,%xmm4
+ addl %esi,%ecx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpslld $2,%xmm0,%xmm0
+ shrdl $7,%edx,%edx
+ xorl %eax,%ebp
+ vpxor %xmm7,%xmm4,%xmm4
+ movl %ecx,%esi
+ addl 12(%esp),%ebx
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vpxor %xmm0,%xmm4,%xmm4
+ addl %ebp,%ebx
+ andl %edx,%esi
+ vmovdqa 96(%esp),%xmm0
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ movl %ebx,%ebp
+ addl 16(%esp),%eax
+ vpaddd %xmm4,%xmm0,%xmm0
+ vmovdqa %xmm1,80(%esp)
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrldq $4,%xmm4,%xmm7
+ addl %esi,%eax
+ andl %ecx,%ebp
+ vpxor %xmm1,%xmm5,%xmm5
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm3,%xmm7,%xmm7
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%ebp
+ vmovdqa %xmm0,(%esp)
+ movl %eax,%esi
+ addl 20(%esp),%edi
+ vpxor %xmm7,%xmm5,%xmm5
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ andl %ebx,%esi
+ vpsrld $31,%xmm5,%xmm7
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ vpslldq $12,%xmm5,%xmm1
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %edi,%ebp
+ addl 24(%esp),%edx
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm1,%xmm0
+ vpor %xmm7,%xmm5,%xmm5
+ addl %esi,%edx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm1,%xmm1
+ shrdl $7,%edi,%edi
+ xorl %ebx,%ebp
+ vpxor %xmm0,%xmm5,%xmm5
+ movl %edx,%esi
+ addl 28(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm1,%xmm5,%xmm5
+ addl %ebp,%ecx
+ andl %edi,%esi
+ vmovdqa 112(%esp),%xmm1
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ movl %ecx,%ebp
+ addl 32(%esp),%ebx
+ vpaddd %xmm5,%xmm1,%xmm1
+ vmovdqa %xmm2,96(%esp)
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vpsrldq $4,%xmm5,%xmm0
+ addl %esi,%ebx
+ andl %edx,%ebp
+ vpxor %xmm2,%xmm6,%xmm6
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%ebp
+ vmovdqa %xmm1,16(%esp)
+ movl %ebx,%esi
+ addl 36(%esp),%eax
+ vpxor %xmm0,%xmm6,%xmm6
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ andl %ecx,%esi
+ vpsrld $31,%xmm6,%xmm0
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%esi
+ vpslldq $12,%xmm6,%xmm2
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%ebp
+ addl 40(%esp),%edi
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm1
+ vpor %xmm0,%xmm6,%xmm6
+ addl %esi,%edi
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpslld $2,%xmm2,%xmm2
+ vmovdqa 64(%esp),%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%ebp
+ vpxor %xmm1,%xmm6,%xmm6
+ movl %edi,%esi
+ addl 44(%esp),%edx
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ addl %ebp,%edx
+ andl %eax,%esi
+ vmovdqa 112(%esp),%xmm2
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%esi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ movl %edx,%ebp
+ addl 48(%esp),%ecx
+ vpaddd %xmm6,%xmm2,%xmm2
+ vmovdqa %xmm3,64(%esp)
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpsrldq $4,%xmm6,%xmm1
+ addl %esi,%ecx
+ andl %edi,%ebp
+ vpxor %xmm3,%xmm7,%xmm7
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpxor %xmm5,%xmm1,%xmm1
+ shrdl $7,%edx,%edx
+ xorl %eax,%ebp
+ vmovdqa %xmm2,32(%esp)
+ movl %ecx,%esi
+ addl 52(%esp),%ebx
+ vpxor %xmm1,%xmm7,%xmm7
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ andl %edx,%esi
+ vpsrld $31,%xmm7,%xmm1
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ vpslldq $12,%xmm7,%xmm3
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%ebp
+ addl 56(%esp),%eax
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm2
+ vpor %xmm1,%xmm7,%xmm7
+ addl %esi,%eax
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ vmovdqa 80(%esp),%xmm1
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%ebp
+ vpxor %xmm2,%xmm7,%xmm7
+ movl %eax,%esi
+ addl 60(%esp),%edi
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpxor %xmm3,%xmm7,%xmm7
+ addl %ebp,%edi
+ andl %ebx,%esi
+ vmovdqa 112(%esp),%xmm3
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm6,%xmm7,%xmm2
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ movl %edi,%ebp
+ addl (%esp),%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ vmovdqa %xmm4,80(%esp)
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vmovdqa %xmm3,%xmm4
+ vpaddd %xmm7,%xmm3,%xmm3
+ addl %esi,%edx
+ andl %eax,%ebp
+ vpxor %xmm2,%xmm0,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%ebp
+ vpsrld $30,%xmm0,%xmm2
+ vmovdqa %xmm3,48(%esp)
+ movl %edx,%esi
+ addl 4(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpslld $2,%xmm0,%xmm0
+ addl %ebp,%ecx
+ andl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ addl 8(%esp),%ebx
+ vpor %xmm2,%xmm0,%xmm0
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vmovdqa 96(%esp),%xmm2
+ addl %esi,%ebx
+ andl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 12(%esp),%eax
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm3
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm5,96(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ vmovdqa %xmm4,%xmm5
+ vpaddd %xmm0,%xmm4,%xmm4
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpxor %xmm3,%xmm1,%xmm1
+ addl 20(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm1,%xmm3
+ vmovdqa %xmm4,(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpor %xmm3,%xmm1,%xmm1
+ addl 28(%esp),%ebx
+ xorl %edi,%ebp
+ vmovdqa 64(%esp),%xmm3
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ vmovdqa %xmm6,64(%esp)
+ addl %esi,%eax
+ xorl %edx,%ebp
+ vmovdqa 128(%esp),%xmm6
+ vpaddd %xmm1,%xmm5,%xmm5
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm4,%xmm2,%xmm2
+ addl 36(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm4
+ vmovdqa %xmm5,16(%esp)
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpor %xmm4,%xmm2,%xmm2
+ addl 44(%esp),%ecx
+ xorl %eax,%ebp
+ vmovdqa 80(%esp),%xmm4
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ vmovdqa %xmm7,80(%esp)
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ vmovdqa %xmm6,%xmm7
+ vpaddd %xmm2,%xmm6,%xmm6
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpxor %xmm5,%xmm3,%xmm3
+ addl 52(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm5
+ vmovdqa %xmm6,32(%esp)
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpor %xmm5,%xmm3,%xmm3
+ addl 60(%esp),%edx
+ xorl %ebx,%ebp
+ vmovdqa 96(%esp),%xmm5
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ vpxor %xmm0,%xmm4,%xmm4
+ addl (%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ vmovdqa %xmm0,96(%esp)
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ vmovdqa %xmm7,%xmm0
+ vpaddd %xmm3,%xmm7,%xmm7
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpxor %xmm6,%xmm4,%xmm4
+ addl 4(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm6
+ vmovdqa %xmm7,48(%esp)
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpor %xmm6,%xmm4,%xmm4
+ addl 12(%esp),%edi
+ xorl %ecx,%ebp
+ vmovdqa 64(%esp),%xmm6
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ vpxor %xmm6,%xmm5,%xmm5
+ vmovdqa %xmm1,64(%esp)
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ vmovdqa %xmm0,%xmm1
+ vpaddd %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpxor %xmm7,%xmm5,%xmm5
+ addl 20(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm7
+ vmovdqa %xmm0,(%esp)
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpor %xmm7,%xmm5,%xmm5
+ addl 28(%esp),%eax
+ vmovdqa 80(%esp),%xmm7
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ addl 32(%esp),%edi
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovdqa %xmm2,80(%esp)
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ vmovdqa %xmm1,%xmm2
+ vpaddd %xmm5,%xmm1,%xmm1
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ vpxor %xmm0,%xmm6,%xmm6
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 36(%esp),%edx
+ vpsrld $30,%xmm6,%xmm0
+ vmovdqa %xmm1,16(%esp)
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %edi,%esi
+ vpslld $2,%xmm6,%xmm6
+ xorl %ebx,%ebp
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 40(%esp),%ecx
+ andl %eax,%esi
+ vpor %xmm0,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ vmovdqa 96(%esp),%xmm0
+ movl %edx,%ebp
+ xorl %eax,%esi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 44(%esp),%ebx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm1
+ vpxor %xmm3,%xmm7,%xmm7
+ addl 48(%esp),%eax
+ andl %edx,%esi
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ vpxor %xmm0,%xmm7,%xmm7
+ vmovdqa %xmm3,96(%esp)
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ vmovdqa 144(%esp),%xmm3
+ vpaddd %xmm6,%xmm2,%xmm2
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vpxor %xmm1,%xmm7,%xmm7
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%esp),%edi
+ vpsrld $30,%xmm7,%xmm1
+ vmovdqa %xmm2,32(%esp)
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ vpslld $2,%xmm7,%xmm7
+ xorl %ecx,%ebp
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 56(%esp),%edx
+ andl %ebx,%esi
+ vpor %xmm1,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vmovdqa 64(%esp),%xmm1
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 60(%esp),%ecx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm2
+ vpxor %xmm4,%xmm0,%xmm0
+ addl (%esp),%ebx
+ andl %edi,%esi
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ vmovdqa %xmm4,64(%esp)
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ vmovdqa %xmm3,%xmm4
+ vpaddd %xmm7,%xmm3,%xmm3
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ vpxor %xmm2,%xmm0,%xmm0
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 4(%esp),%eax
+ vpsrld $30,%xmm0,%xmm2
+ vmovdqa %xmm3,48(%esp)
+ andl %edx,%ebp
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ vpslld $2,%xmm0,%xmm0
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%esp),%edi
+ andl %ecx,%esi
+ vpor %xmm2,%xmm0,%xmm0
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vmovdqa 80(%esp),%xmm2
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 12(%esp),%edx
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm3
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%esp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm5,80(%esp)
+ movl %edx,%ebp
+ xorl %eax,%esi
+ vmovdqa %xmm4,%xmm5
+ vpaddd %xmm0,%xmm4,%xmm4
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vpxor %xmm3,%xmm1,%xmm1
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 20(%esp),%ebx
+ vpsrld $30,%xmm1,%xmm3
+ vmovdqa %xmm4,(%esp)
+ andl %edi,%ebp
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ vpslld $2,%xmm1,%xmm1
+ xorl %edi,%ebp
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 24(%esp),%eax
+ andl %edx,%esi
+ vpor %xmm3,%xmm1,%xmm1
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ vmovdqa 96(%esp),%xmm3
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%esp),%edi
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%esp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vpxor %xmm3,%xmm2,%xmm2
+ vmovdqa %xmm6,96(%esp)
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ vmovdqa %xmm5,%xmm6
+ vpaddd %xmm1,%xmm5,%xmm5
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ vpxor %xmm4,%xmm2,%xmm2
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 36(%esp),%ecx
+ vpsrld $30,%xmm2,%xmm4
+ vmovdqa %xmm5,16(%esp)
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ movl %edx,%esi
+ vpslld $2,%xmm2,%xmm2
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 40(%esp),%ebx
+ andl %edi,%esi
+ vpor %xmm4,%xmm2,%xmm2
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ vmovdqa 64(%esp),%xmm4
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 44(%esp),%eax
+ andl %edx,%ebp
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ vmovdqa %xmm7,64(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ vmovdqa %xmm6,%xmm7
+ vpaddd %xmm2,%xmm6,%xmm6
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpxor %xmm5,%xmm3,%xmm3
+ addl 52(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm3,%xmm5
+ vmovdqa %xmm6,32(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpor %xmm5,%xmm3,%xmm3
+ addl 60(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl (%esp),%eax
+ vpaddd %xmm3,%xmm7,%xmm7
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vmovdqa %xmm7,48(%esp)
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 4(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 8(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 12(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ movl 196(%esp),%ebp
+ cmpl 200(%esp),%ebp
+ je L006done
+ vmovdqa 160(%esp),%xmm7
+ vmovdqa 176(%esp),%xmm6
+ vmovdqu (%ebp),%xmm0
+ vmovdqu 16(%ebp),%xmm1
+ vmovdqu 32(%ebp),%xmm2
+ vmovdqu 48(%ebp),%xmm3
+ addl $64,%ebp
+ vpshufb %xmm6,%xmm0,%xmm0
+ movl %ebp,196(%esp)
+ vmovdqa %xmm7,96(%esp)
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ vpshufb %xmm6,%xmm1,%xmm1
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm7,%xmm0,%xmm4
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vmovdqa %xmm4,(%esp)
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ vpshufb %xmm6,%xmm2,%xmm2
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ vpaddd %xmm7,%xmm1,%xmm5
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vmovdqa %xmm5,16(%esp)
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ vpshufb %xmm6,%xmm3,%xmm3
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ vpaddd %xmm7,%xmm2,%xmm6
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vmovdqa %xmm6,32(%esp)
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,%ebx
+ movl %ecx,8(%ebp)
+ xorl %edx,%ebx
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ movl %esi,%ebp
+ andl %ebx,%esi
+ movl %ebp,%ebx
+ jmp L005loop
+.align 4,0x90
+L006done:
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vzeroall
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ movl 204(%esp),%esp
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 6,0x90
+LK_XX_XX:
+.long 1518500249,1518500249,1518500249,1518500249
+.long 1859775393,1859775393,1859775393,1859775393
+.long 2400959708,2400959708,2400959708,2400959708
+.long 3395469782,3395469782,3395469782,3395469782
+.long 66051,67438087,134810123,202182159
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
+.byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82
+.byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
+.byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/sha1-586-linux.S b/gen/bcm/sha1-586-linux.S
new file mode 100644
index 0000000..0e5754f
--- /dev/null
+++ b/gen/bcm/sha1-586-linux.S
@@ -0,0 +1,3788 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl sha1_block_data_order_nohw
+.hidden sha1_block_data_order_nohw
+.type sha1_block_data_order_nohw,@function
+.align 16
+sha1_block_data_order_nohw:
+.L_sha1_block_data_order_nohw_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%ebp
+ movl 24(%esp),%esi
+ movl 28(%esp),%eax
+ subl $76,%esp
+ shll $6,%eax
+ addl %esi,%eax
+ movl %eax,104(%esp)
+ movl 16(%ebp),%edi
+ jmp .L000loop
+.align 16
+.L000loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ movl %ecx,8(%esp)
+ movl %edx,12(%esp)
+ movl 16(%esi),%eax
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ movl %ecx,24(%esp)
+ movl %edx,28(%esp)
+ movl 32(%esi),%eax
+ movl 36(%esi),%ebx
+ movl 40(%esi),%ecx
+ movl 44(%esi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,40(%esp)
+ movl %edx,44(%esp)
+ movl 48(%esi),%eax
+ movl 52(%esi),%ebx
+ movl 56(%esi),%ecx
+ movl 60(%esi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ movl %eax,48(%esp)
+ movl %ebx,52(%esp)
+ movl %ecx,56(%esp)
+ movl %edx,60(%esp)
+ movl %esi,100(%esp)
+ movl (%ebp),%eax
+ movl 4(%ebp),%ebx
+ movl 8(%ebp),%ecx
+ movl 12(%ebp),%edx
+
+ movl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%ebp
+ xorl %edx,%esi
+ addl %edi,%ebp
+ movl (%esp),%edi
+ andl %ebx,%esi
+ rorl $2,%ebx
+ xorl %edx,%esi
+ leal 1518500249(%ebp,%edi,1),%ebp
+ addl %esi,%ebp
+
+ movl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ecx,%edi
+ addl %edx,%ebp
+ movl 4(%esp),%edx
+ andl %eax,%edi
+ rorl $2,%eax
+ xorl %ecx,%edi
+ leal 1518500249(%ebp,%edx,1),%ebp
+ addl %edi,%ebp
+
+ movl %eax,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%edx
+ addl %ecx,%ebp
+ movl 8(%esp),%ecx
+ andl %esi,%edx
+ rorl $2,%esi
+ xorl %ebx,%edx
+ leal 1518500249(%ebp,%ecx,1),%ebp
+ addl %edx,%ebp
+
+ movl %esi,%ecx
+ movl %ebp,%edx
+ roll $5,%ebp
+ xorl %eax,%ecx
+ addl %ebx,%ebp
+ movl 12(%esp),%ebx
+ andl %edi,%ecx
+ rorl $2,%edi
+ xorl %eax,%ecx
+ leal 1518500249(%ebp,%ebx,1),%ebp
+ addl %ecx,%ebp
+
+ movl %edi,%ebx
+ movl %ebp,%ecx
+ roll $5,%ebp
+ xorl %esi,%ebx
+ addl %eax,%ebp
+ movl 16(%esp),%eax
+ andl %edx,%ebx
+ rorl $2,%edx
+ xorl %esi,%ebx
+ leal 1518500249(%ebp,%eax,1),%ebp
+ addl %ebx,%ebp
+
+ movl %edx,%eax
+ movl %ebp,%ebx
+ roll $5,%ebp
+ xorl %edi,%eax
+ addl %esi,%ebp
+ movl 20(%esp),%esi
+ andl %ecx,%eax
+ rorl $2,%ecx
+ xorl %edi,%eax
+ leal 1518500249(%ebp,%esi,1),%ebp
+ addl %eax,%ebp
+
+ movl %ecx,%esi
+ movl %ebp,%eax
+ roll $5,%ebp
+ xorl %edx,%esi
+ addl %edi,%ebp
+ movl 24(%esp),%edi
+ andl %ebx,%esi
+ rorl $2,%ebx
+ xorl %edx,%esi
+ leal 1518500249(%ebp,%edi,1),%ebp
+ addl %esi,%ebp
+
+ movl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ecx,%edi
+ addl %edx,%ebp
+ movl 28(%esp),%edx
+ andl %eax,%edi
+ rorl $2,%eax
+ xorl %ecx,%edi
+ leal 1518500249(%ebp,%edx,1),%ebp
+ addl %edi,%ebp
+
+ movl %eax,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%edx
+ addl %ecx,%ebp
+ movl 32(%esp),%ecx
+ andl %esi,%edx
+ rorl $2,%esi
+ xorl %ebx,%edx
+ leal 1518500249(%ebp,%ecx,1),%ebp
+ addl %edx,%ebp
+
+ movl %esi,%ecx
+ movl %ebp,%edx
+ roll $5,%ebp
+ xorl %eax,%ecx
+ addl %ebx,%ebp
+ movl 36(%esp),%ebx
+ andl %edi,%ecx
+ rorl $2,%edi
+ xorl %eax,%ecx
+ leal 1518500249(%ebp,%ebx,1),%ebp
+ addl %ecx,%ebp
+
+ movl %edi,%ebx
+ movl %ebp,%ecx
+ roll $5,%ebp
+ xorl %esi,%ebx
+ addl %eax,%ebp
+ movl 40(%esp),%eax
+ andl %edx,%ebx
+ rorl $2,%edx
+ xorl %esi,%ebx
+ leal 1518500249(%ebp,%eax,1),%ebp
+ addl %ebx,%ebp
+
+ movl %edx,%eax
+ movl %ebp,%ebx
+ roll $5,%ebp
+ xorl %edi,%eax
+ addl %esi,%ebp
+ movl 44(%esp),%esi
+ andl %ecx,%eax
+ rorl $2,%ecx
+ xorl %edi,%eax
+ leal 1518500249(%ebp,%esi,1),%ebp
+ addl %eax,%ebp
+
+ movl %ecx,%esi
+ movl %ebp,%eax
+ roll $5,%ebp
+ xorl %edx,%esi
+ addl %edi,%ebp
+ movl 48(%esp),%edi
+ andl %ebx,%esi
+ rorl $2,%ebx
+ xorl %edx,%esi
+ leal 1518500249(%ebp,%edi,1),%ebp
+ addl %esi,%ebp
+
+ movl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ecx,%edi
+ addl %edx,%ebp
+ movl 52(%esp),%edx
+ andl %eax,%edi
+ rorl $2,%eax
+ xorl %ecx,%edi
+ leal 1518500249(%ebp,%edx,1),%ebp
+ addl %edi,%ebp
+
+ movl %eax,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%edx
+ addl %ecx,%ebp
+ movl 56(%esp),%ecx
+ andl %esi,%edx
+ rorl $2,%esi
+ xorl %ebx,%edx
+ leal 1518500249(%ebp,%ecx,1),%ebp
+ addl %edx,%ebp
+
+ movl %esi,%ecx
+ movl %ebp,%edx
+ roll $5,%ebp
+ xorl %eax,%ecx
+ addl %ebx,%ebp
+ movl 60(%esp),%ebx
+ andl %edi,%ecx
+ rorl $2,%edi
+ xorl %eax,%ecx
+ leal 1518500249(%ebp,%ebx,1),%ebp
+ movl (%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edi,%ebp
+ xorl 8(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 32(%esp),%ebx
+ andl %edx,%ebp
+ xorl 52(%esp),%ebx
+ roll $1,%ebx
+ xorl %esi,%ebp
+ addl %ebp,%eax
+ movl %ecx,%ebp
+ rorl $2,%edx
+ movl %ebx,(%esp)
+ roll $5,%ebp
+ leal 1518500249(%ebx,%eax,1),%ebx
+ movl 4(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %edx,%ebp
+ xorl 12(%esp),%eax
+ xorl %edi,%ebp
+ xorl 36(%esp),%eax
+ andl %ecx,%ebp
+ xorl 56(%esp),%eax
+ roll $1,%eax
+ xorl %edi,%ebp
+ addl %ebp,%esi
+ movl %ebx,%ebp
+ rorl $2,%ecx
+ movl %eax,4(%esp)
+ roll $5,%ebp
+ leal 1518500249(%eax,%esi,1),%eax
+ movl 8(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ecx,%ebp
+ xorl 16(%esp),%esi
+ xorl %edx,%ebp
+ xorl 40(%esp),%esi
+ andl %ebx,%ebp
+ xorl 60(%esp),%esi
+ roll $1,%esi
+ xorl %edx,%ebp
+ addl %ebp,%edi
+ movl %eax,%ebp
+ rorl $2,%ebx
+ movl %esi,8(%esp)
+ roll $5,%ebp
+ leal 1518500249(%esi,%edi,1),%esi
+ movl 12(%esp),%edi
+ addl %ebp,%esi
+
+ movl %ebx,%ebp
+ xorl 20(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 44(%esp),%edi
+ andl %eax,%ebp
+ xorl (%esp),%edi
+ roll $1,%edi
+ xorl %ecx,%ebp
+ addl %ebp,%edx
+ movl %esi,%ebp
+ rorl $2,%eax
+ movl %edi,12(%esp)
+ roll $5,%ebp
+ leal 1518500249(%edi,%edx,1),%edi
+ movl 16(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 24(%esp),%edx
+ xorl %eax,%ebp
+ xorl 48(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 4(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,16(%esp)
+ leal 1859775393(%edx,%ecx,1),%edx
+ movl 20(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 28(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 52(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 8(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,20(%esp)
+ leal 1859775393(%ecx,%ebx,1),%ecx
+ movl 24(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 32(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 56(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 12(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,24(%esp)
+ leal 1859775393(%ebx,%eax,1),%ebx
+ movl 28(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 36(%esp),%eax
+ xorl %edx,%ebp
+ xorl 60(%esp),%eax
+ xorl %edi,%ebp
+ xorl 16(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,28(%esp)
+ leal 1859775393(%eax,%esi,1),%eax
+ movl 32(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl 40(%esp),%esi
+ xorl %ecx,%ebp
+ xorl (%esp),%esi
+ xorl %edx,%ebp
+ xorl 20(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,32(%esp)
+ leal 1859775393(%esi,%edi,1),%esi
+ movl 36(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 44(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 4(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 24(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,36(%esp)
+ leal 1859775393(%edi,%edx,1),%edi
+ movl 40(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 48(%esp),%edx
+ xorl %eax,%ebp
+ xorl 8(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 28(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,40(%esp)
+ leal 1859775393(%edx,%ecx,1),%edx
+ movl 44(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 52(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 12(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 32(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,44(%esp)
+ leal 1859775393(%ecx,%ebx,1),%ecx
+ movl 48(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 56(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 16(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 36(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,48(%esp)
+ leal 1859775393(%ebx,%eax,1),%ebx
+ movl 52(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 60(%esp),%eax
+ xorl %edx,%ebp
+ xorl 20(%esp),%eax
+ xorl %edi,%ebp
+ xorl 40(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,52(%esp)
+ leal 1859775393(%eax,%esi,1),%eax
+ movl 56(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl (%esp),%esi
+ xorl %ecx,%ebp
+ xorl 24(%esp),%esi
+ xorl %edx,%ebp
+ xorl 44(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,56(%esp)
+ leal 1859775393(%esi,%edi,1),%esi
+ movl 60(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 4(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 28(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 48(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,60(%esp)
+ leal 1859775393(%edi,%edx,1),%edi
+ movl (%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 8(%esp),%edx
+ xorl %eax,%ebp
+ xorl 32(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 52(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,(%esp)
+ leal 1859775393(%edx,%ecx,1),%edx
+ movl 4(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 12(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 36(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 56(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,4(%esp)
+ leal 1859775393(%ecx,%ebx,1),%ecx
+ movl 8(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 16(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 40(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 60(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,8(%esp)
+ leal 1859775393(%ebx,%eax,1),%ebx
+ movl 12(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 20(%esp),%eax
+ xorl %edx,%ebp
+ xorl 44(%esp),%eax
+ xorl %edi,%ebp
+ xorl (%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,12(%esp)
+ leal 1859775393(%eax,%esi,1),%eax
+ movl 16(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl 24(%esp),%esi
+ xorl %ecx,%ebp
+ xorl 48(%esp),%esi
+ xorl %edx,%ebp
+ xorl 4(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,16(%esp)
+ leal 1859775393(%esi,%edi,1),%esi
+ movl 20(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 28(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 52(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 8(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,20(%esp)
+ leal 1859775393(%edi,%edx,1),%edi
+ movl 24(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 32(%esp),%edx
+ xorl %eax,%ebp
+ xorl 56(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 12(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,24(%esp)
+ leal 1859775393(%edx,%ecx,1),%edx
+ movl 28(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 36(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 60(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 16(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,28(%esp)
+ leal 1859775393(%ecx,%ebx,1),%ecx
+ movl 32(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edi,%ebp
+ xorl 40(%esp),%ebx
+ xorl %esi,%ebp
+ xorl (%esp),%ebx
+ andl %edx,%ebp
+ xorl 20(%esp),%ebx
+ roll $1,%ebx
+ addl %eax,%ebp
+ rorl $2,%edx
+ movl %ecx,%eax
+ roll $5,%eax
+ movl %ebx,32(%esp)
+ leal 2400959708(%ebx,%ebp,1),%ebx
+ movl %edi,%ebp
+ addl %eax,%ebx
+ andl %esi,%ebp
+ movl 36(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %edx,%ebp
+ xorl 44(%esp),%eax
+ xorl %edi,%ebp
+ xorl 4(%esp),%eax
+ andl %ecx,%ebp
+ xorl 24(%esp),%eax
+ roll $1,%eax
+ addl %esi,%ebp
+ rorl $2,%ecx
+ movl %ebx,%esi
+ roll $5,%esi
+ movl %eax,36(%esp)
+ leal 2400959708(%eax,%ebp,1),%eax
+ movl %edx,%ebp
+ addl %esi,%eax
+ andl %edi,%ebp
+ movl 40(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ecx,%ebp
+ xorl 48(%esp),%esi
+ xorl %edx,%ebp
+ xorl 8(%esp),%esi
+ andl %ebx,%ebp
+ xorl 28(%esp),%esi
+ roll $1,%esi
+ addl %edi,%ebp
+ rorl $2,%ebx
+ movl %eax,%edi
+ roll $5,%edi
+ movl %esi,40(%esp)
+ leal 2400959708(%esi,%ebp,1),%esi
+ movl %ecx,%ebp
+ addl %edi,%esi
+ andl %edx,%ebp
+ movl 44(%esp),%edi
+ addl %ebp,%esi
+
+ movl %ebx,%ebp
+ xorl 52(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 12(%esp),%edi
+ andl %eax,%ebp
+ xorl 32(%esp),%edi
+ roll $1,%edi
+ addl %edx,%ebp
+ rorl $2,%eax
+ movl %esi,%edx
+ roll $5,%edx
+ movl %edi,44(%esp)
+ leal 2400959708(%edi,%ebp,1),%edi
+ movl %ebx,%ebp
+ addl %edx,%edi
+ andl %ecx,%ebp
+ movl 48(%esp),%edx
+ addl %ebp,%edi
+
+ movl %eax,%ebp
+ xorl 56(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 16(%esp),%edx
+ andl %esi,%ebp
+ xorl 36(%esp),%edx
+ roll $1,%edx
+ addl %ecx,%ebp
+ rorl $2,%esi
+ movl %edi,%ecx
+ roll $5,%ecx
+ movl %edx,48(%esp)
+ leal 2400959708(%edx,%ebp,1),%edx
+ movl %eax,%ebp
+ addl %ecx,%edx
+ andl %ebx,%ebp
+ movl 52(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %esi,%ebp
+ xorl 60(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 20(%esp),%ecx
+ andl %edi,%ebp
+ xorl 40(%esp),%ecx
+ roll $1,%ecx
+ addl %ebx,%ebp
+ rorl $2,%edi
+ movl %edx,%ebx
+ roll $5,%ebx
+ movl %ecx,52(%esp)
+ leal 2400959708(%ecx,%ebp,1),%ecx
+ movl %esi,%ebp
+ addl %ebx,%ecx
+ andl %eax,%ebp
+ movl 56(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edi,%ebp
+ xorl (%esp),%ebx
+ xorl %esi,%ebp
+ xorl 24(%esp),%ebx
+ andl %edx,%ebp
+ xorl 44(%esp),%ebx
+ roll $1,%ebx
+ addl %eax,%ebp
+ rorl $2,%edx
+ movl %ecx,%eax
+ roll $5,%eax
+ movl %ebx,56(%esp)
+ leal 2400959708(%ebx,%ebp,1),%ebx
+ movl %edi,%ebp
+ addl %eax,%ebx
+ andl %esi,%ebp
+ movl 60(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %edx,%ebp
+ xorl 4(%esp),%eax
+ xorl %edi,%ebp
+ xorl 28(%esp),%eax
+ andl %ecx,%ebp
+ xorl 48(%esp),%eax
+ roll $1,%eax
+ addl %esi,%ebp
+ rorl $2,%ecx
+ movl %ebx,%esi
+ roll $5,%esi
+ movl %eax,60(%esp)
+ leal 2400959708(%eax,%ebp,1),%eax
+ movl %edx,%ebp
+ addl %esi,%eax
+ andl %edi,%ebp
+ movl (%esp),%esi
+ addl %ebp,%eax
+
+ movl %ecx,%ebp
+ xorl 8(%esp),%esi
+ xorl %edx,%ebp
+ xorl 32(%esp),%esi
+ andl %ebx,%ebp
+ xorl 52(%esp),%esi
+ roll $1,%esi
+ addl %edi,%ebp
+ rorl $2,%ebx
+ movl %eax,%edi
+ roll $5,%edi
+ movl %esi,(%esp)
+ leal 2400959708(%esi,%ebp,1),%esi
+ movl %ecx,%ebp
+ addl %edi,%esi
+ andl %edx,%ebp
+ movl 4(%esp),%edi
+ addl %ebp,%esi
+
+ movl %ebx,%ebp
+ xorl 12(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 36(%esp),%edi
+ andl %eax,%ebp
+ xorl 56(%esp),%edi
+ roll $1,%edi
+ addl %edx,%ebp
+ rorl $2,%eax
+ movl %esi,%edx
+ roll $5,%edx
+ movl %edi,4(%esp)
+ leal 2400959708(%edi,%ebp,1),%edi
+ movl %ebx,%ebp
+ addl %edx,%edi
+ andl %ecx,%ebp
+ movl 8(%esp),%edx
+ addl %ebp,%edi
+
+ movl %eax,%ebp
+ xorl 16(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 40(%esp),%edx
+ andl %esi,%ebp
+ xorl 60(%esp),%edx
+ roll $1,%edx
+ addl %ecx,%ebp
+ rorl $2,%esi
+ movl %edi,%ecx
+ roll $5,%ecx
+ movl %edx,8(%esp)
+ leal 2400959708(%edx,%ebp,1),%edx
+ movl %eax,%ebp
+ addl %ecx,%edx
+ andl %ebx,%ebp
+ movl 12(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %esi,%ebp
+ xorl 20(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 44(%esp),%ecx
+ andl %edi,%ebp
+ xorl (%esp),%ecx
+ roll $1,%ecx
+ addl %ebx,%ebp
+ rorl $2,%edi
+ movl %edx,%ebx
+ roll $5,%ebx
+ movl %ecx,12(%esp)
+ leal 2400959708(%ecx,%ebp,1),%ecx
+ movl %esi,%ebp
+ addl %ebx,%ecx
+ andl %eax,%ebp
+ movl 16(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edi,%ebp
+ xorl 24(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 48(%esp),%ebx
+ andl %edx,%ebp
+ xorl 4(%esp),%ebx
+ roll $1,%ebx
+ addl %eax,%ebp
+ rorl $2,%edx
+ movl %ecx,%eax
+ roll $5,%eax
+ movl %ebx,16(%esp)
+ leal 2400959708(%ebx,%ebp,1),%ebx
+ movl %edi,%ebp
+ addl %eax,%ebx
+ andl %esi,%ebp
+ movl 20(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %edx,%ebp
+ xorl 28(%esp),%eax
+ xorl %edi,%ebp
+ xorl 52(%esp),%eax
+ andl %ecx,%ebp
+ xorl 8(%esp),%eax
+ roll $1,%eax
+ addl %esi,%ebp
+ rorl $2,%ecx
+ movl %ebx,%esi
+ roll $5,%esi
+ movl %eax,20(%esp)
+ leal 2400959708(%eax,%ebp,1),%eax
+ movl %edx,%ebp
+ addl %esi,%eax
+ andl %edi,%ebp
+ movl 24(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ecx,%ebp
+ xorl 32(%esp),%esi
+ xorl %edx,%ebp
+ xorl 56(%esp),%esi
+ andl %ebx,%ebp
+ xorl 12(%esp),%esi
+ roll $1,%esi
+ addl %edi,%ebp
+ rorl $2,%ebx
+ movl %eax,%edi
+ roll $5,%edi
+ movl %esi,24(%esp)
+ leal 2400959708(%esi,%ebp,1),%esi
+ movl %ecx,%ebp
+ addl %edi,%esi
+ andl %edx,%ebp
+ movl 28(%esp),%edi
+ addl %ebp,%esi
+
+ movl %ebx,%ebp
+ xorl 36(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 60(%esp),%edi
+ andl %eax,%ebp
+ xorl 16(%esp),%edi
+ roll $1,%edi
+ addl %edx,%ebp
+ rorl $2,%eax
+ movl %esi,%edx
+ roll $5,%edx
+ movl %edi,28(%esp)
+ leal 2400959708(%edi,%ebp,1),%edi
+ movl %ebx,%ebp
+ addl %edx,%edi
+ andl %ecx,%ebp
+ movl 32(%esp),%edx
+ addl %ebp,%edi
+
+ movl %eax,%ebp
+ xorl 40(%esp),%edx
+ xorl %ebx,%ebp
+ xorl (%esp),%edx
+ andl %esi,%ebp
+ xorl 20(%esp),%edx
+ roll $1,%edx
+ addl %ecx,%ebp
+ rorl $2,%esi
+ movl %edi,%ecx
+ roll $5,%ecx
+ movl %edx,32(%esp)
+ leal 2400959708(%edx,%ebp,1),%edx
+ movl %eax,%ebp
+ addl %ecx,%edx
+ andl %ebx,%ebp
+ movl 36(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %esi,%ebp
+ xorl 44(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 4(%esp),%ecx
+ andl %edi,%ebp
+ xorl 24(%esp),%ecx
+ roll $1,%ecx
+ addl %ebx,%ebp
+ rorl $2,%edi
+ movl %edx,%ebx
+ roll $5,%ebx
+ movl %ecx,36(%esp)
+ leal 2400959708(%ecx,%ebp,1),%ecx
+ movl %esi,%ebp
+ addl %ebx,%ecx
+ andl %eax,%ebp
+ movl 40(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edi,%ebp
+ xorl 48(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 8(%esp),%ebx
+ andl %edx,%ebp
+ xorl 28(%esp),%ebx
+ roll $1,%ebx
+ addl %eax,%ebp
+ rorl $2,%edx
+ movl %ecx,%eax
+ roll $5,%eax
+ movl %ebx,40(%esp)
+ leal 2400959708(%ebx,%ebp,1),%ebx
+ movl %edi,%ebp
+ addl %eax,%ebx
+ andl %esi,%ebp
+ movl 44(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %edx,%ebp
+ xorl 52(%esp),%eax
+ xorl %edi,%ebp
+ xorl 12(%esp),%eax
+ andl %ecx,%ebp
+ xorl 32(%esp),%eax
+ roll $1,%eax
+ addl %esi,%ebp
+ rorl $2,%ecx
+ movl %ebx,%esi
+ roll $5,%esi
+ movl %eax,44(%esp)
+ leal 2400959708(%eax,%ebp,1),%eax
+ movl %edx,%ebp
+ addl %esi,%eax
+ andl %edi,%ebp
+ movl 48(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl 56(%esp),%esi
+ xorl %ecx,%ebp
+ xorl 16(%esp),%esi
+ xorl %edx,%ebp
+ xorl 36(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,48(%esp)
+ leal 3395469782(%esi,%edi,1),%esi
+ movl 52(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 60(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 20(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 40(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,52(%esp)
+ leal 3395469782(%edi,%edx,1),%edi
+ movl 56(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl (%esp),%edx
+ xorl %eax,%ebp
+ xorl 24(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 44(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,56(%esp)
+ leal 3395469782(%edx,%ecx,1),%edx
+ movl 60(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 4(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 28(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 48(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,60(%esp)
+ leal 3395469782(%ecx,%ebx,1),%ecx
+ movl (%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 8(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 32(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 52(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,(%esp)
+ leal 3395469782(%ebx,%eax,1),%ebx
+ movl 4(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 12(%esp),%eax
+ xorl %edx,%ebp
+ xorl 36(%esp),%eax
+ xorl %edi,%ebp
+ xorl 56(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,4(%esp)
+ leal 3395469782(%eax,%esi,1),%eax
+ movl 8(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl 16(%esp),%esi
+ xorl %ecx,%ebp
+ xorl 40(%esp),%esi
+ xorl %edx,%ebp
+ xorl 60(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,8(%esp)
+ leal 3395469782(%esi,%edi,1),%esi
+ movl 12(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 20(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 44(%esp),%edi
+ xorl %ecx,%ebp
+ xorl (%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,12(%esp)
+ leal 3395469782(%edi,%edx,1),%edi
+ movl 16(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 24(%esp),%edx
+ xorl %eax,%ebp
+ xorl 48(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 4(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,16(%esp)
+ leal 3395469782(%edx,%ecx,1),%edx
+ movl 20(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 28(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 52(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 8(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,20(%esp)
+ leal 3395469782(%ecx,%ebx,1),%ecx
+ movl 24(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 32(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 56(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 12(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,24(%esp)
+ leal 3395469782(%ebx,%eax,1),%ebx
+ movl 28(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 36(%esp),%eax
+ xorl %edx,%ebp
+ xorl 60(%esp),%eax
+ xorl %edi,%ebp
+ xorl 16(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,28(%esp)
+ leal 3395469782(%eax,%esi,1),%eax
+ movl 32(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl 40(%esp),%esi
+ xorl %ecx,%ebp
+ xorl (%esp),%esi
+ xorl %edx,%ebp
+ xorl 20(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,32(%esp)
+ leal 3395469782(%esi,%edi,1),%esi
+ movl 36(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 44(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 4(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 24(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,36(%esp)
+ leal 3395469782(%edi,%edx,1),%edi
+ movl 40(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 48(%esp),%edx
+ xorl %eax,%ebp
+ xorl 8(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 28(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,40(%esp)
+ leal 3395469782(%edx,%ecx,1),%edx
+ movl 44(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 52(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 12(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 32(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,44(%esp)
+ leal 3395469782(%ecx,%ebx,1),%ecx
+ movl 48(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 56(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 16(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 36(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,48(%esp)
+ leal 3395469782(%ebx,%eax,1),%ebx
+ movl 52(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 60(%esp),%eax
+ xorl %edx,%ebp
+ xorl 20(%esp),%eax
+ xorl %edi,%ebp
+ xorl 40(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ leal 3395469782(%eax,%esi,1),%eax
+ movl 56(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl (%esp),%esi
+ xorl %ecx,%ebp
+ xorl 24(%esp),%esi
+ xorl %edx,%ebp
+ xorl 44(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ leal 3395469782(%esi,%edi,1),%esi
+ movl 60(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 4(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 28(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 48(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ leal 3395469782(%edi,%edx,1),%edi
+ addl %ebp,%edi
+ movl 96(%esp),%ebp
+ movl 100(%esp),%edx
+ addl (%ebp),%edi
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%eax
+ addl 12(%ebp),%ebx
+ addl 16(%ebp),%ecx
+ movl %edi,(%ebp)
+ addl $64,%edx
+ movl %esi,4(%ebp)
+ cmpl 104(%esp),%edx
+ movl %eax,8(%ebp)
+ movl %ecx,%edi
+ movl %ebx,12(%ebp)
+ movl %edx,%esi
+ movl %ecx,16(%ebp)
+ jb .L000loop
+ addl $76,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size sha1_block_data_order_nohw,.-.L_sha1_block_data_order_nohw_begin
+.globl sha1_block_data_order_ssse3
+.hidden sha1_block_data_order_ssse3
+.type sha1_block_data_order_ssse3,@function
+.align 16
+sha1_block_data_order_ssse3:
+.L_sha1_block_data_order_ssse3_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call .L001pic_point
+.L001pic_point:
+ popl %ebp
+ leal .LK_XX_XX-.L001pic_point(%ebp),%ebp
+ movdqa (%ebp),%xmm7
+ movdqa 16(%ebp),%xmm0
+ movdqa 32(%ebp),%xmm1
+ movdqa 48(%ebp),%xmm2
+ movdqa 64(%ebp),%xmm6
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebp
+ movl 28(%esp),%edx
+ movl %esp,%esi
+ subl $208,%esp
+ andl $-64,%esp
+ movdqa %xmm0,112(%esp)
+ movdqa %xmm1,128(%esp)
+ movdqa %xmm2,144(%esp)
+ shll $6,%edx
+ movdqa %xmm7,160(%esp)
+ addl %ebp,%edx
+ movdqa %xmm6,176(%esp)
+ addl $64,%ebp
+ movl %edi,192(%esp)
+ movl %ebp,196(%esp)
+ movl %edx,200(%esp)
+ movl %esi,204(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl 16(%edi),%edi
+ movl %ebx,%esi
+ movdqu -64(%ebp),%xmm0
+ movdqu -48(%ebp),%xmm1
+ movdqu -32(%ebp),%xmm2
+ movdqu -16(%ebp),%xmm3
+.byte 102,15,56,0,198
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+ movdqa %xmm7,96(%esp)
+.byte 102,15,56,0,222
+ paddd %xmm7,%xmm0
+ paddd %xmm7,%xmm1
+ paddd %xmm7,%xmm2
+ movdqa %xmm0,(%esp)
+ psubd %xmm7,%xmm0
+ movdqa %xmm1,16(%esp)
+ psubd %xmm7,%xmm1
+ movdqa %xmm2,32(%esp)
+ movl %ecx,%ebp
+ psubd %xmm7,%xmm2
+ xorl %edx,%ebp
+ pshufd $238,%xmm0,%xmm4
+ andl %ebp,%esi
+ jmp .L002loop
+.align 16
+.L002loop:
+ rorl $2,%ebx
+ xorl %edx,%esi
+ movl %eax,%ebp
+ punpcklqdq %xmm1,%xmm4
+ movdqa %xmm3,%xmm6
+ addl (%esp),%edi
+ xorl %ecx,%ebx
+ paddd %xmm3,%xmm7
+ movdqa %xmm0,64(%esp)
+ roll $5,%eax
+ addl %esi,%edi
+ psrldq $4,%xmm6
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ pxor %xmm0,%xmm4
+ addl %eax,%edi
+ rorl $7,%eax
+ pxor %xmm2,%xmm6
+ xorl %ecx,%ebp
+ movl %edi,%esi
+ addl 4(%esp),%edx
+ pxor %xmm6,%xmm4
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm7,48(%esp)
+ addl %ebp,%edx
+ andl %eax,%esi
+ movdqa %xmm4,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ movdqa %xmm4,%xmm6
+ xorl %ebx,%esi
+ pslldq $12,%xmm0
+ paddd %xmm4,%xmm4
+ movl %edx,%ebp
+ addl 8(%esp),%ecx
+ psrld $31,%xmm6
+ xorl %eax,%edi
+ roll $5,%edx
+ movdqa %xmm0,%xmm7
+ addl %esi,%ecx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ psrld $30,%xmm0
+ addl %edx,%ecx
+ rorl $7,%edx
+ por %xmm6,%xmm4
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ addl 12(%esp),%ebx
+ pslld $2,%xmm7
+ xorl %edi,%edx
+ roll $5,%ecx
+ pxor %xmm0,%xmm4
+ movdqa 96(%esp),%xmm0
+ addl %ebp,%ebx
+ andl %edx,%esi
+ pxor %xmm7,%xmm4
+ pshufd $238,%xmm1,%xmm5
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ movl %ebx,%ebp
+ punpcklqdq %xmm2,%xmm5
+ movdqa %xmm4,%xmm7
+ addl 16(%esp),%eax
+ xorl %edx,%ecx
+ paddd %xmm4,%xmm0
+ movdqa %xmm1,80(%esp)
+ roll $5,%ebx
+ addl %esi,%eax
+ psrldq $4,%xmm7
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ pxor %xmm1,%xmm5
+ addl %ebx,%eax
+ rorl $7,%ebx
+ pxor %xmm3,%xmm7
+ xorl %edx,%ebp
+ movl %eax,%esi
+ addl 20(%esp),%edi
+ pxor %xmm7,%xmm5
+ xorl %ecx,%ebx
+ roll $5,%eax
+ movdqa %xmm0,(%esp)
+ addl %ebp,%edi
+ andl %ebx,%esi
+ movdqa %xmm5,%xmm1
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ rorl $7,%eax
+ movdqa %xmm5,%xmm7
+ xorl %ecx,%esi
+ pslldq $12,%xmm1
+ paddd %xmm5,%xmm5
+ movl %edi,%ebp
+ addl 24(%esp),%edx
+ psrld $31,%xmm7
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm1,%xmm0
+ addl %esi,%edx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ psrld $30,%xmm1
+ addl %edi,%edx
+ rorl $7,%edi
+ por %xmm7,%xmm5
+ xorl %ebx,%ebp
+ movl %edx,%esi
+ addl 28(%esp),%ecx
+ pslld $2,%xmm0
+ xorl %eax,%edi
+ roll $5,%edx
+ pxor %xmm1,%xmm5
+ movdqa 112(%esp),%xmm1
+ addl %ebp,%ecx
+ andl %edi,%esi
+ pxor %xmm0,%xmm5
+ pshufd $238,%xmm2,%xmm6
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ punpcklqdq %xmm3,%xmm6
+ movdqa %xmm5,%xmm0
+ addl 32(%esp),%ebx
+ xorl %edi,%edx
+ paddd %xmm5,%xmm1
+ movdqa %xmm2,96(%esp)
+ roll $5,%ecx
+ addl %esi,%ebx
+ psrldq $4,%xmm0
+ andl %edx,%ebp
+ xorl %edi,%edx
+ pxor %xmm2,%xmm6
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ pxor %xmm4,%xmm0
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ addl 36(%esp),%eax
+ pxor %xmm0,%xmm6
+ xorl %edx,%ecx
+ roll $5,%ebx
+ movdqa %xmm1,16(%esp)
+ addl %ebp,%eax
+ andl %ecx,%esi
+ movdqa %xmm6,%xmm2
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ rorl $7,%ebx
+ movdqa %xmm6,%xmm0
+ xorl %edx,%esi
+ pslldq $12,%xmm2
+ paddd %xmm6,%xmm6
+ movl %eax,%ebp
+ addl 40(%esp),%edi
+ psrld $31,%xmm0
+ xorl %ecx,%ebx
+ roll $5,%eax
+ movdqa %xmm2,%xmm1
+ addl %esi,%edi
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ psrld $30,%xmm2
+ addl %eax,%edi
+ rorl $7,%eax
+ por %xmm0,%xmm6
+ xorl %ecx,%ebp
+ movdqa 64(%esp),%xmm0
+ movl %edi,%esi
+ addl 44(%esp),%edx
+ pslld $2,%xmm1
+ xorl %ebx,%eax
+ roll $5,%edi
+ pxor %xmm2,%xmm6
+ movdqa 112(%esp),%xmm2
+ addl %ebp,%edx
+ andl %eax,%esi
+ pxor %xmm1,%xmm6
+ pshufd $238,%xmm3,%xmm7
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ xorl %ebx,%esi
+ movl %edx,%ebp
+ punpcklqdq %xmm4,%xmm7
+ movdqa %xmm6,%xmm1
+ addl 48(%esp),%ecx
+ xorl %eax,%edi
+ paddd %xmm6,%xmm2
+ movdqa %xmm3,64(%esp)
+ roll $5,%edx
+ addl %esi,%ecx
+ psrldq $4,%xmm1
+ andl %edi,%ebp
+ xorl %eax,%edi
+ pxor %xmm3,%xmm7
+ addl %edx,%ecx
+ rorl $7,%edx
+ pxor %xmm5,%xmm1
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ addl 52(%esp),%ebx
+ pxor %xmm1,%xmm7
+ xorl %edi,%edx
+ roll $5,%ecx
+ movdqa %xmm2,32(%esp)
+ addl %ebp,%ebx
+ andl %edx,%esi
+ movdqa %xmm7,%xmm3
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ movdqa %xmm7,%xmm1
+ xorl %edi,%esi
+ pslldq $12,%xmm3
+ paddd %xmm7,%xmm7
+ movl %ebx,%ebp
+ addl 56(%esp),%eax
+ psrld $31,%xmm1
+ xorl %edx,%ecx
+ roll $5,%ebx
+ movdqa %xmm3,%xmm2
+ addl %esi,%eax
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ psrld $30,%xmm3
+ addl %ebx,%eax
+ rorl $7,%ebx
+ por %xmm1,%xmm7
+ xorl %edx,%ebp
+ movdqa 80(%esp),%xmm1
+ movl %eax,%esi
+ addl 60(%esp),%edi
+ pslld $2,%xmm2
+ xorl %ecx,%ebx
+ roll $5,%eax
+ pxor %xmm3,%xmm7
+ movdqa 112(%esp),%xmm3
+ addl %ebp,%edi
+ andl %ebx,%esi
+ pxor %xmm2,%xmm7
+ pshufd $238,%xmm6,%xmm2
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ rorl $7,%eax
+ pxor %xmm4,%xmm0
+ punpcklqdq %xmm7,%xmm2
+ xorl %ecx,%esi
+ movl %edi,%ebp
+ addl (%esp),%edx
+ pxor %xmm1,%xmm0
+ movdqa %xmm4,80(%esp)
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm3,%xmm4
+ addl %esi,%edx
+ paddd %xmm7,%xmm3
+ andl %eax,%ebp
+ pxor %xmm2,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ xorl %ebx,%ebp
+ movdqa %xmm0,%xmm2
+ movdqa %xmm3,48(%esp)
+ movl %edx,%esi
+ addl 4(%esp),%ecx
+ xorl %eax,%edi
+ roll $5,%edx
+ pslld $2,%xmm0
+ addl %ebp,%ecx
+ andl %edi,%esi
+ psrld $30,%xmm2
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ addl 8(%esp),%ebx
+ xorl %edi,%edx
+ roll $5,%ecx
+ por %xmm2,%xmm0
+ addl %esi,%ebx
+ andl %edx,%ebp
+ movdqa 96(%esp),%xmm2
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 12(%esp),%eax
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ pshufd $238,%xmm7,%xmm3
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 16(%esp),%edi
+ pxor %xmm5,%xmm1
+ punpcklqdq %xmm0,%xmm3
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,96(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ movdqa %xmm4,%xmm5
+ rorl $7,%ebx
+ paddd %xmm0,%xmm4
+ addl %eax,%edi
+ pxor %xmm3,%xmm1
+ addl 20(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ movdqa %xmm1,%xmm3
+ movdqa %xmm4,(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm1
+ addl 24(%esp),%ecx
+ xorl %eax,%esi
+ psrld $30,%xmm3
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ por %xmm3,%xmm1
+ addl 28(%esp),%ebx
+ xorl %edi,%ebp
+ movdqa 64(%esp),%xmm3
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ pshufd $238,%xmm0,%xmm4
+ addl %ecx,%ebx
+ addl 32(%esp),%eax
+ pxor %xmm6,%xmm2
+ punpcklqdq %xmm1,%xmm4
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ pxor %xmm3,%xmm2
+ movdqa %xmm6,64(%esp)
+ addl %esi,%eax
+ xorl %edx,%ebp
+ movdqa 128(%esp),%xmm6
+ rorl $7,%ecx
+ paddd %xmm1,%xmm5
+ addl %ebx,%eax
+ pxor %xmm4,%xmm2
+ addl 36(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ movdqa %xmm2,%xmm4
+ movdqa %xmm5,16(%esp)
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ pslld $2,%xmm2
+ addl 40(%esp),%edx
+ xorl %ebx,%esi
+ psrld $30,%xmm4
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ por %xmm4,%xmm2
+ addl 44(%esp),%ecx
+ xorl %eax,%ebp
+ movdqa 80(%esp),%xmm4
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ pshufd $238,%xmm1,%xmm5
+ addl %edx,%ecx
+ addl 48(%esp),%ebx
+ pxor %xmm7,%xmm3
+ punpcklqdq %xmm2,%xmm5
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ pxor %xmm4,%xmm3
+ movdqa %xmm7,80(%esp)
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ movdqa %xmm6,%xmm7
+ rorl $7,%edx
+ paddd %xmm2,%xmm6
+ addl %ecx,%ebx
+ pxor %xmm5,%xmm3
+ addl 52(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ movdqa %xmm3,%xmm5
+ movdqa %xmm6,32(%esp)
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ pslld $2,%xmm3
+ addl 56(%esp),%edi
+ xorl %ecx,%esi
+ psrld $30,%xmm5
+ movl %eax,%ebp
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ por %xmm5,%xmm3
+ addl 60(%esp),%edx
+ xorl %ebx,%ebp
+ movdqa 96(%esp),%xmm5
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ pshufd $238,%xmm2,%xmm6
+ addl %edi,%edx
+ addl (%esp),%ecx
+ pxor %xmm0,%xmm4
+ punpcklqdq %xmm3,%xmm6
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ pxor %xmm5,%xmm4
+ movdqa %xmm0,96(%esp)
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ movdqa %xmm7,%xmm0
+ rorl $7,%edi
+ paddd %xmm3,%xmm7
+ addl %edx,%ecx
+ pxor %xmm6,%xmm4
+ addl 4(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ movdqa %xmm4,%xmm6
+ movdqa %xmm7,48(%esp)
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ pslld $2,%xmm4
+ addl 8(%esp),%eax
+ xorl %edx,%esi
+ psrld $30,%xmm6
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ por %xmm6,%xmm4
+ addl 12(%esp),%edi
+ xorl %ecx,%ebp
+ movdqa 64(%esp),%xmm6
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ pshufd $238,%xmm3,%xmm7
+ addl %eax,%edi
+ addl 16(%esp),%edx
+ pxor %xmm1,%xmm5
+ punpcklqdq %xmm4,%xmm7
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ pxor %xmm6,%xmm5
+ movdqa %xmm1,64(%esp)
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ movdqa %xmm0,%xmm1
+ rorl $7,%eax
+ paddd %xmm4,%xmm0
+ addl %edi,%edx
+ pxor %xmm7,%xmm5
+ addl 20(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ movdqa %xmm5,%xmm7
+ movdqa %xmm0,(%esp)
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ pslld $2,%xmm5
+ addl 24(%esp),%ebx
+ xorl %edi,%esi
+ psrld $30,%xmm7
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ por %xmm7,%xmm5
+ addl 28(%esp),%eax
+ movdqa 80(%esp),%xmm7
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ pshufd $238,%xmm4,%xmm0
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 32(%esp),%edi
+ pxor %xmm2,%xmm6
+ punpcklqdq %xmm5,%xmm0
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ pxor %xmm7,%xmm6
+ movdqa %xmm2,80(%esp)
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ roll $5,%eax
+ movdqa %xmm1,%xmm2
+ addl %esi,%edi
+ paddd %xmm5,%xmm1
+ xorl %ebx,%ebp
+ pxor %xmm0,%xmm6
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 36(%esp),%edx
+ andl %ebx,%ebp
+ movdqa %xmm6,%xmm0
+ movdqa %xmm1,16(%esp)
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ roll $5,%edi
+ pslld $2,%xmm6
+ addl %ebp,%edx
+ xorl %eax,%esi
+ psrld $30,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 40(%esp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ rorl $7,%edi
+ por %xmm0,%xmm6
+ movl %edx,%ebp
+ xorl %eax,%esi
+ movdqa 96(%esp),%xmm0
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ pshufd $238,%xmm5,%xmm1
+ addl 44(%esp),%ebx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ rorl $7,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 48(%esp),%eax
+ pxor %xmm3,%xmm7
+ punpcklqdq %xmm6,%xmm1
+ andl %edx,%esi
+ xorl %edi,%edx
+ rorl $7,%ecx
+ pxor %xmm0,%xmm7
+ movdqa %xmm3,96(%esp)
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ roll $5,%ebx
+ movdqa 144(%esp),%xmm3
+ addl %esi,%eax
+ paddd %xmm6,%xmm2
+ xorl %ecx,%ebp
+ pxor %xmm1,%xmm7
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%esp),%edi
+ andl %ecx,%ebp
+ movdqa %xmm7,%xmm1
+ movdqa %xmm2,32(%esp)
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ roll $5,%eax
+ pslld $2,%xmm7
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ psrld $30,%xmm1
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 56(%esp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ por %xmm1,%xmm7
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ movdqa 64(%esp),%xmm1
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ pshufd $238,%xmm6,%xmm2
+ addl 60(%esp),%ecx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ rorl $7,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl (%esp),%ebx
+ pxor %xmm4,%xmm0
+ punpcklqdq %xmm7,%xmm2
+ andl %edi,%esi
+ xorl %eax,%edi
+ rorl $7,%edx
+ pxor %xmm1,%xmm0
+ movdqa %xmm4,64(%esp)
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ roll $5,%ecx
+ movdqa %xmm3,%xmm4
+ addl %esi,%ebx
+ paddd %xmm7,%xmm3
+ xorl %edx,%ebp
+ pxor %xmm2,%xmm0
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 4(%esp),%eax
+ andl %edx,%ebp
+ movdqa %xmm0,%xmm2
+ movdqa %xmm3,48(%esp)
+ xorl %edi,%edx
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ pslld $2,%xmm0
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ psrld $30,%xmm2
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%esp),%edi
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ por %xmm2,%xmm0
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ movdqa 80(%esp),%xmm2
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ pshufd $238,%xmm7,%xmm3
+ addl 12(%esp),%edx
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 16(%esp),%ecx
+ pxor %xmm5,%xmm1
+ punpcklqdq %xmm0,%xmm3
+ andl %eax,%esi
+ xorl %ebx,%eax
+ rorl $7,%edi
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,80(%esp)
+ movl %edx,%ebp
+ xorl %eax,%esi
+ roll $5,%edx
+ movdqa %xmm4,%xmm5
+ addl %esi,%ecx
+ paddd %xmm0,%xmm4
+ xorl %edi,%ebp
+ pxor %xmm3,%xmm1
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 20(%esp),%ebx
+ andl %edi,%ebp
+ movdqa %xmm1,%xmm3
+ movdqa %xmm4,(%esp)
+ xorl %eax,%edi
+ rorl $7,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ roll $5,%ecx
+ pslld $2,%xmm1
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ psrld $30,%xmm3
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 24(%esp),%eax
+ andl %edx,%esi
+ xorl %edi,%edx
+ rorl $7,%ecx
+ por %xmm3,%xmm1
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ movdqa 96(%esp),%xmm3
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ pshufd $238,%xmm0,%xmm4
+ addl 28(%esp),%edi
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 32(%esp),%edx
+ pxor %xmm6,%xmm2
+ punpcklqdq %xmm1,%xmm4
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ pxor %xmm3,%xmm2
+ movdqa %xmm6,96(%esp)
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ roll $5,%edi
+ movdqa %xmm5,%xmm6
+ addl %esi,%edx
+ paddd %xmm1,%xmm5
+ xorl %eax,%ebp
+ pxor %xmm4,%xmm2
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 36(%esp),%ecx
+ andl %eax,%ebp
+ movdqa %xmm2,%xmm4
+ movdqa %xmm5,16(%esp)
+ xorl %ebx,%eax
+ rorl $7,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ roll $5,%edx
+ pslld $2,%xmm2
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ psrld $30,%xmm4
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 40(%esp),%ebx
+ andl %edi,%esi
+ xorl %eax,%edi
+ rorl $7,%edx
+ por %xmm4,%xmm2
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ movdqa 64(%esp),%xmm4
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ pshufd $238,%xmm1,%xmm5
+ addl 44(%esp),%eax
+ andl %edx,%ebp
+ xorl %edi,%edx
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ addl 48(%esp),%edi
+ pxor %xmm7,%xmm3
+ punpcklqdq %xmm2,%xmm5
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ pxor %xmm4,%xmm3
+ movdqa %xmm7,64(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ movdqa %xmm6,%xmm7
+ rorl $7,%ebx
+ paddd %xmm2,%xmm6
+ addl %eax,%edi
+ pxor %xmm5,%xmm3
+ addl 52(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ movdqa %xmm3,%xmm5
+ movdqa %xmm6,32(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm3
+ addl 56(%esp),%ecx
+ xorl %eax,%esi
+ psrld $30,%xmm5
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ por %xmm5,%xmm3
+ addl 60(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl (%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ paddd %xmm3,%xmm7
+ addl %ebx,%eax
+ addl 4(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ movdqa %xmm7,48(%esp)
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 8(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 12(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ movl 196(%esp),%ebp
+ cmpl 200(%esp),%ebp
+ je .L003done
+ movdqa 160(%esp),%xmm7
+ movdqa 176(%esp),%xmm6
+ movdqu (%ebp),%xmm0
+ movdqu 16(%ebp),%xmm1
+ movdqu 32(%ebp),%xmm2
+ movdqu 48(%ebp),%xmm3
+ addl $64,%ebp
+.byte 102,15,56,0,198
+ movl %ebp,196(%esp)
+ movdqa %xmm7,96(%esp)
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+.byte 102,15,56,0,206
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ paddd %xmm7,%xmm0
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ movdqa %xmm0,(%esp)
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ psubd %xmm7,%xmm0
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+.byte 102,15,56,0,214
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ paddd %xmm7,%xmm1
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ movdqa %xmm1,16(%esp)
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ psubd %xmm7,%xmm1
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+.byte 102,15,56,0,222
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ paddd %xmm7,%xmm2
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ movdqa %xmm2,32(%esp)
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ psubd %xmm7,%xmm2
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %ecx,%ebx
+ movl %edx,12(%ebp)
+ xorl %edx,%ebx
+ movl %edi,16(%ebp)
+ movl %esi,%ebp
+ pshufd $238,%xmm0,%xmm4
+ andl %ebx,%esi
+ movl %ebp,%ebx
+ jmp .L002loop
+.align 16
+.L003done:
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ movl 204(%esp),%esp
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size sha1_block_data_order_ssse3,.-.L_sha1_block_data_order_ssse3_begin
+.globl sha1_block_data_order_avx
+.hidden sha1_block_data_order_avx
+.type sha1_block_data_order_avx,@function
+.align 16
+sha1_block_data_order_avx:
+.L_sha1_block_data_order_avx_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call .L004pic_point
+.L004pic_point:
+ popl %ebp
+ leal .LK_XX_XX-.L004pic_point(%ebp),%ebp
+ vzeroall
+ vmovdqa (%ebp),%xmm7
+ vmovdqa 16(%ebp),%xmm0
+ vmovdqa 32(%ebp),%xmm1
+ vmovdqa 48(%ebp),%xmm2
+ vmovdqa 64(%ebp),%xmm6
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebp
+ movl 28(%esp),%edx
+ movl %esp,%esi
+ subl $208,%esp
+ andl $-64,%esp
+ vmovdqa %xmm0,112(%esp)
+ vmovdqa %xmm1,128(%esp)
+ vmovdqa %xmm2,144(%esp)
+ shll $6,%edx
+ vmovdqa %xmm7,160(%esp)
+ addl %ebp,%edx
+ vmovdqa %xmm6,176(%esp)
+ addl $64,%ebp
+ movl %edi,192(%esp)
+ movl %ebp,196(%esp)
+ movl %edx,200(%esp)
+ movl %esi,204(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl 16(%edi),%edi
+ movl %ebx,%esi
+ vmovdqu -64(%ebp),%xmm0
+ vmovdqu -48(%ebp),%xmm1
+ vmovdqu -32(%ebp),%xmm2
+ vmovdqu -16(%ebp),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vmovdqa %xmm7,96(%esp)
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm7,%xmm0,%xmm4
+ vpaddd %xmm7,%xmm1,%xmm5
+ vpaddd %xmm7,%xmm2,%xmm6
+ vmovdqa %xmm4,(%esp)
+ movl %ecx,%ebp
+ vmovdqa %xmm5,16(%esp)
+ xorl %edx,%ebp
+ vmovdqa %xmm6,32(%esp)
+ andl %ebp,%esi
+ jmp .L005loop
+.align 16
+.L005loop:
+ shrdl $2,%ebx,%ebx
+ xorl %edx,%esi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ movl %eax,%ebp
+ addl (%esp),%edi
+ vpaddd %xmm3,%xmm7,%xmm7
+ vmovdqa %xmm0,64(%esp)
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrldq $4,%xmm3,%xmm6
+ addl %esi,%edi
+ andl %ebx,%ebp
+ vpxor %xmm0,%xmm4,%xmm4
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ shrdl $7,%eax,%eax
+ xorl %ecx,%ebp
+ vmovdqa %xmm7,48(%esp)
+ movl %edi,%esi
+ addl 4(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ andl %eax,%esi
+ vpsrld $31,%xmm4,%xmm6
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%esi
+ vpslldq $12,%xmm4,%xmm0
+ vpaddd %xmm4,%xmm4,%xmm4
+ movl %edx,%ebp
+ addl 8(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm0,%xmm7
+ vpor %xmm6,%xmm4,%xmm4
+ addl %esi,%ecx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpslld $2,%xmm0,%xmm0
+ shrdl $7,%edx,%edx
+ xorl %eax,%ebp
+ vpxor %xmm7,%xmm4,%xmm4
+ movl %ecx,%esi
+ addl 12(%esp),%ebx
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vpxor %xmm0,%xmm4,%xmm4
+ addl %ebp,%ebx
+ andl %edx,%esi
+ vmovdqa 96(%esp),%xmm0
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ movl %ebx,%ebp
+ addl 16(%esp),%eax
+ vpaddd %xmm4,%xmm0,%xmm0
+ vmovdqa %xmm1,80(%esp)
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrldq $4,%xmm4,%xmm7
+ addl %esi,%eax
+ andl %ecx,%ebp
+ vpxor %xmm1,%xmm5,%xmm5
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm3,%xmm7,%xmm7
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%ebp
+ vmovdqa %xmm0,(%esp)
+ movl %eax,%esi
+ addl 20(%esp),%edi
+ vpxor %xmm7,%xmm5,%xmm5
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ andl %ebx,%esi
+ vpsrld $31,%xmm5,%xmm7
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ vpslldq $12,%xmm5,%xmm1
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %edi,%ebp
+ addl 24(%esp),%edx
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm1,%xmm0
+ vpor %xmm7,%xmm5,%xmm5
+ addl %esi,%edx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm1,%xmm1
+ shrdl $7,%edi,%edi
+ xorl %ebx,%ebp
+ vpxor %xmm0,%xmm5,%xmm5
+ movl %edx,%esi
+ addl 28(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm1,%xmm5,%xmm5
+ addl %ebp,%ecx
+ andl %edi,%esi
+ vmovdqa 112(%esp),%xmm1
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ movl %ecx,%ebp
+ addl 32(%esp),%ebx
+ vpaddd %xmm5,%xmm1,%xmm1
+ vmovdqa %xmm2,96(%esp)
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vpsrldq $4,%xmm5,%xmm0
+ addl %esi,%ebx
+ andl %edx,%ebp
+ vpxor %xmm2,%xmm6,%xmm6
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%ebp
+ vmovdqa %xmm1,16(%esp)
+ movl %ebx,%esi
+ addl 36(%esp),%eax
+ vpxor %xmm0,%xmm6,%xmm6
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ andl %ecx,%esi
+ vpsrld $31,%xmm6,%xmm0
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%esi
+ vpslldq $12,%xmm6,%xmm2
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%ebp
+ addl 40(%esp),%edi
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm1
+ vpor %xmm0,%xmm6,%xmm6
+ addl %esi,%edi
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpslld $2,%xmm2,%xmm2
+ vmovdqa 64(%esp),%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%ebp
+ vpxor %xmm1,%xmm6,%xmm6
+ movl %edi,%esi
+ addl 44(%esp),%edx
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ addl %ebp,%edx
+ andl %eax,%esi
+ vmovdqa 112(%esp),%xmm2
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%esi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ movl %edx,%ebp
+ addl 48(%esp),%ecx
+ vpaddd %xmm6,%xmm2,%xmm2
+ vmovdqa %xmm3,64(%esp)
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpsrldq $4,%xmm6,%xmm1
+ addl %esi,%ecx
+ andl %edi,%ebp
+ vpxor %xmm3,%xmm7,%xmm7
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpxor %xmm5,%xmm1,%xmm1
+ shrdl $7,%edx,%edx
+ xorl %eax,%ebp
+ vmovdqa %xmm2,32(%esp)
+ movl %ecx,%esi
+ addl 52(%esp),%ebx
+ vpxor %xmm1,%xmm7,%xmm7
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ andl %edx,%esi
+ vpsrld $31,%xmm7,%xmm1
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ vpslldq $12,%xmm7,%xmm3
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%ebp
+ addl 56(%esp),%eax
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm2
+ vpor %xmm1,%xmm7,%xmm7
+ addl %esi,%eax
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ vmovdqa 80(%esp),%xmm1
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%ebp
+ vpxor %xmm2,%xmm7,%xmm7
+ movl %eax,%esi
+ addl 60(%esp),%edi
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpxor %xmm3,%xmm7,%xmm7
+ addl %ebp,%edi
+ andl %ebx,%esi
+ vmovdqa 112(%esp),%xmm3
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm6,%xmm7,%xmm2
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ movl %edi,%ebp
+ addl (%esp),%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ vmovdqa %xmm4,80(%esp)
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vmovdqa %xmm3,%xmm4
+ vpaddd %xmm7,%xmm3,%xmm3
+ addl %esi,%edx
+ andl %eax,%ebp
+ vpxor %xmm2,%xmm0,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%ebp
+ vpsrld $30,%xmm0,%xmm2
+ vmovdqa %xmm3,48(%esp)
+ movl %edx,%esi
+ addl 4(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpslld $2,%xmm0,%xmm0
+ addl %ebp,%ecx
+ andl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ addl 8(%esp),%ebx
+ vpor %xmm2,%xmm0,%xmm0
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vmovdqa 96(%esp),%xmm2
+ addl %esi,%ebx
+ andl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 12(%esp),%eax
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm3
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm5,96(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ vmovdqa %xmm4,%xmm5
+ vpaddd %xmm0,%xmm4,%xmm4
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpxor %xmm3,%xmm1,%xmm1
+ addl 20(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm1,%xmm3
+ vmovdqa %xmm4,(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpor %xmm3,%xmm1,%xmm1
+ addl 28(%esp),%ebx
+ xorl %edi,%ebp
+ vmovdqa 64(%esp),%xmm3
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ vmovdqa %xmm6,64(%esp)
+ addl %esi,%eax
+ xorl %edx,%ebp
+ vmovdqa 128(%esp),%xmm6
+ vpaddd %xmm1,%xmm5,%xmm5
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm4,%xmm2,%xmm2
+ addl 36(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm4
+ vmovdqa %xmm5,16(%esp)
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpor %xmm4,%xmm2,%xmm2
+ addl 44(%esp),%ecx
+ xorl %eax,%ebp
+ vmovdqa 80(%esp),%xmm4
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ vmovdqa %xmm7,80(%esp)
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ vmovdqa %xmm6,%xmm7
+ vpaddd %xmm2,%xmm6,%xmm6
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpxor %xmm5,%xmm3,%xmm3
+ addl 52(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm5
+ vmovdqa %xmm6,32(%esp)
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpor %xmm5,%xmm3,%xmm3
+ addl 60(%esp),%edx
+ xorl %ebx,%ebp
+ vmovdqa 96(%esp),%xmm5
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ vpxor %xmm0,%xmm4,%xmm4
+ addl (%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ vmovdqa %xmm0,96(%esp)
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ vmovdqa %xmm7,%xmm0
+ vpaddd %xmm3,%xmm7,%xmm7
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpxor %xmm6,%xmm4,%xmm4
+ addl 4(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm6
+ vmovdqa %xmm7,48(%esp)
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpor %xmm6,%xmm4,%xmm4
+ addl 12(%esp),%edi
+ xorl %ecx,%ebp
+ vmovdqa 64(%esp),%xmm6
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ vpxor %xmm6,%xmm5,%xmm5
+ vmovdqa %xmm1,64(%esp)
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ vmovdqa %xmm0,%xmm1
+ vpaddd %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpxor %xmm7,%xmm5,%xmm5
+ addl 20(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm7
+ vmovdqa %xmm0,(%esp)
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpor %xmm7,%xmm5,%xmm5
+ addl 28(%esp),%eax
+ vmovdqa 80(%esp),%xmm7
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ addl 32(%esp),%edi
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovdqa %xmm2,80(%esp)
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ vmovdqa %xmm1,%xmm2
+ vpaddd %xmm5,%xmm1,%xmm1
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ vpxor %xmm0,%xmm6,%xmm6
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 36(%esp),%edx
+ vpsrld $30,%xmm6,%xmm0
+ vmovdqa %xmm1,16(%esp)
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %edi,%esi
+ vpslld $2,%xmm6,%xmm6
+ xorl %ebx,%ebp
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 40(%esp),%ecx
+ andl %eax,%esi
+ vpor %xmm0,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ vmovdqa 96(%esp),%xmm0
+ movl %edx,%ebp
+ xorl %eax,%esi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 44(%esp),%ebx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm1
+ vpxor %xmm3,%xmm7,%xmm7
+ addl 48(%esp),%eax
+ andl %edx,%esi
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ vpxor %xmm0,%xmm7,%xmm7
+ vmovdqa %xmm3,96(%esp)
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ vmovdqa 144(%esp),%xmm3
+ vpaddd %xmm6,%xmm2,%xmm2
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vpxor %xmm1,%xmm7,%xmm7
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%esp),%edi
+ vpsrld $30,%xmm7,%xmm1
+ vmovdqa %xmm2,32(%esp)
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ vpslld $2,%xmm7,%xmm7
+ xorl %ecx,%ebp
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 56(%esp),%edx
+ andl %ebx,%esi
+ vpor %xmm1,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vmovdqa 64(%esp),%xmm1
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 60(%esp),%ecx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm2
+ vpxor %xmm4,%xmm0,%xmm0
+ addl (%esp),%ebx
+ andl %edi,%esi
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ vmovdqa %xmm4,64(%esp)
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ vmovdqa %xmm3,%xmm4
+ vpaddd %xmm7,%xmm3,%xmm3
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ vpxor %xmm2,%xmm0,%xmm0
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 4(%esp),%eax
+ vpsrld $30,%xmm0,%xmm2
+ vmovdqa %xmm3,48(%esp)
+ andl %edx,%ebp
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ vpslld $2,%xmm0,%xmm0
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%esp),%edi
+ andl %ecx,%esi
+ vpor %xmm2,%xmm0,%xmm0
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vmovdqa 80(%esp),%xmm2
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 12(%esp),%edx
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm3
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%esp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm5,80(%esp)
+ movl %edx,%ebp
+ xorl %eax,%esi
+ vmovdqa %xmm4,%xmm5
+ vpaddd %xmm0,%xmm4,%xmm4
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vpxor %xmm3,%xmm1,%xmm1
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 20(%esp),%ebx
+ vpsrld $30,%xmm1,%xmm3
+ vmovdqa %xmm4,(%esp)
+ andl %edi,%ebp
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ vpslld $2,%xmm1,%xmm1
+ xorl %edi,%ebp
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 24(%esp),%eax
+ andl %edx,%esi
+ vpor %xmm3,%xmm1,%xmm1
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ vmovdqa 96(%esp),%xmm3
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%esp),%edi
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%esp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vpxor %xmm3,%xmm2,%xmm2
+ vmovdqa %xmm6,96(%esp)
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ vmovdqa %xmm5,%xmm6
+ vpaddd %xmm1,%xmm5,%xmm5
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ vpxor %xmm4,%xmm2,%xmm2
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 36(%esp),%ecx
+ vpsrld $30,%xmm2,%xmm4
+ vmovdqa %xmm5,16(%esp)
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ movl %edx,%esi
+ vpslld $2,%xmm2,%xmm2
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 40(%esp),%ebx
+ andl %edi,%esi
+ vpor %xmm4,%xmm2,%xmm2
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ vmovdqa 64(%esp),%xmm4
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 44(%esp),%eax
+ andl %edx,%ebp
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ vmovdqa %xmm7,64(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ vmovdqa %xmm6,%xmm7
+ vpaddd %xmm2,%xmm6,%xmm6
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpxor %xmm5,%xmm3,%xmm3
+ addl 52(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm3,%xmm5
+ vmovdqa %xmm6,32(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpor %xmm5,%xmm3,%xmm3
+ addl 60(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl (%esp),%eax
+ vpaddd %xmm3,%xmm7,%xmm7
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vmovdqa %xmm7,48(%esp)
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 4(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 8(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 12(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ movl 196(%esp),%ebp
+ cmpl 200(%esp),%ebp
+ je .L006done
+ vmovdqa 160(%esp),%xmm7
+ vmovdqa 176(%esp),%xmm6
+ vmovdqu (%ebp),%xmm0
+ vmovdqu 16(%ebp),%xmm1
+ vmovdqu 32(%ebp),%xmm2
+ vmovdqu 48(%ebp),%xmm3
+ addl $64,%ebp
+ vpshufb %xmm6,%xmm0,%xmm0
+ movl %ebp,196(%esp)
+ vmovdqa %xmm7,96(%esp)
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ vpshufb %xmm6,%xmm1,%xmm1
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm7,%xmm0,%xmm4
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vmovdqa %xmm4,(%esp)
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ vpshufb %xmm6,%xmm2,%xmm2
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ vpaddd %xmm7,%xmm1,%xmm5
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vmovdqa %xmm5,16(%esp)
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ vpshufb %xmm6,%xmm3,%xmm3
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ vpaddd %xmm7,%xmm2,%xmm6
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vmovdqa %xmm6,32(%esp)
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,%ebx
+ movl %ecx,8(%ebp)
+ xorl %edx,%ebx
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ movl %esi,%ebp
+ andl %ebx,%esi
+ movl %ebp,%ebx
+ jmp .L005loop
+.align 16
+.L006done:
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vzeroall
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ movl 204(%esp),%esp
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size sha1_block_data_order_avx,.-.L_sha1_block_data_order_avx_begin
+.align 64
+.LK_XX_XX:
+.long 1518500249,1518500249,1518500249,1518500249
+.long 1859775393,1859775393,1859775393,1859775393
+.long 2400959708,2400959708,2400959708,2400959708
+.long 3395469782,3395469782,3395469782,3395469782
+.long 66051,67438087,134810123,202182159
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
+.byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82
+.byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
+.byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/sha1-586-win.asm b/gen/bcm/sha1-586-win.asm
new file mode 100644
index 0000000..c8823a9
--- /dev/null
+++ b/gen/bcm/sha1-586-win.asm
@@ -0,0 +1,3790 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section code use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section .text code align=64
+%else
+section .text code
+%endif
+global _sha1_block_data_order_nohw
+align 16
+_sha1_block_data_order_nohw:
+L$_sha1_block_data_order_nohw_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov ebp,DWORD [20+esp]
+ mov esi,DWORD [24+esp]
+ mov eax,DWORD [28+esp]
+ sub esp,76
+ shl eax,6
+ add eax,esi
+ mov DWORD [104+esp],eax
+ mov edi,DWORD [16+ebp]
+ jmp NEAR L$000loop
+align 16
+L$000loop:
+ mov eax,DWORD [esi]
+ mov ebx,DWORD [4+esi]
+ mov ecx,DWORD [8+esi]
+ mov edx,DWORD [12+esi]
+ bswap eax
+ bswap ebx
+ bswap ecx
+ bswap edx
+ mov DWORD [esp],eax
+ mov DWORD [4+esp],ebx
+ mov DWORD [8+esp],ecx
+ mov DWORD [12+esp],edx
+ mov eax,DWORD [16+esi]
+ mov ebx,DWORD [20+esi]
+ mov ecx,DWORD [24+esi]
+ mov edx,DWORD [28+esi]
+ bswap eax
+ bswap ebx
+ bswap ecx
+ bswap edx
+ mov DWORD [16+esp],eax
+ mov DWORD [20+esp],ebx
+ mov DWORD [24+esp],ecx
+ mov DWORD [28+esp],edx
+ mov eax,DWORD [32+esi]
+ mov ebx,DWORD [36+esi]
+ mov ecx,DWORD [40+esi]
+ mov edx,DWORD [44+esi]
+ bswap eax
+ bswap ebx
+ bswap ecx
+ bswap edx
+ mov DWORD [32+esp],eax
+ mov DWORD [36+esp],ebx
+ mov DWORD [40+esp],ecx
+ mov DWORD [44+esp],edx
+ mov eax,DWORD [48+esi]
+ mov ebx,DWORD [52+esi]
+ mov ecx,DWORD [56+esi]
+ mov edx,DWORD [60+esi]
+ bswap eax
+ bswap ebx
+ bswap ecx
+ bswap edx
+ mov DWORD [48+esp],eax
+ mov DWORD [52+esp],ebx
+ mov DWORD [56+esp],ecx
+ mov DWORD [60+esp],edx
+ mov DWORD [100+esp],esi
+ mov eax,DWORD [ebp]
+ mov ebx,DWORD [4+ebp]
+ mov ecx,DWORD [8+ebp]
+ mov edx,DWORD [12+ebp]
+ ; 00_15 0
+ mov esi,ecx
+ mov ebp,eax
+ rol ebp,5
+ xor esi,edx
+ add ebp,edi
+ mov edi,DWORD [esp]
+ and esi,ebx
+ ror ebx,2
+ xor esi,edx
+ lea ebp,[1518500249+edi*1+ebp]
+ add ebp,esi
+ ; 00_15 1
+ mov edi,ebx
+ mov esi,ebp
+ rol ebp,5
+ xor edi,ecx
+ add ebp,edx
+ mov edx,DWORD [4+esp]
+ and edi,eax
+ ror eax,2
+ xor edi,ecx
+ lea ebp,[1518500249+edx*1+ebp]
+ add ebp,edi
+ ; 00_15 2
+ mov edx,eax
+ mov edi,ebp
+ rol ebp,5
+ xor edx,ebx
+ add ebp,ecx
+ mov ecx,DWORD [8+esp]
+ and edx,esi
+ ror esi,2
+ xor edx,ebx
+ lea ebp,[1518500249+ecx*1+ebp]
+ add ebp,edx
+ ; 00_15 3
+ mov ecx,esi
+ mov edx,ebp
+ rol ebp,5
+ xor ecx,eax
+ add ebp,ebx
+ mov ebx,DWORD [12+esp]
+ and ecx,edi
+ ror edi,2
+ xor ecx,eax
+ lea ebp,[1518500249+ebx*1+ebp]
+ add ebp,ecx
+ ; 00_15 4
+ mov ebx,edi
+ mov ecx,ebp
+ rol ebp,5
+ xor ebx,esi
+ add ebp,eax
+ mov eax,DWORD [16+esp]
+ and ebx,edx
+ ror edx,2
+ xor ebx,esi
+ lea ebp,[1518500249+eax*1+ebp]
+ add ebp,ebx
+ ; 00_15 5
+ mov eax,edx
+ mov ebx,ebp
+ rol ebp,5
+ xor eax,edi
+ add ebp,esi
+ mov esi,DWORD [20+esp]
+ and eax,ecx
+ ror ecx,2
+ xor eax,edi
+ lea ebp,[1518500249+esi*1+ebp]
+ add ebp,eax
+ ; 00_15 6
+ mov esi,ecx
+ mov eax,ebp
+ rol ebp,5
+ xor esi,edx
+ add ebp,edi
+ mov edi,DWORD [24+esp]
+ and esi,ebx
+ ror ebx,2
+ xor esi,edx
+ lea ebp,[1518500249+edi*1+ebp]
+ add ebp,esi
+ ; 00_15 7
+ mov edi,ebx
+ mov esi,ebp
+ rol ebp,5
+ xor edi,ecx
+ add ebp,edx
+ mov edx,DWORD [28+esp]
+ and edi,eax
+ ror eax,2
+ xor edi,ecx
+ lea ebp,[1518500249+edx*1+ebp]
+ add ebp,edi
+ ; 00_15 8
+ mov edx,eax
+ mov edi,ebp
+ rol ebp,5
+ xor edx,ebx
+ add ebp,ecx
+ mov ecx,DWORD [32+esp]
+ and edx,esi
+ ror esi,2
+ xor edx,ebx
+ lea ebp,[1518500249+ecx*1+ebp]
+ add ebp,edx
+ ; 00_15 9
+ mov ecx,esi
+ mov edx,ebp
+ rol ebp,5
+ xor ecx,eax
+ add ebp,ebx
+ mov ebx,DWORD [36+esp]
+ and ecx,edi
+ ror edi,2
+ xor ecx,eax
+ lea ebp,[1518500249+ebx*1+ebp]
+ add ebp,ecx
+ ; 00_15 10
+ mov ebx,edi
+ mov ecx,ebp
+ rol ebp,5
+ xor ebx,esi
+ add ebp,eax
+ mov eax,DWORD [40+esp]
+ and ebx,edx
+ ror edx,2
+ xor ebx,esi
+ lea ebp,[1518500249+eax*1+ebp]
+ add ebp,ebx
+ ; 00_15 11
+ mov eax,edx
+ mov ebx,ebp
+ rol ebp,5
+ xor eax,edi
+ add ebp,esi
+ mov esi,DWORD [44+esp]
+ and eax,ecx
+ ror ecx,2
+ xor eax,edi
+ lea ebp,[1518500249+esi*1+ebp]
+ add ebp,eax
+ ; 00_15 12
+ mov esi,ecx
+ mov eax,ebp
+ rol ebp,5
+ xor esi,edx
+ add ebp,edi
+ mov edi,DWORD [48+esp]
+ and esi,ebx
+ ror ebx,2
+ xor esi,edx
+ lea ebp,[1518500249+edi*1+ebp]
+ add ebp,esi
+ ; 00_15 13
+ mov edi,ebx
+ mov esi,ebp
+ rol ebp,5
+ xor edi,ecx
+ add ebp,edx
+ mov edx,DWORD [52+esp]
+ and edi,eax
+ ror eax,2
+ xor edi,ecx
+ lea ebp,[1518500249+edx*1+ebp]
+ add ebp,edi
+ ; 00_15 14
+ mov edx,eax
+ mov edi,ebp
+ rol ebp,5
+ xor edx,ebx
+ add ebp,ecx
+ mov ecx,DWORD [56+esp]
+ and edx,esi
+ ror esi,2
+ xor edx,ebx
+ lea ebp,[1518500249+ecx*1+ebp]
+ add ebp,edx
+ ; 00_15 15
+ mov ecx,esi
+ mov edx,ebp
+ rol ebp,5
+ xor ecx,eax
+ add ebp,ebx
+ mov ebx,DWORD [60+esp]
+ and ecx,edi
+ ror edi,2
+ xor ecx,eax
+ lea ebp,[1518500249+ebx*1+ebp]
+ mov ebx,DWORD [esp]
+ add ecx,ebp
+ ; 16_19 16
+ mov ebp,edi
+ xor ebx,DWORD [8+esp]
+ xor ebp,esi
+ xor ebx,DWORD [32+esp]
+ and ebp,edx
+ xor ebx,DWORD [52+esp]
+ rol ebx,1
+ xor ebp,esi
+ add eax,ebp
+ mov ebp,ecx
+ ror edx,2
+ mov DWORD [esp],ebx
+ rol ebp,5
+ lea ebx,[1518500249+eax*1+ebx]
+ mov eax,DWORD [4+esp]
+ add ebx,ebp
+ ; 16_19 17
+ mov ebp,edx
+ xor eax,DWORD [12+esp]
+ xor ebp,edi
+ xor eax,DWORD [36+esp]
+ and ebp,ecx
+ xor eax,DWORD [56+esp]
+ rol eax,1
+ xor ebp,edi
+ add esi,ebp
+ mov ebp,ebx
+ ror ecx,2
+ mov DWORD [4+esp],eax
+ rol ebp,5
+ lea eax,[1518500249+esi*1+eax]
+ mov esi,DWORD [8+esp]
+ add eax,ebp
+ ; 16_19 18
+ mov ebp,ecx
+ xor esi,DWORD [16+esp]
+ xor ebp,edx
+ xor esi,DWORD [40+esp]
+ and ebp,ebx
+ xor esi,DWORD [60+esp]
+ rol esi,1
+ xor ebp,edx
+ add edi,ebp
+ mov ebp,eax
+ ror ebx,2
+ mov DWORD [8+esp],esi
+ rol ebp,5
+ lea esi,[1518500249+edi*1+esi]
+ mov edi,DWORD [12+esp]
+ add esi,ebp
+ ; 16_19 19
+ mov ebp,ebx
+ xor edi,DWORD [20+esp]
+ xor ebp,ecx
+ xor edi,DWORD [44+esp]
+ and ebp,eax
+ xor edi,DWORD [esp]
+ rol edi,1
+ xor ebp,ecx
+ add edx,ebp
+ mov ebp,esi
+ ror eax,2
+ mov DWORD [12+esp],edi
+ rol ebp,5
+ lea edi,[1518500249+edx*1+edi]
+ mov edx,DWORD [16+esp]
+ add edi,ebp
+ ; 20_39 20
+ mov ebp,esi
+ xor edx,DWORD [24+esp]
+ xor ebp,eax
+ xor edx,DWORD [48+esp]
+ xor ebp,ebx
+ xor edx,DWORD [4+esp]
+ rol edx,1
+ add ecx,ebp
+ ror esi,2
+ mov ebp,edi
+ rol ebp,5
+ mov DWORD [16+esp],edx
+ lea edx,[1859775393+ecx*1+edx]
+ mov ecx,DWORD [20+esp]
+ add edx,ebp
+ ; 20_39 21
+ mov ebp,edi
+ xor ecx,DWORD [28+esp]
+ xor ebp,esi
+ xor ecx,DWORD [52+esp]
+ xor ebp,eax
+ xor ecx,DWORD [8+esp]
+ rol ecx,1
+ add ebx,ebp
+ ror edi,2
+ mov ebp,edx
+ rol ebp,5
+ mov DWORD [20+esp],ecx
+ lea ecx,[1859775393+ebx*1+ecx]
+ mov ebx,DWORD [24+esp]
+ add ecx,ebp
+ ; 20_39 22
+ mov ebp,edx
+ xor ebx,DWORD [32+esp]
+ xor ebp,edi
+ xor ebx,DWORD [56+esp]
+ xor ebp,esi
+ xor ebx,DWORD [12+esp]
+ rol ebx,1
+ add eax,ebp
+ ror edx,2
+ mov ebp,ecx
+ rol ebp,5
+ mov DWORD [24+esp],ebx
+ lea ebx,[1859775393+eax*1+ebx]
+ mov eax,DWORD [28+esp]
+ add ebx,ebp
+ ; 20_39 23
+ mov ebp,ecx
+ xor eax,DWORD [36+esp]
+ xor ebp,edx
+ xor eax,DWORD [60+esp]
+ xor ebp,edi
+ xor eax,DWORD [16+esp]
+ rol eax,1
+ add esi,ebp
+ ror ecx,2
+ mov ebp,ebx
+ rol ebp,5
+ mov DWORD [28+esp],eax
+ lea eax,[1859775393+esi*1+eax]
+ mov esi,DWORD [32+esp]
+ add eax,ebp
+ ; 20_39 24
+ mov ebp,ebx
+ xor esi,DWORD [40+esp]
+ xor ebp,ecx
+ xor esi,DWORD [esp]
+ xor ebp,edx
+ xor esi,DWORD [20+esp]
+ rol esi,1
+ add edi,ebp
+ ror ebx,2
+ mov ebp,eax
+ rol ebp,5
+ mov DWORD [32+esp],esi
+ lea esi,[1859775393+edi*1+esi]
+ mov edi,DWORD [36+esp]
+ add esi,ebp
+ ; 20_39 25
+ mov ebp,eax
+ xor edi,DWORD [44+esp]
+ xor ebp,ebx
+ xor edi,DWORD [4+esp]
+ xor ebp,ecx
+ xor edi,DWORD [24+esp]
+ rol edi,1
+ add edx,ebp
+ ror eax,2
+ mov ebp,esi
+ rol ebp,5
+ mov DWORD [36+esp],edi
+ lea edi,[1859775393+edx*1+edi]
+ mov edx,DWORD [40+esp]
+ add edi,ebp
+ ; 20_39 26
+ mov ebp,esi
+ xor edx,DWORD [48+esp]
+ xor ebp,eax
+ xor edx,DWORD [8+esp]
+ xor ebp,ebx
+ xor edx,DWORD [28+esp]
+ rol edx,1
+ add ecx,ebp
+ ror esi,2
+ mov ebp,edi
+ rol ebp,5
+ mov DWORD [40+esp],edx
+ lea edx,[1859775393+ecx*1+edx]
+ mov ecx,DWORD [44+esp]
+ add edx,ebp
+ ; 20_39 27
+ mov ebp,edi
+ xor ecx,DWORD [52+esp]
+ xor ebp,esi
+ xor ecx,DWORD [12+esp]
+ xor ebp,eax
+ xor ecx,DWORD [32+esp]
+ rol ecx,1
+ add ebx,ebp
+ ror edi,2
+ mov ebp,edx
+ rol ebp,5
+ mov DWORD [44+esp],ecx
+ lea ecx,[1859775393+ebx*1+ecx]
+ mov ebx,DWORD [48+esp]
+ add ecx,ebp
+ ; 20_39 28
+ mov ebp,edx
+ xor ebx,DWORD [56+esp]
+ xor ebp,edi
+ xor ebx,DWORD [16+esp]
+ xor ebp,esi
+ xor ebx,DWORD [36+esp]
+ rol ebx,1
+ add eax,ebp
+ ror edx,2
+ mov ebp,ecx
+ rol ebp,5
+ mov DWORD [48+esp],ebx
+ lea ebx,[1859775393+eax*1+ebx]
+ mov eax,DWORD [52+esp]
+ add ebx,ebp
+ ; 20_39 29
+ mov ebp,ecx
+ xor eax,DWORD [60+esp]
+ xor ebp,edx
+ xor eax,DWORD [20+esp]
+ xor ebp,edi
+ xor eax,DWORD [40+esp]
+ rol eax,1
+ add esi,ebp
+ ror ecx,2
+ mov ebp,ebx
+ rol ebp,5
+ mov DWORD [52+esp],eax
+ lea eax,[1859775393+esi*1+eax]
+ mov esi,DWORD [56+esp]
+ add eax,ebp
+ ; 20_39 30
+ mov ebp,ebx
+ xor esi,DWORD [esp]
+ xor ebp,ecx
+ xor esi,DWORD [24+esp]
+ xor ebp,edx
+ xor esi,DWORD [44+esp]
+ rol esi,1
+ add edi,ebp
+ ror ebx,2
+ mov ebp,eax
+ rol ebp,5
+ mov DWORD [56+esp],esi
+ lea esi,[1859775393+edi*1+esi]
+ mov edi,DWORD [60+esp]
+ add esi,ebp
+ ; 20_39 31
+ mov ebp,eax
+ xor edi,DWORD [4+esp]
+ xor ebp,ebx
+ xor edi,DWORD [28+esp]
+ xor ebp,ecx
+ xor edi,DWORD [48+esp]
+ rol edi,1
+ add edx,ebp
+ ror eax,2
+ mov ebp,esi
+ rol ebp,5
+ mov DWORD [60+esp],edi
+ lea edi,[1859775393+edx*1+edi]
+ mov edx,DWORD [esp]
+ add edi,ebp
+ ; 20_39 32
+ mov ebp,esi
+ xor edx,DWORD [8+esp]
+ xor ebp,eax
+ xor edx,DWORD [32+esp]
+ xor ebp,ebx
+ xor edx,DWORD [52+esp]
+ rol edx,1
+ add ecx,ebp
+ ror esi,2
+ mov ebp,edi
+ rol ebp,5
+ mov DWORD [esp],edx
+ lea edx,[1859775393+ecx*1+edx]
+ mov ecx,DWORD [4+esp]
+ add edx,ebp
+ ; 20_39 33
+ mov ebp,edi
+ xor ecx,DWORD [12+esp]
+ xor ebp,esi
+ xor ecx,DWORD [36+esp]
+ xor ebp,eax
+ xor ecx,DWORD [56+esp]
+ rol ecx,1
+ add ebx,ebp
+ ror edi,2
+ mov ebp,edx
+ rol ebp,5
+ mov DWORD [4+esp],ecx
+ lea ecx,[1859775393+ebx*1+ecx]
+ mov ebx,DWORD [8+esp]
+ add ecx,ebp
+ ; 20_39 34
+ mov ebp,edx
+ xor ebx,DWORD [16+esp]
+ xor ebp,edi
+ xor ebx,DWORD [40+esp]
+ xor ebp,esi
+ xor ebx,DWORD [60+esp]
+ rol ebx,1
+ add eax,ebp
+ ror edx,2
+ mov ebp,ecx
+ rol ebp,5
+ mov DWORD [8+esp],ebx
+ lea ebx,[1859775393+eax*1+ebx]
+ mov eax,DWORD [12+esp]
+ add ebx,ebp
+ ; 20_39 35
+ mov ebp,ecx
+ xor eax,DWORD [20+esp]
+ xor ebp,edx
+ xor eax,DWORD [44+esp]
+ xor ebp,edi
+ xor eax,DWORD [esp]
+ rol eax,1
+ add esi,ebp
+ ror ecx,2
+ mov ebp,ebx
+ rol ebp,5
+ mov DWORD [12+esp],eax
+ lea eax,[1859775393+esi*1+eax]
+ mov esi,DWORD [16+esp]
+ add eax,ebp
+ ; 20_39 36
+ mov ebp,ebx
+ xor esi,DWORD [24+esp]
+ xor ebp,ecx
+ xor esi,DWORD [48+esp]
+ xor ebp,edx
+ xor esi,DWORD [4+esp]
+ rol esi,1
+ add edi,ebp
+ ror ebx,2
+ mov ebp,eax
+ rol ebp,5
+ mov DWORD [16+esp],esi
+ lea esi,[1859775393+edi*1+esi]
+ mov edi,DWORD [20+esp]
+ add esi,ebp
+ ; 20_39 37
+ mov ebp,eax
+ xor edi,DWORD [28+esp]
+ xor ebp,ebx
+ xor edi,DWORD [52+esp]
+ xor ebp,ecx
+ xor edi,DWORD [8+esp]
+ rol edi,1
+ add edx,ebp
+ ror eax,2
+ mov ebp,esi
+ rol ebp,5
+ mov DWORD [20+esp],edi
+ lea edi,[1859775393+edx*1+edi]
+ mov edx,DWORD [24+esp]
+ add edi,ebp
+ ; 20_39 38
+ mov ebp,esi
+ xor edx,DWORD [32+esp]
+ xor ebp,eax
+ xor edx,DWORD [56+esp]
+ xor ebp,ebx
+ xor edx,DWORD [12+esp]
+ rol edx,1
+ add ecx,ebp
+ ror esi,2
+ mov ebp,edi
+ rol ebp,5
+ mov DWORD [24+esp],edx
+ lea edx,[1859775393+ecx*1+edx]
+ mov ecx,DWORD [28+esp]
+ add edx,ebp
+ ; 20_39 39
+ mov ebp,edi
+ xor ecx,DWORD [36+esp]
+ xor ebp,esi
+ xor ecx,DWORD [60+esp]
+ xor ebp,eax
+ xor ecx,DWORD [16+esp]
+ rol ecx,1
+ add ebx,ebp
+ ror edi,2
+ mov ebp,edx
+ rol ebp,5
+ mov DWORD [28+esp],ecx
+ lea ecx,[1859775393+ebx*1+ecx]
+ mov ebx,DWORD [32+esp]
+ add ecx,ebp
+ ; 40_59 40
+ mov ebp,edi
+ xor ebx,DWORD [40+esp]
+ xor ebp,esi
+ xor ebx,DWORD [esp]
+ and ebp,edx
+ xor ebx,DWORD [20+esp]
+ rol ebx,1
+ add ebp,eax
+ ror edx,2
+ mov eax,ecx
+ rol eax,5
+ mov DWORD [32+esp],ebx
+ lea ebx,[2400959708+ebp*1+ebx]
+ mov ebp,edi
+ add ebx,eax
+ and ebp,esi
+ mov eax,DWORD [36+esp]
+ add ebx,ebp
+ ; 40_59 41
+ mov ebp,edx
+ xor eax,DWORD [44+esp]
+ xor ebp,edi
+ xor eax,DWORD [4+esp]
+ and ebp,ecx
+ xor eax,DWORD [24+esp]
+ rol eax,1
+ add ebp,esi
+ ror ecx,2
+ mov esi,ebx
+ rol esi,5
+ mov DWORD [36+esp],eax
+ lea eax,[2400959708+ebp*1+eax]
+ mov ebp,edx
+ add eax,esi
+ and ebp,edi
+ mov esi,DWORD [40+esp]
+ add eax,ebp
+ ; 40_59 42
+ mov ebp,ecx
+ xor esi,DWORD [48+esp]
+ xor ebp,edx
+ xor esi,DWORD [8+esp]
+ and ebp,ebx
+ xor esi,DWORD [28+esp]
+ rol esi,1
+ add ebp,edi
+ ror ebx,2
+ mov edi,eax
+ rol edi,5
+ mov DWORD [40+esp],esi
+ lea esi,[2400959708+ebp*1+esi]
+ mov ebp,ecx
+ add esi,edi
+ and ebp,edx
+ mov edi,DWORD [44+esp]
+ add esi,ebp
+ ; 40_59 43
+ mov ebp,ebx
+ xor edi,DWORD [52+esp]
+ xor ebp,ecx
+ xor edi,DWORD [12+esp]
+ and ebp,eax
+ xor edi,DWORD [32+esp]
+ rol edi,1
+ add ebp,edx
+ ror eax,2
+ mov edx,esi
+ rol edx,5
+ mov DWORD [44+esp],edi
+ lea edi,[2400959708+ebp*1+edi]
+ mov ebp,ebx
+ add edi,edx
+ and ebp,ecx
+ mov edx,DWORD [48+esp]
+ add edi,ebp
+ ; 40_59 44
+ mov ebp,eax
+ xor edx,DWORD [56+esp]
+ xor ebp,ebx
+ xor edx,DWORD [16+esp]
+ and ebp,esi
+ xor edx,DWORD [36+esp]
+ rol edx,1
+ add ebp,ecx
+ ror esi,2
+ mov ecx,edi
+ rol ecx,5
+ mov DWORD [48+esp],edx
+ lea edx,[2400959708+ebp*1+edx]
+ mov ebp,eax
+ add edx,ecx
+ and ebp,ebx
+ mov ecx,DWORD [52+esp]
+ add edx,ebp
+ ; 40_59 45
+ mov ebp,esi
+ xor ecx,DWORD [60+esp]
+ xor ebp,eax
+ xor ecx,DWORD [20+esp]
+ and ebp,edi
+ xor ecx,DWORD [40+esp]
+ rol ecx,1
+ add ebp,ebx
+ ror edi,2
+ mov ebx,edx
+ rol ebx,5
+ mov DWORD [52+esp],ecx
+ lea ecx,[2400959708+ebp*1+ecx]
+ mov ebp,esi
+ add ecx,ebx
+ and ebp,eax
+ mov ebx,DWORD [56+esp]
+ add ecx,ebp
+ ; 40_59 46
+ mov ebp,edi
+ xor ebx,DWORD [esp]
+ xor ebp,esi
+ xor ebx,DWORD [24+esp]
+ and ebp,edx
+ xor ebx,DWORD [44+esp]
+ rol ebx,1
+ add ebp,eax
+ ror edx,2
+ mov eax,ecx
+ rol eax,5
+ mov DWORD [56+esp],ebx
+ lea ebx,[2400959708+ebp*1+ebx]
+ mov ebp,edi
+ add ebx,eax
+ and ebp,esi
+ mov eax,DWORD [60+esp]
+ add ebx,ebp
+ ; 40_59 47
+ mov ebp,edx
+ xor eax,DWORD [4+esp]
+ xor ebp,edi
+ xor eax,DWORD [28+esp]
+ and ebp,ecx
+ xor eax,DWORD [48+esp]
+ rol eax,1
+ add ebp,esi
+ ror ecx,2
+ mov esi,ebx
+ rol esi,5
+ mov DWORD [60+esp],eax
+ lea eax,[2400959708+ebp*1+eax]
+ mov ebp,edx
+ add eax,esi
+ and ebp,edi
+ mov esi,DWORD [esp]
+ add eax,ebp
+ ; 40_59 48
+ mov ebp,ecx
+ xor esi,DWORD [8+esp]
+ xor ebp,edx
+ xor esi,DWORD [32+esp]
+ and ebp,ebx
+ xor esi,DWORD [52+esp]
+ rol esi,1
+ add ebp,edi
+ ror ebx,2
+ mov edi,eax
+ rol edi,5
+ mov DWORD [esp],esi
+ lea esi,[2400959708+ebp*1+esi]
+ mov ebp,ecx
+ add esi,edi
+ and ebp,edx
+ mov edi,DWORD [4+esp]
+ add esi,ebp
+ ; 40_59 49
+ mov ebp,ebx
+ xor edi,DWORD [12+esp]
+ xor ebp,ecx
+ xor edi,DWORD [36+esp]
+ and ebp,eax
+ xor edi,DWORD [56+esp]
+ rol edi,1
+ add ebp,edx
+ ror eax,2
+ mov edx,esi
+ rol edx,5
+ mov DWORD [4+esp],edi
+ lea edi,[2400959708+ebp*1+edi]
+ mov ebp,ebx
+ add edi,edx
+ and ebp,ecx
+ mov edx,DWORD [8+esp]
+ add edi,ebp
+ ; 40_59 50
+ mov ebp,eax
+ xor edx,DWORD [16+esp]
+ xor ebp,ebx
+ xor edx,DWORD [40+esp]
+ and ebp,esi
+ xor edx,DWORD [60+esp]
+ rol edx,1
+ add ebp,ecx
+ ror esi,2
+ mov ecx,edi
+ rol ecx,5
+ mov DWORD [8+esp],edx
+ lea edx,[2400959708+ebp*1+edx]
+ mov ebp,eax
+ add edx,ecx
+ and ebp,ebx
+ mov ecx,DWORD [12+esp]
+ add edx,ebp
+ ; 40_59 51
+ mov ebp,esi
+ xor ecx,DWORD [20+esp]
+ xor ebp,eax
+ xor ecx,DWORD [44+esp]
+ and ebp,edi
+ xor ecx,DWORD [esp]
+ rol ecx,1
+ add ebp,ebx
+ ror edi,2
+ mov ebx,edx
+ rol ebx,5
+ mov DWORD [12+esp],ecx
+ lea ecx,[2400959708+ebp*1+ecx]
+ mov ebp,esi
+ add ecx,ebx
+ and ebp,eax
+ mov ebx,DWORD [16+esp]
+ add ecx,ebp
+ ; 40_59 52
+ mov ebp,edi
+ xor ebx,DWORD [24+esp]
+ xor ebp,esi
+ xor ebx,DWORD [48+esp]
+ and ebp,edx
+ xor ebx,DWORD [4+esp]
+ rol ebx,1
+ add ebp,eax
+ ror edx,2
+ mov eax,ecx
+ rol eax,5
+ mov DWORD [16+esp],ebx
+ lea ebx,[2400959708+ebp*1+ebx]
+ mov ebp,edi
+ add ebx,eax
+ and ebp,esi
+ mov eax,DWORD [20+esp]
+ add ebx,ebp
+ ; 40_59 53
+ mov ebp,edx
+ xor eax,DWORD [28+esp]
+ xor ebp,edi
+ xor eax,DWORD [52+esp]
+ and ebp,ecx
+ xor eax,DWORD [8+esp]
+ rol eax,1
+ add ebp,esi
+ ror ecx,2
+ mov esi,ebx
+ rol esi,5
+ mov DWORD [20+esp],eax
+ lea eax,[2400959708+ebp*1+eax]
+ mov ebp,edx
+ add eax,esi
+ and ebp,edi
+ mov esi,DWORD [24+esp]
+ add eax,ebp
+ ; 40_59 54
+ mov ebp,ecx
+ xor esi,DWORD [32+esp]
+ xor ebp,edx
+ xor esi,DWORD [56+esp]
+ and ebp,ebx
+ xor esi,DWORD [12+esp]
+ rol esi,1
+ add ebp,edi
+ ror ebx,2
+ mov edi,eax
+ rol edi,5
+ mov DWORD [24+esp],esi
+ lea esi,[2400959708+ebp*1+esi]
+ mov ebp,ecx
+ add esi,edi
+ and ebp,edx
+ mov edi,DWORD [28+esp]
+ add esi,ebp
+ ; 40_59 55
+ mov ebp,ebx
+ xor edi,DWORD [36+esp]
+ xor ebp,ecx
+ xor edi,DWORD [60+esp]
+ and ebp,eax
+ xor edi,DWORD [16+esp]
+ rol edi,1
+ add ebp,edx
+ ror eax,2
+ mov edx,esi
+ rol edx,5
+ mov DWORD [28+esp],edi
+ lea edi,[2400959708+ebp*1+edi]
+ mov ebp,ebx
+ add edi,edx
+ and ebp,ecx
+ mov edx,DWORD [32+esp]
+ add edi,ebp
+ ; 40_59 56
+ mov ebp,eax
+ xor edx,DWORD [40+esp]
+ xor ebp,ebx
+ xor edx,DWORD [esp]
+ and ebp,esi
+ xor edx,DWORD [20+esp]
+ rol edx,1
+ add ebp,ecx
+ ror esi,2
+ mov ecx,edi
+ rol ecx,5
+ mov DWORD [32+esp],edx
+ lea edx,[2400959708+ebp*1+edx]
+ mov ebp,eax
+ add edx,ecx
+ and ebp,ebx
+ mov ecx,DWORD [36+esp]
+ add edx,ebp
+ ; 40_59 57
+ mov ebp,esi
+ xor ecx,DWORD [44+esp]
+ xor ebp,eax
+ xor ecx,DWORD [4+esp]
+ and ebp,edi
+ xor ecx,DWORD [24+esp]
+ rol ecx,1
+ add ebp,ebx
+ ror edi,2
+ mov ebx,edx
+ rol ebx,5
+ mov DWORD [36+esp],ecx
+ lea ecx,[2400959708+ebp*1+ecx]
+ mov ebp,esi
+ add ecx,ebx
+ and ebp,eax
+ mov ebx,DWORD [40+esp]
+ add ecx,ebp
+ ; 40_59 58
+ mov ebp,edi
+ xor ebx,DWORD [48+esp]
+ xor ebp,esi
+ xor ebx,DWORD [8+esp]
+ and ebp,edx
+ xor ebx,DWORD [28+esp]
+ rol ebx,1
+ add ebp,eax
+ ror edx,2
+ mov eax,ecx
+ rol eax,5
+ mov DWORD [40+esp],ebx
+ lea ebx,[2400959708+ebp*1+ebx]
+ mov ebp,edi
+ add ebx,eax
+ and ebp,esi
+ mov eax,DWORD [44+esp]
+ add ebx,ebp
+ ; 40_59 59
+ mov ebp,edx
+ xor eax,DWORD [52+esp]
+ xor ebp,edi
+ xor eax,DWORD [12+esp]
+ and ebp,ecx
+ xor eax,DWORD [32+esp]
+ rol eax,1
+ add ebp,esi
+ ror ecx,2
+ mov esi,ebx
+ rol esi,5
+ mov DWORD [44+esp],eax
+ lea eax,[2400959708+ebp*1+eax]
+ mov ebp,edx
+ add eax,esi
+ and ebp,edi
+ mov esi,DWORD [48+esp]
+ add eax,ebp
+ ; 20_39 60
+ mov ebp,ebx
+ xor esi,DWORD [56+esp]
+ xor ebp,ecx
+ xor esi,DWORD [16+esp]
+ xor ebp,edx
+ xor esi,DWORD [36+esp]
+ rol esi,1
+ add edi,ebp
+ ror ebx,2
+ mov ebp,eax
+ rol ebp,5
+ mov DWORD [48+esp],esi
+ lea esi,[3395469782+edi*1+esi]
+ mov edi,DWORD [52+esp]
+ add esi,ebp
+ ; 20_39 61
+ mov ebp,eax
+ xor edi,DWORD [60+esp]
+ xor ebp,ebx
+ xor edi,DWORD [20+esp]
+ xor ebp,ecx
+ xor edi,DWORD [40+esp]
+ rol edi,1
+ add edx,ebp
+ ror eax,2
+ mov ebp,esi
+ rol ebp,5
+ mov DWORD [52+esp],edi
+ lea edi,[3395469782+edx*1+edi]
+ mov edx,DWORD [56+esp]
+ add edi,ebp
+ ; 20_39 62
+ mov ebp,esi
+ xor edx,DWORD [esp]
+ xor ebp,eax
+ xor edx,DWORD [24+esp]
+ xor ebp,ebx
+ xor edx,DWORD [44+esp]
+ rol edx,1
+ add ecx,ebp
+ ror esi,2
+ mov ebp,edi
+ rol ebp,5
+ mov DWORD [56+esp],edx
+ lea edx,[3395469782+ecx*1+edx]
+ mov ecx,DWORD [60+esp]
+ add edx,ebp
+ ; 20_39 63
+ mov ebp,edi
+ xor ecx,DWORD [4+esp]
+ xor ebp,esi
+ xor ecx,DWORD [28+esp]
+ xor ebp,eax
+ xor ecx,DWORD [48+esp]
+ rol ecx,1
+ add ebx,ebp
+ ror edi,2
+ mov ebp,edx
+ rol ebp,5
+ mov DWORD [60+esp],ecx
+ lea ecx,[3395469782+ebx*1+ecx]
+ mov ebx,DWORD [esp]
+ add ecx,ebp
+ ; 20_39 64
+ mov ebp,edx
+ xor ebx,DWORD [8+esp]
+ xor ebp,edi
+ xor ebx,DWORD [32+esp]
+ xor ebp,esi
+ xor ebx,DWORD [52+esp]
+ rol ebx,1
+ add eax,ebp
+ ror edx,2
+ mov ebp,ecx
+ rol ebp,5
+ mov DWORD [esp],ebx
+ lea ebx,[3395469782+eax*1+ebx]
+ mov eax,DWORD [4+esp]
+ add ebx,ebp
+ ; 20_39 65
+ mov ebp,ecx
+ xor eax,DWORD [12+esp]
+ xor ebp,edx
+ xor eax,DWORD [36+esp]
+ xor ebp,edi
+ xor eax,DWORD [56+esp]
+ rol eax,1
+ add esi,ebp
+ ror ecx,2
+ mov ebp,ebx
+ rol ebp,5
+ mov DWORD [4+esp],eax
+ lea eax,[3395469782+esi*1+eax]
+ mov esi,DWORD [8+esp]
+ add eax,ebp
+ ; 20_39 66
+ mov ebp,ebx
+ xor esi,DWORD [16+esp]
+ xor ebp,ecx
+ xor esi,DWORD [40+esp]
+ xor ebp,edx
+ xor esi,DWORD [60+esp]
+ rol esi,1
+ add edi,ebp
+ ror ebx,2
+ mov ebp,eax
+ rol ebp,5
+ mov DWORD [8+esp],esi
+ lea esi,[3395469782+edi*1+esi]
+ mov edi,DWORD [12+esp]
+ add esi,ebp
+ ; 20_39 67
+ mov ebp,eax
+ xor edi,DWORD [20+esp]
+ xor ebp,ebx
+ xor edi,DWORD [44+esp]
+ xor ebp,ecx
+ xor edi,DWORD [esp]
+ rol edi,1
+ add edx,ebp
+ ror eax,2
+ mov ebp,esi
+ rol ebp,5
+ mov DWORD [12+esp],edi
+ lea edi,[3395469782+edx*1+edi]
+ mov edx,DWORD [16+esp]
+ add edi,ebp
+ ; 20_39 68
+ mov ebp,esi
+ xor edx,DWORD [24+esp]
+ xor ebp,eax
+ xor edx,DWORD [48+esp]
+ xor ebp,ebx
+ xor edx,DWORD [4+esp]
+ rol edx,1
+ add ecx,ebp
+ ror esi,2
+ mov ebp,edi
+ rol ebp,5
+ mov DWORD [16+esp],edx
+ lea edx,[3395469782+ecx*1+edx]
+ mov ecx,DWORD [20+esp]
+ add edx,ebp
+ ; 20_39 69
+ mov ebp,edi
+ xor ecx,DWORD [28+esp]
+ xor ebp,esi
+ xor ecx,DWORD [52+esp]
+ xor ebp,eax
+ xor ecx,DWORD [8+esp]
+ rol ecx,1
+ add ebx,ebp
+ ror edi,2
+ mov ebp,edx
+ rol ebp,5
+ mov DWORD [20+esp],ecx
+ lea ecx,[3395469782+ebx*1+ecx]
+ mov ebx,DWORD [24+esp]
+ add ecx,ebp
+ ; 20_39 70
+ mov ebp,edx
+ xor ebx,DWORD [32+esp]
+ xor ebp,edi
+ xor ebx,DWORD [56+esp]
+ xor ebp,esi
+ xor ebx,DWORD [12+esp]
+ rol ebx,1
+ add eax,ebp
+ ror edx,2
+ mov ebp,ecx
+ rol ebp,5
+ mov DWORD [24+esp],ebx
+ lea ebx,[3395469782+eax*1+ebx]
+ mov eax,DWORD [28+esp]
+ add ebx,ebp
+ ; 20_39 71
+ mov ebp,ecx
+ xor eax,DWORD [36+esp]
+ xor ebp,edx
+ xor eax,DWORD [60+esp]
+ xor ebp,edi
+ xor eax,DWORD [16+esp]
+ rol eax,1
+ add esi,ebp
+ ror ecx,2
+ mov ebp,ebx
+ rol ebp,5
+ mov DWORD [28+esp],eax
+ lea eax,[3395469782+esi*1+eax]
+ mov esi,DWORD [32+esp]
+ add eax,ebp
+ ; 20_39 72
+ mov ebp,ebx
+ xor esi,DWORD [40+esp]
+ xor ebp,ecx
+ xor esi,DWORD [esp]
+ xor ebp,edx
+ xor esi,DWORD [20+esp]
+ rol esi,1
+ add edi,ebp
+ ror ebx,2
+ mov ebp,eax
+ rol ebp,5
+ mov DWORD [32+esp],esi
+ lea esi,[3395469782+edi*1+esi]
+ mov edi,DWORD [36+esp]
+ add esi,ebp
+ ; 20_39 73
+ mov ebp,eax
+ xor edi,DWORD [44+esp]
+ xor ebp,ebx
+ xor edi,DWORD [4+esp]
+ xor ebp,ecx
+ xor edi,DWORD [24+esp]
+ rol edi,1
+ add edx,ebp
+ ror eax,2
+ mov ebp,esi
+ rol ebp,5
+ mov DWORD [36+esp],edi
+ lea edi,[3395469782+edx*1+edi]
+ mov edx,DWORD [40+esp]
+ add edi,ebp
+ ; 20_39 74
+ mov ebp,esi
+ xor edx,DWORD [48+esp]
+ xor ebp,eax
+ xor edx,DWORD [8+esp]
+ xor ebp,ebx
+ xor edx,DWORD [28+esp]
+ rol edx,1
+ add ecx,ebp
+ ror esi,2
+ mov ebp,edi
+ rol ebp,5
+ mov DWORD [40+esp],edx
+ lea edx,[3395469782+ecx*1+edx]
+ mov ecx,DWORD [44+esp]
+ add edx,ebp
+ ; 20_39 75
+ mov ebp,edi
+ xor ecx,DWORD [52+esp]
+ xor ebp,esi
+ xor ecx,DWORD [12+esp]
+ xor ebp,eax
+ xor ecx,DWORD [32+esp]
+ rol ecx,1
+ add ebx,ebp
+ ror edi,2
+ mov ebp,edx
+ rol ebp,5
+ mov DWORD [44+esp],ecx
+ lea ecx,[3395469782+ebx*1+ecx]
+ mov ebx,DWORD [48+esp]
+ add ecx,ebp
+ ; 20_39 76
+ mov ebp,edx
+ xor ebx,DWORD [56+esp]
+ xor ebp,edi
+ xor ebx,DWORD [16+esp]
+ xor ebp,esi
+ xor ebx,DWORD [36+esp]
+ rol ebx,1
+ add eax,ebp
+ ror edx,2
+ mov ebp,ecx
+ rol ebp,5
+ mov DWORD [48+esp],ebx
+ lea ebx,[3395469782+eax*1+ebx]
+ mov eax,DWORD [52+esp]
+ add ebx,ebp
+ ; 20_39 77
+ mov ebp,ecx
+ xor eax,DWORD [60+esp]
+ xor ebp,edx
+ xor eax,DWORD [20+esp]
+ xor ebp,edi
+ xor eax,DWORD [40+esp]
+ rol eax,1
+ add esi,ebp
+ ror ecx,2
+ mov ebp,ebx
+ rol ebp,5
+ lea eax,[3395469782+esi*1+eax]
+ mov esi,DWORD [56+esp]
+ add eax,ebp
+ ; 20_39 78
+ mov ebp,ebx
+ xor esi,DWORD [esp]
+ xor ebp,ecx
+ xor esi,DWORD [24+esp]
+ xor ebp,edx
+ xor esi,DWORD [44+esp]
+ rol esi,1
+ add edi,ebp
+ ror ebx,2
+ mov ebp,eax
+ rol ebp,5
+ lea esi,[3395469782+edi*1+esi]
+ mov edi,DWORD [60+esp]
+ add esi,ebp
+ ; 20_39 79
+ mov ebp,eax
+ xor edi,DWORD [4+esp]
+ xor ebp,ebx
+ xor edi,DWORD [28+esp]
+ xor ebp,ecx
+ xor edi,DWORD [48+esp]
+ rol edi,1
+ add edx,ebp
+ ror eax,2
+ mov ebp,esi
+ rol ebp,5
+ lea edi,[3395469782+edx*1+edi]
+ add edi,ebp
+ mov ebp,DWORD [96+esp]
+ mov edx,DWORD [100+esp]
+ add edi,DWORD [ebp]
+ add esi,DWORD [4+ebp]
+ add eax,DWORD [8+ebp]
+ add ebx,DWORD [12+ebp]
+ add ecx,DWORD [16+ebp]
+ mov DWORD [ebp],edi
+ add edx,64
+ mov DWORD [4+ebp],esi
+ cmp edx,DWORD [104+esp]
+ mov DWORD [8+ebp],eax
+ mov edi,ecx
+ mov DWORD [12+ebp],ebx
+ mov esi,edx
+ mov DWORD [16+ebp],ecx
+ jb NEAR L$000loop
+ add esp,76
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _sha1_block_data_order_ssse3
+align 16
+_sha1_block_data_order_ssse3:
+L$_sha1_block_data_order_ssse3_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ call L$001pic_point
+L$001pic_point:
+ pop ebp
+ lea ebp,[(L$K_XX_XX-L$001pic_point)+ebp]
+ movdqa xmm7,[ebp]
+ movdqa xmm0,[16+ebp]
+ movdqa xmm1,[32+ebp]
+ movdqa xmm2,[48+ebp]
+ movdqa xmm6,[64+ebp]
+ mov edi,DWORD [20+esp]
+ mov ebp,DWORD [24+esp]
+ mov edx,DWORD [28+esp]
+ mov esi,esp
+ sub esp,208
+ and esp,-64
+ movdqa [112+esp],xmm0
+ movdqa [128+esp],xmm1
+ movdqa [144+esp],xmm2
+ shl edx,6
+ movdqa [160+esp],xmm7
+ add edx,ebp
+ movdqa [176+esp],xmm6
+ add ebp,64
+ mov DWORD [192+esp],edi
+ mov DWORD [196+esp],ebp
+ mov DWORD [200+esp],edx
+ mov DWORD [204+esp],esi
+ mov eax,DWORD [edi]
+ mov ebx,DWORD [4+edi]
+ mov ecx,DWORD [8+edi]
+ mov edx,DWORD [12+edi]
+ mov edi,DWORD [16+edi]
+ mov esi,ebx
+ movdqu xmm0,[ebp-64]
+ movdqu xmm1,[ebp-48]
+ movdqu xmm2,[ebp-32]
+ movdqu xmm3,[ebp-16]
+db 102,15,56,0,198
+db 102,15,56,0,206
+db 102,15,56,0,214
+ movdqa [96+esp],xmm7
+db 102,15,56,0,222
+ paddd xmm0,xmm7
+ paddd xmm1,xmm7
+ paddd xmm2,xmm7
+ movdqa [esp],xmm0
+ psubd xmm0,xmm7
+ movdqa [16+esp],xmm1
+ psubd xmm1,xmm7
+ movdqa [32+esp],xmm2
+ mov ebp,ecx
+ psubd xmm2,xmm7
+ xor ebp,edx
+ pshufd xmm4,xmm0,238
+ and esi,ebp
+ jmp NEAR L$002loop
+align 16
+L$002loop:
+ ror ebx,2
+ xor esi,edx
+ mov ebp,eax
+ punpcklqdq xmm4,xmm1
+ movdqa xmm6,xmm3
+ add edi,DWORD [esp]
+ xor ebx,ecx
+ paddd xmm7,xmm3
+ movdqa [64+esp],xmm0
+ rol eax,5
+ add edi,esi
+ psrldq xmm6,4
+ and ebp,ebx
+ xor ebx,ecx
+ pxor xmm4,xmm0
+ add edi,eax
+ ror eax,7
+ pxor xmm6,xmm2
+ xor ebp,ecx
+ mov esi,edi
+ add edx,DWORD [4+esp]
+ pxor xmm4,xmm6
+ xor eax,ebx
+ rol edi,5
+ movdqa [48+esp],xmm7
+ add edx,ebp
+ and esi,eax
+ movdqa xmm0,xmm4
+ xor eax,ebx
+ add edx,edi
+ ror edi,7
+ movdqa xmm6,xmm4
+ xor esi,ebx
+ pslldq xmm0,12
+ paddd xmm4,xmm4
+ mov ebp,edx
+ add ecx,DWORD [8+esp]
+ psrld xmm6,31
+ xor edi,eax
+ rol edx,5
+ movdqa xmm7,xmm0
+ add ecx,esi
+ and ebp,edi
+ xor edi,eax
+ psrld xmm0,30
+ add ecx,edx
+ ror edx,7
+ por xmm4,xmm6
+ xor ebp,eax
+ mov esi,ecx
+ add ebx,DWORD [12+esp]
+ pslld xmm7,2
+ xor edx,edi
+ rol ecx,5
+ pxor xmm4,xmm0
+ movdqa xmm0,[96+esp]
+ add ebx,ebp
+ and esi,edx
+ pxor xmm4,xmm7
+ pshufd xmm5,xmm1,238
+ xor edx,edi
+ add ebx,ecx
+ ror ecx,7
+ xor esi,edi
+ mov ebp,ebx
+ punpcklqdq xmm5,xmm2
+ movdqa xmm7,xmm4
+ add eax,DWORD [16+esp]
+ xor ecx,edx
+ paddd xmm0,xmm4
+ movdqa [80+esp],xmm1
+ rol ebx,5
+ add eax,esi
+ psrldq xmm7,4
+ and ebp,ecx
+ xor ecx,edx
+ pxor xmm5,xmm1
+ add eax,ebx
+ ror ebx,7
+ pxor xmm7,xmm3
+ xor ebp,edx
+ mov esi,eax
+ add edi,DWORD [20+esp]
+ pxor xmm5,xmm7
+ xor ebx,ecx
+ rol eax,5
+ movdqa [esp],xmm0
+ add edi,ebp
+ and esi,ebx
+ movdqa xmm1,xmm5
+ xor ebx,ecx
+ add edi,eax
+ ror eax,7
+ movdqa xmm7,xmm5
+ xor esi,ecx
+ pslldq xmm1,12
+ paddd xmm5,xmm5
+ mov ebp,edi
+ add edx,DWORD [24+esp]
+ psrld xmm7,31
+ xor eax,ebx
+ rol edi,5
+ movdqa xmm0,xmm1
+ add edx,esi
+ and ebp,eax
+ xor eax,ebx
+ psrld xmm1,30
+ add edx,edi
+ ror edi,7
+ por xmm5,xmm7
+ xor ebp,ebx
+ mov esi,edx
+ add ecx,DWORD [28+esp]
+ pslld xmm0,2
+ xor edi,eax
+ rol edx,5
+ pxor xmm5,xmm1
+ movdqa xmm1,[112+esp]
+ add ecx,ebp
+ and esi,edi
+ pxor xmm5,xmm0
+ pshufd xmm6,xmm2,238
+ xor edi,eax
+ add ecx,edx
+ ror edx,7
+ xor esi,eax
+ mov ebp,ecx
+ punpcklqdq xmm6,xmm3
+ movdqa xmm0,xmm5
+ add ebx,DWORD [32+esp]
+ xor edx,edi
+ paddd xmm1,xmm5
+ movdqa [96+esp],xmm2
+ rol ecx,5
+ add ebx,esi
+ psrldq xmm0,4
+ and ebp,edx
+ xor edx,edi
+ pxor xmm6,xmm2
+ add ebx,ecx
+ ror ecx,7
+ pxor xmm0,xmm4
+ xor ebp,edi
+ mov esi,ebx
+ add eax,DWORD [36+esp]
+ pxor xmm6,xmm0
+ xor ecx,edx
+ rol ebx,5
+ movdqa [16+esp],xmm1
+ add eax,ebp
+ and esi,ecx
+ movdqa xmm2,xmm6
+ xor ecx,edx
+ add eax,ebx
+ ror ebx,7
+ movdqa xmm0,xmm6
+ xor esi,edx
+ pslldq xmm2,12
+ paddd xmm6,xmm6
+ mov ebp,eax
+ add edi,DWORD [40+esp]
+ psrld xmm0,31
+ xor ebx,ecx
+ rol eax,5
+ movdqa xmm1,xmm2
+ add edi,esi
+ and ebp,ebx
+ xor ebx,ecx
+ psrld xmm2,30
+ add edi,eax
+ ror eax,7
+ por xmm6,xmm0
+ xor ebp,ecx
+ movdqa xmm0,[64+esp]
+ mov esi,edi
+ add edx,DWORD [44+esp]
+ pslld xmm1,2
+ xor eax,ebx
+ rol edi,5
+ pxor xmm6,xmm2
+ movdqa xmm2,[112+esp]
+ add edx,ebp
+ and esi,eax
+ pxor xmm6,xmm1
+ pshufd xmm7,xmm3,238
+ xor eax,ebx
+ add edx,edi
+ ror edi,7
+ xor esi,ebx
+ mov ebp,edx
+ punpcklqdq xmm7,xmm4
+ movdqa xmm1,xmm6
+ add ecx,DWORD [48+esp]
+ xor edi,eax
+ paddd xmm2,xmm6
+ movdqa [64+esp],xmm3
+ rol edx,5
+ add ecx,esi
+ psrldq xmm1,4
+ and ebp,edi
+ xor edi,eax
+ pxor xmm7,xmm3
+ add ecx,edx
+ ror edx,7
+ pxor xmm1,xmm5
+ xor ebp,eax
+ mov esi,ecx
+ add ebx,DWORD [52+esp]
+ pxor xmm7,xmm1
+ xor edx,edi
+ rol ecx,5
+ movdqa [32+esp],xmm2
+ add ebx,ebp
+ and esi,edx
+ movdqa xmm3,xmm7
+ xor edx,edi
+ add ebx,ecx
+ ror ecx,7
+ movdqa xmm1,xmm7
+ xor esi,edi
+ pslldq xmm3,12
+ paddd xmm7,xmm7
+ mov ebp,ebx
+ add eax,DWORD [56+esp]
+ psrld xmm1,31
+ xor ecx,edx
+ rol ebx,5
+ movdqa xmm2,xmm3
+ add eax,esi
+ and ebp,ecx
+ xor ecx,edx
+ psrld xmm3,30
+ add eax,ebx
+ ror ebx,7
+ por xmm7,xmm1
+ xor ebp,edx
+ movdqa xmm1,[80+esp]
+ mov esi,eax
+ add edi,DWORD [60+esp]
+ pslld xmm2,2
+ xor ebx,ecx
+ rol eax,5
+ pxor xmm7,xmm3
+ movdqa xmm3,[112+esp]
+ add edi,ebp
+ and esi,ebx
+ pxor xmm7,xmm2
+ pshufd xmm2,xmm6,238
+ xor ebx,ecx
+ add edi,eax
+ ror eax,7
+ pxor xmm0,xmm4
+ punpcklqdq xmm2,xmm7
+ xor esi,ecx
+ mov ebp,edi
+ add edx,DWORD [esp]
+ pxor xmm0,xmm1
+ movdqa [80+esp],xmm4
+ xor eax,ebx
+ rol edi,5
+ movdqa xmm4,xmm3
+ add edx,esi
+ paddd xmm3,xmm7
+ and ebp,eax
+ pxor xmm0,xmm2
+ xor eax,ebx
+ add edx,edi
+ ror edi,7
+ xor ebp,ebx
+ movdqa xmm2,xmm0
+ movdqa [48+esp],xmm3
+ mov esi,edx
+ add ecx,DWORD [4+esp]
+ xor edi,eax
+ rol edx,5
+ pslld xmm0,2
+ add ecx,ebp
+ and esi,edi
+ psrld xmm2,30
+ xor edi,eax
+ add ecx,edx
+ ror edx,7
+ xor esi,eax
+ mov ebp,ecx
+ add ebx,DWORD [8+esp]
+ xor edx,edi
+ rol ecx,5
+ por xmm0,xmm2
+ add ebx,esi
+ and ebp,edx
+ movdqa xmm2,[96+esp]
+ xor edx,edi
+ add ebx,ecx
+ add eax,DWORD [12+esp]
+ xor ebp,edi
+ mov esi,ebx
+ pshufd xmm3,xmm7,238
+ rol ebx,5
+ add eax,ebp
+ xor esi,edx
+ ror ecx,7
+ add eax,ebx
+ add edi,DWORD [16+esp]
+ pxor xmm1,xmm5
+ punpcklqdq xmm3,xmm0
+ xor esi,ecx
+ mov ebp,eax
+ rol eax,5
+ pxor xmm1,xmm2
+ movdqa [96+esp],xmm5
+ add edi,esi
+ xor ebp,ecx
+ movdqa xmm5,xmm4
+ ror ebx,7
+ paddd xmm4,xmm0
+ add edi,eax
+ pxor xmm1,xmm3
+ add edx,DWORD [20+esp]
+ xor ebp,ebx
+ mov esi,edi
+ rol edi,5
+ movdqa xmm3,xmm1
+ movdqa [esp],xmm4
+ add edx,ebp
+ xor esi,ebx
+ ror eax,7
+ add edx,edi
+ pslld xmm1,2
+ add ecx,DWORD [24+esp]
+ xor esi,eax
+ psrld xmm3,30
+ mov ebp,edx
+ rol edx,5
+ add ecx,esi
+ xor ebp,eax
+ ror edi,7
+ add ecx,edx
+ por xmm1,xmm3
+ add ebx,DWORD [28+esp]
+ xor ebp,edi
+ movdqa xmm3,[64+esp]
+ mov esi,ecx
+ rol ecx,5
+ add ebx,ebp
+ xor esi,edi
+ ror edx,7
+ pshufd xmm4,xmm0,238
+ add ebx,ecx
+ add eax,DWORD [32+esp]
+ pxor xmm2,xmm6
+ punpcklqdq xmm4,xmm1
+ xor esi,edx
+ mov ebp,ebx
+ rol ebx,5
+ pxor xmm2,xmm3
+ movdqa [64+esp],xmm6
+ add eax,esi
+ xor ebp,edx
+ movdqa xmm6,[128+esp]
+ ror ecx,7
+ paddd xmm5,xmm1
+ add eax,ebx
+ pxor xmm2,xmm4
+ add edi,DWORD [36+esp]
+ xor ebp,ecx
+ mov esi,eax
+ rol eax,5
+ movdqa xmm4,xmm2
+ movdqa [16+esp],xmm5
+ add edi,ebp
+ xor esi,ecx
+ ror ebx,7
+ add edi,eax
+ pslld xmm2,2
+ add edx,DWORD [40+esp]
+ xor esi,ebx
+ psrld xmm4,30
+ mov ebp,edi
+ rol edi,5
+ add edx,esi
+ xor ebp,ebx
+ ror eax,7
+ add edx,edi
+ por xmm2,xmm4
+ add ecx,DWORD [44+esp]
+ xor ebp,eax
+ movdqa xmm4,[80+esp]
+ mov esi,edx
+ rol edx,5
+ add ecx,ebp
+ xor esi,eax
+ ror edi,7
+ pshufd xmm5,xmm1,238
+ add ecx,edx
+ add ebx,DWORD [48+esp]
+ pxor xmm3,xmm7
+ punpcklqdq xmm5,xmm2
+ xor esi,edi
+ mov ebp,ecx
+ rol ecx,5
+ pxor xmm3,xmm4
+ movdqa [80+esp],xmm7
+ add ebx,esi
+ xor ebp,edi
+ movdqa xmm7,xmm6
+ ror edx,7
+ paddd xmm6,xmm2
+ add ebx,ecx
+ pxor xmm3,xmm5
+ add eax,DWORD [52+esp]
+ xor ebp,edx
+ mov esi,ebx
+ rol ebx,5
+ movdqa xmm5,xmm3
+ movdqa [32+esp],xmm6
+ add eax,ebp
+ xor esi,edx
+ ror ecx,7
+ add eax,ebx
+ pslld xmm3,2
+ add edi,DWORD [56+esp]
+ xor esi,ecx
+ psrld xmm5,30
+ mov ebp,eax
+ rol eax,5
+ add edi,esi
+ xor ebp,ecx
+ ror ebx,7
+ add edi,eax
+ por xmm3,xmm5
+ add edx,DWORD [60+esp]
+ xor ebp,ebx
+ movdqa xmm5,[96+esp]
+ mov esi,edi
+ rol edi,5
+ add edx,ebp
+ xor esi,ebx
+ ror eax,7
+ pshufd xmm6,xmm2,238
+ add edx,edi
+ add ecx,DWORD [esp]
+ pxor xmm4,xmm0
+ punpcklqdq xmm6,xmm3
+ xor esi,eax
+ mov ebp,edx
+ rol edx,5
+ pxor xmm4,xmm5
+ movdqa [96+esp],xmm0
+ add ecx,esi
+ xor ebp,eax
+ movdqa xmm0,xmm7
+ ror edi,7
+ paddd xmm7,xmm3
+ add ecx,edx
+ pxor xmm4,xmm6
+ add ebx,DWORD [4+esp]
+ xor ebp,edi
+ mov esi,ecx
+ rol ecx,5
+ movdqa xmm6,xmm4
+ movdqa [48+esp],xmm7
+ add ebx,ebp
+ xor esi,edi
+ ror edx,7
+ add ebx,ecx
+ pslld xmm4,2
+ add eax,DWORD [8+esp]
+ xor esi,edx
+ psrld xmm6,30
+ mov ebp,ebx
+ rol ebx,5
+ add eax,esi
+ xor ebp,edx
+ ror ecx,7
+ add eax,ebx
+ por xmm4,xmm6
+ add edi,DWORD [12+esp]
+ xor ebp,ecx
+ movdqa xmm6,[64+esp]
+ mov esi,eax
+ rol eax,5
+ add edi,ebp
+ xor esi,ecx
+ ror ebx,7
+ pshufd xmm7,xmm3,238
+ add edi,eax
+ add edx,DWORD [16+esp]
+ pxor xmm5,xmm1
+ punpcklqdq xmm7,xmm4
+ xor esi,ebx
+ mov ebp,edi
+ rol edi,5
+ pxor xmm5,xmm6
+ movdqa [64+esp],xmm1
+ add edx,esi
+ xor ebp,ebx
+ movdqa xmm1,xmm0
+ ror eax,7
+ paddd xmm0,xmm4
+ add edx,edi
+ pxor xmm5,xmm7
+ add ecx,DWORD [20+esp]
+ xor ebp,eax
+ mov esi,edx
+ rol edx,5
+ movdqa xmm7,xmm5
+ movdqa [esp],xmm0
+ add ecx,ebp
+ xor esi,eax
+ ror edi,7
+ add ecx,edx
+ pslld xmm5,2
+ add ebx,DWORD [24+esp]
+ xor esi,edi
+ psrld xmm7,30
+ mov ebp,ecx
+ rol ecx,5
+ add ebx,esi
+ xor ebp,edi
+ ror edx,7
+ add ebx,ecx
+ por xmm5,xmm7
+ add eax,DWORD [28+esp]
+ movdqa xmm7,[80+esp]
+ ror ecx,7
+ mov esi,ebx
+ xor ebp,edx
+ rol ebx,5
+ pshufd xmm0,xmm4,238
+ add eax,ebp
+ xor esi,ecx
+ xor ecx,edx
+ add eax,ebx
+ add edi,DWORD [32+esp]
+ pxor xmm6,xmm2
+ punpcklqdq xmm0,xmm5
+ and esi,ecx
+ xor ecx,edx
+ ror ebx,7
+ pxor xmm6,xmm7
+ movdqa [80+esp],xmm2
+ mov ebp,eax
+ xor esi,ecx
+ rol eax,5
+ movdqa xmm2,xmm1
+ add edi,esi
+ paddd xmm1,xmm5
+ xor ebp,ebx
+ pxor xmm6,xmm0
+ xor ebx,ecx
+ add edi,eax
+ add edx,DWORD [36+esp]
+ and ebp,ebx
+ movdqa xmm0,xmm6
+ movdqa [16+esp],xmm1
+ xor ebx,ecx
+ ror eax,7
+ mov esi,edi
+ xor ebp,ebx
+ rol edi,5
+ pslld xmm6,2
+ add edx,ebp
+ xor esi,eax
+ psrld xmm0,30
+ xor eax,ebx
+ add edx,edi
+ add ecx,DWORD [40+esp]
+ and esi,eax
+ xor eax,ebx
+ ror edi,7
+ por xmm6,xmm0
+ mov ebp,edx
+ xor esi,eax
+ movdqa xmm0,[96+esp]
+ rol edx,5
+ add ecx,esi
+ xor ebp,edi
+ xor edi,eax
+ add ecx,edx
+ pshufd xmm1,xmm5,238
+ add ebx,DWORD [44+esp]
+ and ebp,edi
+ xor edi,eax
+ ror edx,7
+ mov esi,ecx
+ xor ebp,edi
+ rol ecx,5
+ add ebx,ebp
+ xor esi,edx
+ xor edx,edi
+ add ebx,ecx
+ add eax,DWORD [48+esp]
+ pxor xmm7,xmm3
+ punpcklqdq xmm1,xmm6
+ and esi,edx
+ xor edx,edi
+ ror ecx,7
+ pxor xmm7,xmm0
+ movdqa [96+esp],xmm3
+ mov ebp,ebx
+ xor esi,edx
+ rol ebx,5
+ movdqa xmm3,[144+esp]
+ add eax,esi
+ paddd xmm2,xmm6
+ xor ebp,ecx
+ pxor xmm7,xmm1
+ xor ecx,edx
+ add eax,ebx
+ add edi,DWORD [52+esp]
+ and ebp,ecx
+ movdqa xmm1,xmm7
+ movdqa [32+esp],xmm2
+ xor ecx,edx
+ ror ebx,7
+ mov esi,eax
+ xor ebp,ecx
+ rol eax,5
+ pslld xmm7,2
+ add edi,ebp
+ xor esi,ebx
+ psrld xmm1,30
+ xor ebx,ecx
+ add edi,eax
+ add edx,DWORD [56+esp]
+ and esi,ebx
+ xor ebx,ecx
+ ror eax,7
+ por xmm7,xmm1
+ mov ebp,edi
+ xor esi,ebx
+ movdqa xmm1,[64+esp]
+ rol edi,5
+ add edx,esi
+ xor ebp,eax
+ xor eax,ebx
+ add edx,edi
+ pshufd xmm2,xmm6,238
+ add ecx,DWORD [60+esp]
+ and ebp,eax
+ xor eax,ebx
+ ror edi,7
+ mov esi,edx
+ xor ebp,eax
+ rol edx,5
+ add ecx,ebp
+ xor esi,edi
+ xor edi,eax
+ add ecx,edx
+ add ebx,DWORD [esp]
+ pxor xmm0,xmm4
+ punpcklqdq xmm2,xmm7
+ and esi,edi
+ xor edi,eax
+ ror edx,7
+ pxor xmm0,xmm1
+ movdqa [64+esp],xmm4
+ mov ebp,ecx
+ xor esi,edi
+ rol ecx,5
+ movdqa xmm4,xmm3
+ add ebx,esi
+ paddd xmm3,xmm7
+ xor ebp,edx
+ pxor xmm0,xmm2
+ xor edx,edi
+ add ebx,ecx
+ add eax,DWORD [4+esp]
+ and ebp,edx
+ movdqa xmm2,xmm0
+ movdqa [48+esp],xmm3
+ xor edx,edi
+ ror ecx,7
+ mov esi,ebx
+ xor ebp,edx
+ rol ebx,5
+ pslld xmm0,2
+ add eax,ebp
+ xor esi,ecx
+ psrld xmm2,30
+ xor ecx,edx
+ add eax,ebx
+ add edi,DWORD [8+esp]
+ and esi,ecx
+ xor ecx,edx
+ ror ebx,7
+ por xmm0,xmm2
+ mov ebp,eax
+ xor esi,ecx
+ movdqa xmm2,[80+esp]
+ rol eax,5
+ add edi,esi
+ xor ebp,ebx
+ xor ebx,ecx
+ add edi,eax
+ pshufd xmm3,xmm7,238
+ add edx,DWORD [12+esp]
+ and ebp,ebx
+ xor ebx,ecx
+ ror eax,7
+ mov esi,edi
+ xor ebp,ebx
+ rol edi,5
+ add edx,ebp
+ xor esi,eax
+ xor eax,ebx
+ add edx,edi
+ add ecx,DWORD [16+esp]
+ pxor xmm1,xmm5
+ punpcklqdq xmm3,xmm0
+ and esi,eax
+ xor eax,ebx
+ ror edi,7
+ pxor xmm1,xmm2
+ movdqa [80+esp],xmm5
+ mov ebp,edx
+ xor esi,eax
+ rol edx,5
+ movdqa xmm5,xmm4
+ add ecx,esi
+ paddd xmm4,xmm0
+ xor ebp,edi
+ pxor xmm1,xmm3
+ xor edi,eax
+ add ecx,edx
+ add ebx,DWORD [20+esp]
+ and ebp,edi
+ movdqa xmm3,xmm1
+ movdqa [esp],xmm4
+ xor edi,eax
+ ror edx,7
+ mov esi,ecx
+ xor ebp,edi
+ rol ecx,5
+ pslld xmm1,2
+ add ebx,ebp
+ xor esi,edx
+ psrld xmm3,30
+ xor edx,edi
+ add ebx,ecx
+ add eax,DWORD [24+esp]
+ and esi,edx
+ xor edx,edi
+ ror ecx,7
+ por xmm1,xmm3
+ mov ebp,ebx
+ xor esi,edx
+ movdqa xmm3,[96+esp]
+ rol ebx,5
+ add eax,esi
+ xor ebp,ecx
+ xor ecx,edx
+ add eax,ebx
+ pshufd xmm4,xmm0,238
+ add edi,DWORD [28+esp]
+ and ebp,ecx
+ xor ecx,edx
+ ror ebx,7
+ mov esi,eax
+ xor ebp,ecx
+ rol eax,5
+ add edi,ebp
+ xor esi,ebx
+ xor ebx,ecx
+ add edi,eax
+ add edx,DWORD [32+esp]
+ pxor xmm2,xmm6
+ punpcklqdq xmm4,xmm1
+ and esi,ebx
+ xor ebx,ecx
+ ror eax,7
+ pxor xmm2,xmm3
+ movdqa [96+esp],xmm6
+ mov ebp,edi
+ xor esi,ebx
+ rol edi,5
+ movdqa xmm6,xmm5
+ add edx,esi
+ paddd xmm5,xmm1
+ xor ebp,eax
+ pxor xmm2,xmm4
+ xor eax,ebx
+ add edx,edi
+ add ecx,DWORD [36+esp]
+ and ebp,eax
+ movdqa xmm4,xmm2
+ movdqa [16+esp],xmm5
+ xor eax,ebx
+ ror edi,7
+ mov esi,edx
+ xor ebp,eax
+ rol edx,5
+ pslld xmm2,2
+ add ecx,ebp
+ xor esi,edi
+ psrld xmm4,30
+ xor edi,eax
+ add ecx,edx
+ add ebx,DWORD [40+esp]
+ and esi,edi
+ xor edi,eax
+ ror edx,7
+ por xmm2,xmm4
+ mov ebp,ecx
+ xor esi,edi
+ movdqa xmm4,[64+esp]
+ rol ecx,5
+ add ebx,esi
+ xor ebp,edx
+ xor edx,edi
+ add ebx,ecx
+ pshufd xmm5,xmm1,238
+ add eax,DWORD [44+esp]
+ and ebp,edx
+ xor edx,edi
+ ror ecx,7
+ mov esi,ebx
+ xor ebp,edx
+ rol ebx,5
+ add eax,ebp
+ xor esi,edx
+ add eax,ebx
+ add edi,DWORD [48+esp]
+ pxor xmm3,xmm7
+ punpcklqdq xmm5,xmm2
+ xor esi,ecx
+ mov ebp,eax
+ rol eax,5
+ pxor xmm3,xmm4
+ movdqa [64+esp],xmm7
+ add edi,esi
+ xor ebp,ecx
+ movdqa xmm7,xmm6
+ ror ebx,7
+ paddd xmm6,xmm2
+ add edi,eax
+ pxor xmm3,xmm5
+ add edx,DWORD [52+esp]
+ xor ebp,ebx
+ mov esi,edi
+ rol edi,5
+ movdqa xmm5,xmm3
+ movdqa [32+esp],xmm6
+ add edx,ebp
+ xor esi,ebx
+ ror eax,7
+ add edx,edi
+ pslld xmm3,2
+ add ecx,DWORD [56+esp]
+ xor esi,eax
+ psrld xmm5,30
+ mov ebp,edx
+ rol edx,5
+ add ecx,esi
+ xor ebp,eax
+ ror edi,7
+ add ecx,edx
+ por xmm3,xmm5
+ add ebx,DWORD [60+esp]
+ xor ebp,edi
+ mov esi,ecx
+ rol ecx,5
+ add ebx,ebp
+ xor esi,edi
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD [esp]
+ xor esi,edx
+ mov ebp,ebx
+ rol ebx,5
+ add eax,esi
+ xor ebp,edx
+ ror ecx,7
+ paddd xmm7,xmm3
+ add eax,ebx
+ add edi,DWORD [4+esp]
+ xor ebp,ecx
+ mov esi,eax
+ movdqa [48+esp],xmm7
+ rol eax,5
+ add edi,ebp
+ xor esi,ecx
+ ror ebx,7
+ add edi,eax
+ add edx,DWORD [8+esp]
+ xor esi,ebx
+ mov ebp,edi
+ rol edi,5
+ add edx,esi
+ xor ebp,ebx
+ ror eax,7
+ add edx,edi
+ add ecx,DWORD [12+esp]
+ xor ebp,eax
+ mov esi,edx
+ rol edx,5
+ add ecx,ebp
+ xor esi,eax
+ ror edi,7
+ add ecx,edx
+ mov ebp,DWORD [196+esp]
+ cmp ebp,DWORD [200+esp]
+ je NEAR L$003done
+ movdqa xmm7,[160+esp]
+ movdqa xmm6,[176+esp]
+ movdqu xmm0,[ebp]
+ movdqu xmm1,[16+ebp]
+ movdqu xmm2,[32+ebp]
+ movdqu xmm3,[48+ebp]
+ add ebp,64
+db 102,15,56,0,198
+ mov DWORD [196+esp],ebp
+ movdqa [96+esp],xmm7
+ add ebx,DWORD [16+esp]
+ xor esi,edi
+ mov ebp,ecx
+ rol ecx,5
+ add ebx,esi
+ xor ebp,edi
+ ror edx,7
+db 102,15,56,0,206
+ add ebx,ecx
+ add eax,DWORD [20+esp]
+ xor ebp,edx
+ mov esi,ebx
+ paddd xmm0,xmm7
+ rol ebx,5
+ add eax,ebp
+ xor esi,edx
+ ror ecx,7
+ movdqa [esp],xmm0
+ add eax,ebx
+ add edi,DWORD [24+esp]
+ xor esi,ecx
+ mov ebp,eax
+ psubd xmm0,xmm7
+ rol eax,5
+ add edi,esi
+ xor ebp,ecx
+ ror ebx,7
+ add edi,eax
+ add edx,DWORD [28+esp]
+ xor ebp,ebx
+ mov esi,edi
+ rol edi,5
+ add edx,ebp
+ xor esi,ebx
+ ror eax,7
+ add edx,edi
+ add ecx,DWORD [32+esp]
+ xor esi,eax
+ mov ebp,edx
+ rol edx,5
+ add ecx,esi
+ xor ebp,eax
+ ror edi,7
+db 102,15,56,0,214
+ add ecx,edx
+ add ebx,DWORD [36+esp]
+ xor ebp,edi
+ mov esi,ecx
+ paddd xmm1,xmm7
+ rol ecx,5
+ add ebx,ebp
+ xor esi,edi
+ ror edx,7
+ movdqa [16+esp],xmm1
+ add ebx,ecx
+ add eax,DWORD [40+esp]
+ xor esi,edx
+ mov ebp,ebx
+ psubd xmm1,xmm7
+ rol ebx,5
+ add eax,esi
+ xor ebp,edx
+ ror ecx,7
+ add eax,ebx
+ add edi,DWORD [44+esp]
+ xor ebp,ecx
+ mov esi,eax
+ rol eax,5
+ add edi,ebp
+ xor esi,ecx
+ ror ebx,7
+ add edi,eax
+ add edx,DWORD [48+esp]
+ xor esi,ebx
+ mov ebp,edi
+ rol edi,5
+ add edx,esi
+ xor ebp,ebx
+ ror eax,7
+db 102,15,56,0,222
+ add edx,edi
+ add ecx,DWORD [52+esp]
+ xor ebp,eax
+ mov esi,edx
+ paddd xmm2,xmm7
+ rol edx,5
+ add ecx,ebp
+ xor esi,eax
+ ror edi,7
+ movdqa [32+esp],xmm2
+ add ecx,edx
+ add ebx,DWORD [56+esp]
+ xor esi,edi
+ mov ebp,ecx
+ psubd xmm2,xmm7
+ rol ecx,5
+ add ebx,esi
+ xor ebp,edi
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD [60+esp]
+ xor ebp,edx
+ mov esi,ebx
+ rol ebx,5
+ add eax,ebp
+ ror ecx,7
+ add eax,ebx
+ mov ebp,DWORD [192+esp]
+ add eax,DWORD [ebp]
+ add esi,DWORD [4+ebp]
+ add ecx,DWORD [8+ebp]
+ mov DWORD [ebp],eax
+ add edx,DWORD [12+ebp]
+ mov DWORD [4+ebp],esi
+ add edi,DWORD [16+ebp]
+ mov DWORD [8+ebp],ecx
+ mov ebx,ecx
+ mov DWORD [12+ebp],edx
+ xor ebx,edx
+ mov DWORD [16+ebp],edi
+ mov ebp,esi
+ pshufd xmm4,xmm0,238
+ and esi,ebx
+ mov ebx,ebp
+ jmp NEAR L$002loop
+align 16
+L$003done:
+ add ebx,DWORD [16+esp]
+ xor esi,edi
+ mov ebp,ecx
+ rol ecx,5
+ add ebx,esi
+ xor ebp,edi
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD [20+esp]
+ xor ebp,edx
+ mov esi,ebx
+ rol ebx,5
+ add eax,ebp
+ xor esi,edx
+ ror ecx,7
+ add eax,ebx
+ add edi,DWORD [24+esp]
+ xor esi,ecx
+ mov ebp,eax
+ rol eax,5
+ add edi,esi
+ xor ebp,ecx
+ ror ebx,7
+ add edi,eax
+ add edx,DWORD [28+esp]
+ xor ebp,ebx
+ mov esi,edi
+ rol edi,5
+ add edx,ebp
+ xor esi,ebx
+ ror eax,7
+ add edx,edi
+ add ecx,DWORD [32+esp]
+ xor esi,eax
+ mov ebp,edx
+ rol edx,5
+ add ecx,esi
+ xor ebp,eax
+ ror edi,7
+ add ecx,edx
+ add ebx,DWORD [36+esp]
+ xor ebp,edi
+ mov esi,ecx
+ rol ecx,5
+ add ebx,ebp
+ xor esi,edi
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD [40+esp]
+ xor esi,edx
+ mov ebp,ebx
+ rol ebx,5
+ add eax,esi
+ xor ebp,edx
+ ror ecx,7
+ add eax,ebx
+ add edi,DWORD [44+esp]
+ xor ebp,ecx
+ mov esi,eax
+ rol eax,5
+ add edi,ebp
+ xor esi,ecx
+ ror ebx,7
+ add edi,eax
+ add edx,DWORD [48+esp]
+ xor esi,ebx
+ mov ebp,edi
+ rol edi,5
+ add edx,esi
+ xor ebp,ebx
+ ror eax,7
+ add edx,edi
+ add ecx,DWORD [52+esp]
+ xor ebp,eax
+ mov esi,edx
+ rol edx,5
+ add ecx,ebp
+ xor esi,eax
+ ror edi,7
+ add ecx,edx
+ add ebx,DWORD [56+esp]
+ xor esi,edi
+ mov ebp,ecx
+ rol ecx,5
+ add ebx,esi
+ xor ebp,edi
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD [60+esp]
+ xor ebp,edx
+ mov esi,ebx
+ rol ebx,5
+ add eax,ebp
+ ror ecx,7
+ add eax,ebx
+ mov ebp,DWORD [192+esp]
+ add eax,DWORD [ebp]
+ mov esp,DWORD [204+esp]
+ add esi,DWORD [4+ebp]
+ add ecx,DWORD [8+ebp]
+ mov DWORD [ebp],eax
+ add edx,DWORD [12+ebp]
+ mov DWORD [4+ebp],esi
+ add edi,DWORD [16+ebp]
+ mov DWORD [8+ebp],ecx
+ mov DWORD [12+ebp],edx
+ mov DWORD [16+ebp],edi
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _sha1_block_data_order_avx
+align 16
+_sha1_block_data_order_avx:
+L$_sha1_block_data_order_avx_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ call L$004pic_point
+L$004pic_point:
+ pop ebp
+ lea ebp,[(L$K_XX_XX-L$004pic_point)+ebp]
+ vzeroall
+ vmovdqa xmm7,[ebp]
+ vmovdqa xmm0,[16+ebp]
+ vmovdqa xmm1,[32+ebp]
+ vmovdqa xmm2,[48+ebp]
+ vmovdqa xmm6,[64+ebp]
+ mov edi,DWORD [20+esp]
+ mov ebp,DWORD [24+esp]
+ mov edx,DWORD [28+esp]
+ mov esi,esp
+ sub esp,208
+ and esp,-64
+ vmovdqa [112+esp],xmm0
+ vmovdqa [128+esp],xmm1
+ vmovdqa [144+esp],xmm2
+ shl edx,6
+ vmovdqa [160+esp],xmm7
+ add edx,ebp
+ vmovdqa [176+esp],xmm6
+ add ebp,64
+ mov DWORD [192+esp],edi
+ mov DWORD [196+esp],ebp
+ mov DWORD [200+esp],edx
+ mov DWORD [204+esp],esi
+ mov eax,DWORD [edi]
+ mov ebx,DWORD [4+edi]
+ mov ecx,DWORD [8+edi]
+ mov edx,DWORD [12+edi]
+ mov edi,DWORD [16+edi]
+ mov esi,ebx
+ vmovdqu xmm0,[ebp-64]
+ vmovdqu xmm1,[ebp-48]
+ vmovdqu xmm2,[ebp-32]
+ vmovdqu xmm3,[ebp-16]
+ vpshufb xmm0,xmm0,xmm6
+ vpshufb xmm1,xmm1,xmm6
+ vpshufb xmm2,xmm2,xmm6
+ vmovdqa [96+esp],xmm7
+ vpshufb xmm3,xmm3,xmm6
+ vpaddd xmm4,xmm0,xmm7
+ vpaddd xmm5,xmm1,xmm7
+ vpaddd xmm6,xmm2,xmm7
+ vmovdqa [esp],xmm4
+ mov ebp,ecx
+ vmovdqa [16+esp],xmm5
+ xor ebp,edx
+ vmovdqa [32+esp],xmm6
+ and esi,ebp
+ jmp NEAR L$005loop
+align 16
+L$005loop:
+ shrd ebx,ebx,2
+ xor esi,edx
+ vpalignr xmm4,xmm1,xmm0,8
+ mov ebp,eax
+ add edi,DWORD [esp]
+ vpaddd xmm7,xmm7,xmm3
+ vmovdqa [64+esp],xmm0
+ xor ebx,ecx
+ shld eax,eax,5
+ vpsrldq xmm6,xmm3,4
+ add edi,esi
+ and ebp,ebx
+ vpxor xmm4,xmm4,xmm0
+ xor ebx,ecx
+ add edi,eax
+ vpxor xmm6,xmm6,xmm2
+ shrd eax,eax,7
+ xor ebp,ecx
+ vmovdqa [48+esp],xmm7
+ mov esi,edi
+ add edx,DWORD [4+esp]
+ vpxor xmm4,xmm4,xmm6
+ xor eax,ebx
+ shld edi,edi,5
+ add edx,ebp
+ and esi,eax
+ vpsrld xmm6,xmm4,31
+ xor eax,ebx
+ add edx,edi
+ shrd edi,edi,7
+ xor esi,ebx
+ vpslldq xmm0,xmm4,12
+ vpaddd xmm4,xmm4,xmm4
+ mov ebp,edx
+ add ecx,DWORD [8+esp]
+ xor edi,eax
+ shld edx,edx,5
+ vpsrld xmm7,xmm0,30
+ vpor xmm4,xmm4,xmm6
+ add ecx,esi
+ and ebp,edi
+ xor edi,eax
+ add ecx,edx
+ vpslld xmm0,xmm0,2
+ shrd edx,edx,7
+ xor ebp,eax
+ vpxor xmm4,xmm4,xmm7
+ mov esi,ecx
+ add ebx,DWORD [12+esp]
+ xor edx,edi
+ shld ecx,ecx,5
+ vpxor xmm4,xmm4,xmm0
+ add ebx,ebp
+ and esi,edx
+ vmovdqa xmm0,[96+esp]
+ xor edx,edi
+ add ebx,ecx
+ shrd ecx,ecx,7
+ xor esi,edi
+ vpalignr xmm5,xmm2,xmm1,8
+ mov ebp,ebx
+ add eax,DWORD [16+esp]
+ vpaddd xmm0,xmm0,xmm4
+ vmovdqa [80+esp],xmm1
+ xor ecx,edx
+ shld ebx,ebx,5
+ vpsrldq xmm7,xmm4,4
+ add eax,esi
+ and ebp,ecx
+ vpxor xmm5,xmm5,xmm1
+ xor ecx,edx
+ add eax,ebx
+ vpxor xmm7,xmm7,xmm3
+ shrd ebx,ebx,7
+ xor ebp,edx
+ vmovdqa [esp],xmm0
+ mov esi,eax
+ add edi,DWORD [20+esp]
+ vpxor xmm5,xmm5,xmm7
+ xor ebx,ecx
+ shld eax,eax,5
+ add edi,ebp
+ and esi,ebx
+ vpsrld xmm7,xmm5,31
+ xor ebx,ecx
+ add edi,eax
+ shrd eax,eax,7
+ xor esi,ecx
+ vpslldq xmm1,xmm5,12
+ vpaddd xmm5,xmm5,xmm5
+ mov ebp,edi
+ add edx,DWORD [24+esp]
+ xor eax,ebx
+ shld edi,edi,5
+ vpsrld xmm0,xmm1,30
+ vpor xmm5,xmm5,xmm7
+ add edx,esi
+ and ebp,eax
+ xor eax,ebx
+ add edx,edi
+ vpslld xmm1,xmm1,2
+ shrd edi,edi,7
+ xor ebp,ebx
+ vpxor xmm5,xmm5,xmm0
+ mov esi,edx
+ add ecx,DWORD [28+esp]
+ xor edi,eax
+ shld edx,edx,5
+ vpxor xmm5,xmm5,xmm1
+ add ecx,ebp
+ and esi,edi
+ vmovdqa xmm1,[112+esp]
+ xor edi,eax
+ add ecx,edx
+ shrd edx,edx,7
+ xor esi,eax
+ vpalignr xmm6,xmm3,xmm2,8
+ mov ebp,ecx
+ add ebx,DWORD [32+esp]
+ vpaddd xmm1,xmm1,xmm5
+ vmovdqa [96+esp],xmm2
+ xor edx,edi
+ shld ecx,ecx,5
+ vpsrldq xmm0,xmm5,4
+ add ebx,esi
+ and ebp,edx
+ vpxor xmm6,xmm6,xmm2
+ xor edx,edi
+ add ebx,ecx
+ vpxor xmm0,xmm0,xmm4
+ shrd ecx,ecx,7
+ xor ebp,edi
+ vmovdqa [16+esp],xmm1
+ mov esi,ebx
+ add eax,DWORD [36+esp]
+ vpxor xmm6,xmm6,xmm0
+ xor ecx,edx
+ shld ebx,ebx,5
+ add eax,ebp
+ and esi,ecx
+ vpsrld xmm0,xmm6,31
+ xor ecx,edx
+ add eax,ebx
+ shrd ebx,ebx,7
+ xor esi,edx
+ vpslldq xmm2,xmm6,12
+ vpaddd xmm6,xmm6,xmm6
+ mov ebp,eax
+ add edi,DWORD [40+esp]
+ xor ebx,ecx
+ shld eax,eax,5
+ vpsrld xmm1,xmm2,30
+ vpor xmm6,xmm6,xmm0
+ add edi,esi
+ and ebp,ebx
+ xor ebx,ecx
+ add edi,eax
+ vpslld xmm2,xmm2,2
+ vmovdqa xmm0,[64+esp]
+ shrd eax,eax,7
+ xor ebp,ecx
+ vpxor xmm6,xmm6,xmm1
+ mov esi,edi
+ add edx,DWORD [44+esp]
+ xor eax,ebx
+ shld edi,edi,5
+ vpxor xmm6,xmm6,xmm2
+ add edx,ebp
+ and esi,eax
+ vmovdqa xmm2,[112+esp]
+ xor eax,ebx
+ add edx,edi
+ shrd edi,edi,7
+ xor esi,ebx
+ vpalignr xmm7,xmm4,xmm3,8
+ mov ebp,edx
+ add ecx,DWORD [48+esp]
+ vpaddd xmm2,xmm2,xmm6
+ vmovdqa [64+esp],xmm3
+ xor edi,eax
+ shld edx,edx,5
+ vpsrldq xmm1,xmm6,4
+ add ecx,esi
+ and ebp,edi
+ vpxor xmm7,xmm7,xmm3
+ xor edi,eax
+ add ecx,edx
+ vpxor xmm1,xmm1,xmm5
+ shrd edx,edx,7
+ xor ebp,eax
+ vmovdqa [32+esp],xmm2
+ mov esi,ecx
+ add ebx,DWORD [52+esp]
+ vpxor xmm7,xmm7,xmm1
+ xor edx,edi
+ shld ecx,ecx,5
+ add ebx,ebp
+ and esi,edx
+ vpsrld xmm1,xmm7,31
+ xor edx,edi
+ add ebx,ecx
+ shrd ecx,ecx,7
+ xor esi,edi
+ vpslldq xmm3,xmm7,12
+ vpaddd xmm7,xmm7,xmm7
+ mov ebp,ebx
+ add eax,DWORD [56+esp]
+ xor ecx,edx
+ shld ebx,ebx,5
+ vpsrld xmm2,xmm3,30
+ vpor xmm7,xmm7,xmm1
+ add eax,esi
+ and ebp,ecx
+ xor ecx,edx
+ add eax,ebx
+ vpslld xmm3,xmm3,2
+ vmovdqa xmm1,[80+esp]
+ shrd ebx,ebx,7
+ xor ebp,edx
+ vpxor xmm7,xmm7,xmm2
+ mov esi,eax
+ add edi,DWORD [60+esp]
+ xor ebx,ecx
+ shld eax,eax,5
+ vpxor xmm7,xmm7,xmm3
+ add edi,ebp
+ and esi,ebx
+ vmovdqa xmm3,[112+esp]
+ xor ebx,ecx
+ add edi,eax
+ vpalignr xmm2,xmm7,xmm6,8
+ vpxor xmm0,xmm0,xmm4
+ shrd eax,eax,7
+ xor esi,ecx
+ mov ebp,edi
+ add edx,DWORD [esp]
+ vpxor xmm0,xmm0,xmm1
+ vmovdqa [80+esp],xmm4
+ xor eax,ebx
+ shld edi,edi,5
+ vmovdqa xmm4,xmm3
+ vpaddd xmm3,xmm3,xmm7
+ add edx,esi
+ and ebp,eax
+ vpxor xmm0,xmm0,xmm2
+ xor eax,ebx
+ add edx,edi
+ shrd edi,edi,7
+ xor ebp,ebx
+ vpsrld xmm2,xmm0,30
+ vmovdqa [48+esp],xmm3
+ mov esi,edx
+ add ecx,DWORD [4+esp]
+ xor edi,eax
+ shld edx,edx,5
+ vpslld xmm0,xmm0,2
+ add ecx,ebp
+ and esi,edi
+ xor edi,eax
+ add ecx,edx
+ shrd edx,edx,7
+ xor esi,eax
+ mov ebp,ecx
+ add ebx,DWORD [8+esp]
+ vpor xmm0,xmm0,xmm2
+ xor edx,edi
+ shld ecx,ecx,5
+ vmovdqa xmm2,[96+esp]
+ add ebx,esi
+ and ebp,edx
+ xor edx,edi
+ add ebx,ecx
+ add eax,DWORD [12+esp]
+ xor ebp,edi
+ mov esi,ebx
+ shld ebx,ebx,5
+ add eax,ebp
+ xor esi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ vpalignr xmm3,xmm0,xmm7,8
+ vpxor xmm1,xmm1,xmm5
+ add edi,DWORD [16+esp]
+ xor esi,ecx
+ mov ebp,eax
+ shld eax,eax,5
+ vpxor xmm1,xmm1,xmm2
+ vmovdqa [96+esp],xmm5
+ add edi,esi
+ xor ebp,ecx
+ vmovdqa xmm5,xmm4
+ vpaddd xmm4,xmm4,xmm0
+ shrd ebx,ebx,7
+ add edi,eax
+ vpxor xmm1,xmm1,xmm3
+ add edx,DWORD [20+esp]
+ xor ebp,ebx
+ mov esi,edi
+ shld edi,edi,5
+ vpsrld xmm3,xmm1,30
+ vmovdqa [esp],xmm4
+ add edx,ebp
+ xor esi,ebx
+ shrd eax,eax,7
+ add edx,edi
+ vpslld xmm1,xmm1,2
+ add ecx,DWORD [24+esp]
+ xor esi,eax
+ mov ebp,edx
+ shld edx,edx,5
+ add ecx,esi
+ xor ebp,eax
+ shrd edi,edi,7
+ add ecx,edx
+ vpor xmm1,xmm1,xmm3
+ add ebx,DWORD [28+esp]
+ xor ebp,edi
+ vmovdqa xmm3,[64+esp]
+ mov esi,ecx
+ shld ecx,ecx,5
+ add ebx,ebp
+ xor esi,edi
+ shrd edx,edx,7
+ add ebx,ecx
+ vpalignr xmm4,xmm1,xmm0,8
+ vpxor xmm2,xmm2,xmm6
+ add eax,DWORD [32+esp]
+ xor esi,edx
+ mov ebp,ebx
+ shld ebx,ebx,5
+ vpxor xmm2,xmm2,xmm3
+ vmovdqa [64+esp],xmm6
+ add eax,esi
+ xor ebp,edx
+ vmovdqa xmm6,[128+esp]
+ vpaddd xmm5,xmm5,xmm1
+ shrd ecx,ecx,7
+ add eax,ebx
+ vpxor xmm2,xmm2,xmm4
+ add edi,DWORD [36+esp]
+ xor ebp,ecx
+ mov esi,eax
+ shld eax,eax,5
+ vpsrld xmm4,xmm2,30
+ vmovdqa [16+esp],xmm5
+ add edi,ebp
+ xor esi,ecx
+ shrd ebx,ebx,7
+ add edi,eax
+ vpslld xmm2,xmm2,2
+ add edx,DWORD [40+esp]
+ xor esi,ebx
+ mov ebp,edi
+ shld edi,edi,5
+ add edx,esi
+ xor ebp,ebx
+ shrd eax,eax,7
+ add edx,edi
+ vpor xmm2,xmm2,xmm4
+ add ecx,DWORD [44+esp]
+ xor ebp,eax
+ vmovdqa xmm4,[80+esp]
+ mov esi,edx
+ shld edx,edx,5
+ add ecx,ebp
+ xor esi,eax
+ shrd edi,edi,7
+ add ecx,edx
+ vpalignr xmm5,xmm2,xmm1,8
+ vpxor xmm3,xmm3,xmm7
+ add ebx,DWORD [48+esp]
+ xor esi,edi
+ mov ebp,ecx
+ shld ecx,ecx,5
+ vpxor xmm3,xmm3,xmm4
+ vmovdqa [80+esp],xmm7
+ add ebx,esi
+ xor ebp,edi
+ vmovdqa xmm7,xmm6
+ vpaddd xmm6,xmm6,xmm2
+ shrd edx,edx,7
+ add ebx,ecx
+ vpxor xmm3,xmm3,xmm5
+ add eax,DWORD [52+esp]
+ xor ebp,edx
+ mov esi,ebx
+ shld ebx,ebx,5
+ vpsrld xmm5,xmm3,30
+ vmovdqa [32+esp],xmm6
+ add eax,ebp
+ xor esi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ vpslld xmm3,xmm3,2
+ add edi,DWORD [56+esp]
+ xor esi,ecx
+ mov ebp,eax
+ shld eax,eax,5
+ add edi,esi
+ xor ebp,ecx
+ shrd ebx,ebx,7
+ add edi,eax
+ vpor xmm3,xmm3,xmm5
+ add edx,DWORD [60+esp]
+ xor ebp,ebx
+ vmovdqa xmm5,[96+esp]
+ mov esi,edi
+ shld edi,edi,5
+ add edx,ebp
+ xor esi,ebx
+ shrd eax,eax,7
+ add edx,edi
+ vpalignr xmm6,xmm3,xmm2,8
+ vpxor xmm4,xmm4,xmm0
+ add ecx,DWORD [esp]
+ xor esi,eax
+ mov ebp,edx
+ shld edx,edx,5
+ vpxor xmm4,xmm4,xmm5
+ vmovdqa [96+esp],xmm0
+ add ecx,esi
+ xor ebp,eax
+ vmovdqa xmm0,xmm7
+ vpaddd xmm7,xmm7,xmm3
+ shrd edi,edi,7
+ add ecx,edx
+ vpxor xmm4,xmm4,xmm6
+ add ebx,DWORD [4+esp]
+ xor ebp,edi
+ mov esi,ecx
+ shld ecx,ecx,5
+ vpsrld xmm6,xmm4,30
+ vmovdqa [48+esp],xmm7
+ add ebx,ebp
+ xor esi,edi
+ shrd edx,edx,7
+ add ebx,ecx
+ vpslld xmm4,xmm4,2
+ add eax,DWORD [8+esp]
+ xor esi,edx
+ mov ebp,ebx
+ shld ebx,ebx,5
+ add eax,esi
+ xor ebp,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ vpor xmm4,xmm4,xmm6
+ add edi,DWORD [12+esp]
+ xor ebp,ecx
+ vmovdqa xmm6,[64+esp]
+ mov esi,eax
+ shld eax,eax,5
+ add edi,ebp
+ xor esi,ecx
+ shrd ebx,ebx,7
+ add edi,eax
+ vpalignr xmm7,xmm4,xmm3,8
+ vpxor xmm5,xmm5,xmm1
+ add edx,DWORD [16+esp]
+ xor esi,ebx
+ mov ebp,edi
+ shld edi,edi,5
+ vpxor xmm5,xmm5,xmm6
+ vmovdqa [64+esp],xmm1
+ add edx,esi
+ xor ebp,ebx
+ vmovdqa xmm1,xmm0
+ vpaddd xmm0,xmm0,xmm4
+ shrd eax,eax,7
+ add edx,edi
+ vpxor xmm5,xmm5,xmm7
+ add ecx,DWORD [20+esp]
+ xor ebp,eax
+ mov esi,edx
+ shld edx,edx,5
+ vpsrld xmm7,xmm5,30
+ vmovdqa [esp],xmm0
+ add ecx,ebp
+ xor esi,eax
+ shrd edi,edi,7
+ add ecx,edx
+ vpslld xmm5,xmm5,2
+ add ebx,DWORD [24+esp]
+ xor esi,edi
+ mov ebp,ecx
+ shld ecx,ecx,5
+ add ebx,esi
+ xor ebp,edi
+ shrd edx,edx,7
+ add ebx,ecx
+ vpor xmm5,xmm5,xmm7
+ add eax,DWORD [28+esp]
+ vmovdqa xmm7,[80+esp]
+ shrd ecx,ecx,7
+ mov esi,ebx
+ xor ebp,edx
+ shld ebx,ebx,5
+ add eax,ebp
+ xor esi,ecx
+ xor ecx,edx
+ add eax,ebx
+ vpalignr xmm0,xmm5,xmm4,8
+ vpxor xmm6,xmm6,xmm2
+ add edi,DWORD [32+esp]
+ and esi,ecx
+ xor ecx,edx
+ shrd ebx,ebx,7
+ vpxor xmm6,xmm6,xmm7
+ vmovdqa [80+esp],xmm2
+ mov ebp,eax
+ xor esi,ecx
+ vmovdqa xmm2,xmm1
+ vpaddd xmm1,xmm1,xmm5
+ shld eax,eax,5
+ add edi,esi
+ vpxor xmm6,xmm6,xmm0
+ xor ebp,ebx
+ xor ebx,ecx
+ add edi,eax
+ add edx,DWORD [36+esp]
+ vpsrld xmm0,xmm6,30
+ vmovdqa [16+esp],xmm1
+ and ebp,ebx
+ xor ebx,ecx
+ shrd eax,eax,7
+ mov esi,edi
+ vpslld xmm6,xmm6,2
+ xor ebp,ebx
+ shld edi,edi,5
+ add edx,ebp
+ xor esi,eax
+ xor eax,ebx
+ add edx,edi
+ add ecx,DWORD [40+esp]
+ and esi,eax
+ vpor xmm6,xmm6,xmm0
+ xor eax,ebx
+ shrd edi,edi,7
+ vmovdqa xmm0,[96+esp]
+ mov ebp,edx
+ xor esi,eax
+ shld edx,edx,5
+ add ecx,esi
+ xor ebp,edi
+ xor edi,eax
+ add ecx,edx
+ add ebx,DWORD [44+esp]
+ and ebp,edi
+ xor edi,eax
+ shrd edx,edx,7
+ mov esi,ecx
+ xor ebp,edi
+ shld ecx,ecx,5
+ add ebx,ebp
+ xor esi,edx
+ xor edx,edi
+ add ebx,ecx
+ vpalignr xmm1,xmm6,xmm5,8
+ vpxor xmm7,xmm7,xmm3
+ add eax,DWORD [48+esp]
+ and esi,edx
+ xor edx,edi
+ shrd ecx,ecx,7
+ vpxor xmm7,xmm7,xmm0
+ vmovdqa [96+esp],xmm3
+ mov ebp,ebx
+ xor esi,edx
+ vmovdqa xmm3,[144+esp]
+ vpaddd xmm2,xmm2,xmm6
+ shld ebx,ebx,5
+ add eax,esi
+ vpxor xmm7,xmm7,xmm1
+ xor ebp,ecx
+ xor ecx,edx
+ add eax,ebx
+ add edi,DWORD [52+esp]
+ vpsrld xmm1,xmm7,30
+ vmovdqa [32+esp],xmm2
+ and ebp,ecx
+ xor ecx,edx
+ shrd ebx,ebx,7
+ mov esi,eax
+ vpslld xmm7,xmm7,2
+ xor ebp,ecx
+ shld eax,eax,5
+ add edi,ebp
+ xor esi,ebx
+ xor ebx,ecx
+ add edi,eax
+ add edx,DWORD [56+esp]
+ and esi,ebx
+ vpor xmm7,xmm7,xmm1
+ xor ebx,ecx
+ shrd eax,eax,7
+ vmovdqa xmm1,[64+esp]
+ mov ebp,edi
+ xor esi,ebx
+ shld edi,edi,5
+ add edx,esi
+ xor ebp,eax
+ xor eax,ebx
+ add edx,edi
+ add ecx,DWORD [60+esp]
+ and ebp,eax
+ xor eax,ebx
+ shrd edi,edi,7
+ mov esi,edx
+ xor ebp,eax
+ shld edx,edx,5
+ add ecx,ebp
+ xor esi,edi
+ xor edi,eax
+ add ecx,edx
+ vpalignr xmm2,xmm7,xmm6,8
+ vpxor xmm0,xmm0,xmm4
+ add ebx,DWORD [esp]
+ and esi,edi
+ xor edi,eax
+ shrd edx,edx,7
+ vpxor xmm0,xmm0,xmm1
+ vmovdqa [64+esp],xmm4
+ mov ebp,ecx
+ xor esi,edi
+ vmovdqa xmm4,xmm3
+ vpaddd xmm3,xmm3,xmm7
+ shld ecx,ecx,5
+ add ebx,esi
+ vpxor xmm0,xmm0,xmm2
+ xor ebp,edx
+ xor edx,edi
+ add ebx,ecx
+ add eax,DWORD [4+esp]
+ vpsrld xmm2,xmm0,30
+ vmovdqa [48+esp],xmm3
+ and ebp,edx
+ xor edx,edi
+ shrd ecx,ecx,7
+ mov esi,ebx
+ vpslld xmm0,xmm0,2
+ xor ebp,edx
+ shld ebx,ebx,5
+ add eax,ebp
+ xor esi,ecx
+ xor ecx,edx
+ add eax,ebx
+ add edi,DWORD [8+esp]
+ and esi,ecx
+ vpor xmm0,xmm0,xmm2
+ xor ecx,edx
+ shrd ebx,ebx,7
+ vmovdqa xmm2,[80+esp]
+ mov ebp,eax
+ xor esi,ecx
+ shld eax,eax,5
+ add edi,esi
+ xor ebp,ebx
+ xor ebx,ecx
+ add edi,eax
+ add edx,DWORD [12+esp]
+ and ebp,ebx
+ xor ebx,ecx
+ shrd eax,eax,7
+ mov esi,edi
+ xor ebp,ebx
+ shld edi,edi,5
+ add edx,ebp
+ xor esi,eax
+ xor eax,ebx
+ add edx,edi
+ vpalignr xmm3,xmm0,xmm7,8
+ vpxor xmm1,xmm1,xmm5
+ add ecx,DWORD [16+esp]
+ and esi,eax
+ xor eax,ebx
+ shrd edi,edi,7
+ vpxor xmm1,xmm1,xmm2
+ vmovdqa [80+esp],xmm5
+ mov ebp,edx
+ xor esi,eax
+ vmovdqa xmm5,xmm4
+ vpaddd xmm4,xmm4,xmm0
+ shld edx,edx,5
+ add ecx,esi
+ vpxor xmm1,xmm1,xmm3
+ xor ebp,edi
+ xor edi,eax
+ add ecx,edx
+ add ebx,DWORD [20+esp]
+ vpsrld xmm3,xmm1,30
+ vmovdqa [esp],xmm4
+ and ebp,edi
+ xor edi,eax
+ shrd edx,edx,7
+ mov esi,ecx
+ vpslld xmm1,xmm1,2
+ xor ebp,edi
+ shld ecx,ecx,5
+ add ebx,ebp
+ xor esi,edx
+ xor edx,edi
+ add ebx,ecx
+ add eax,DWORD [24+esp]
+ and esi,edx
+ vpor xmm1,xmm1,xmm3
+ xor edx,edi
+ shrd ecx,ecx,7
+ vmovdqa xmm3,[96+esp]
+ mov ebp,ebx
+ xor esi,edx
+ shld ebx,ebx,5
+ add eax,esi
+ xor ebp,ecx
+ xor ecx,edx
+ add eax,ebx
+ add edi,DWORD [28+esp]
+ and ebp,ecx
+ xor ecx,edx
+ shrd ebx,ebx,7
+ mov esi,eax
+ xor ebp,ecx
+ shld eax,eax,5
+ add edi,ebp
+ xor esi,ebx
+ xor ebx,ecx
+ add edi,eax
+ vpalignr xmm4,xmm1,xmm0,8
+ vpxor xmm2,xmm2,xmm6
+ add edx,DWORD [32+esp]
+ and esi,ebx
+ xor ebx,ecx
+ shrd eax,eax,7
+ vpxor xmm2,xmm2,xmm3
+ vmovdqa [96+esp],xmm6
+ mov ebp,edi
+ xor esi,ebx
+ vmovdqa xmm6,xmm5
+ vpaddd xmm5,xmm5,xmm1
+ shld edi,edi,5
+ add edx,esi
+ vpxor xmm2,xmm2,xmm4
+ xor ebp,eax
+ xor eax,ebx
+ add edx,edi
+ add ecx,DWORD [36+esp]
+ vpsrld xmm4,xmm2,30
+ vmovdqa [16+esp],xmm5
+ and ebp,eax
+ xor eax,ebx
+ shrd edi,edi,7
+ mov esi,edx
+ vpslld xmm2,xmm2,2
+ xor ebp,eax
+ shld edx,edx,5
+ add ecx,ebp
+ xor esi,edi
+ xor edi,eax
+ add ecx,edx
+ add ebx,DWORD [40+esp]
+ and esi,edi
+ vpor xmm2,xmm2,xmm4
+ xor edi,eax
+ shrd edx,edx,7
+ vmovdqa xmm4,[64+esp]
+ mov ebp,ecx
+ xor esi,edi
+ shld ecx,ecx,5
+ add ebx,esi
+ xor ebp,edx
+ xor edx,edi
+ add ebx,ecx
+ add eax,DWORD [44+esp]
+ and ebp,edx
+ xor edx,edi
+ shrd ecx,ecx,7
+ mov esi,ebx
+ xor ebp,edx
+ shld ebx,ebx,5
+ add eax,ebp
+ xor esi,edx
+ add eax,ebx
+ vpalignr xmm5,xmm2,xmm1,8
+ vpxor xmm3,xmm3,xmm7
+ add edi,DWORD [48+esp]
+ xor esi,ecx
+ mov ebp,eax
+ shld eax,eax,5
+ vpxor xmm3,xmm3,xmm4
+ vmovdqa [64+esp],xmm7
+ add edi,esi
+ xor ebp,ecx
+ vmovdqa xmm7,xmm6
+ vpaddd xmm6,xmm6,xmm2
+ shrd ebx,ebx,7
+ add edi,eax
+ vpxor xmm3,xmm3,xmm5
+ add edx,DWORD [52+esp]
+ xor ebp,ebx
+ mov esi,edi
+ shld edi,edi,5
+ vpsrld xmm5,xmm3,30
+ vmovdqa [32+esp],xmm6
+ add edx,ebp
+ xor esi,ebx
+ shrd eax,eax,7
+ add edx,edi
+ vpslld xmm3,xmm3,2
+ add ecx,DWORD [56+esp]
+ xor esi,eax
+ mov ebp,edx
+ shld edx,edx,5
+ add ecx,esi
+ xor ebp,eax
+ shrd edi,edi,7
+ add ecx,edx
+ vpor xmm3,xmm3,xmm5
+ add ebx,DWORD [60+esp]
+ xor ebp,edi
+ mov esi,ecx
+ shld ecx,ecx,5
+ add ebx,ebp
+ xor esi,edi
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD [esp]
+ vpaddd xmm7,xmm7,xmm3
+ xor esi,edx
+ mov ebp,ebx
+ shld ebx,ebx,5
+ add eax,esi
+ vmovdqa [48+esp],xmm7
+ xor ebp,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ add edi,DWORD [4+esp]
+ xor ebp,ecx
+ mov esi,eax
+ shld eax,eax,5
+ add edi,ebp
+ xor esi,ecx
+ shrd ebx,ebx,7
+ add edi,eax
+ add edx,DWORD [8+esp]
+ xor esi,ebx
+ mov ebp,edi
+ shld edi,edi,5
+ add edx,esi
+ xor ebp,ebx
+ shrd eax,eax,7
+ add edx,edi
+ add ecx,DWORD [12+esp]
+ xor ebp,eax
+ mov esi,edx
+ shld edx,edx,5
+ add ecx,ebp
+ xor esi,eax
+ shrd edi,edi,7
+ add ecx,edx
+ mov ebp,DWORD [196+esp]
+ cmp ebp,DWORD [200+esp]
+ je NEAR L$006done
+ vmovdqa xmm7,[160+esp]
+ vmovdqa xmm6,[176+esp]
+ vmovdqu xmm0,[ebp]
+ vmovdqu xmm1,[16+ebp]
+ vmovdqu xmm2,[32+ebp]
+ vmovdqu xmm3,[48+ebp]
+ add ebp,64
+ vpshufb xmm0,xmm0,xmm6
+ mov DWORD [196+esp],ebp
+ vmovdqa [96+esp],xmm7
+ add ebx,DWORD [16+esp]
+ xor esi,edi
+ vpshufb xmm1,xmm1,xmm6
+ mov ebp,ecx
+ shld ecx,ecx,5
+ vpaddd xmm4,xmm0,xmm7
+ add ebx,esi
+ xor ebp,edi
+ shrd edx,edx,7
+ add ebx,ecx
+ vmovdqa [esp],xmm4
+ add eax,DWORD [20+esp]
+ xor ebp,edx
+ mov esi,ebx
+ shld ebx,ebx,5
+ add eax,ebp
+ xor esi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ add edi,DWORD [24+esp]
+ xor esi,ecx
+ mov ebp,eax
+ shld eax,eax,5
+ add edi,esi
+ xor ebp,ecx
+ shrd ebx,ebx,7
+ add edi,eax
+ add edx,DWORD [28+esp]
+ xor ebp,ebx
+ mov esi,edi
+ shld edi,edi,5
+ add edx,ebp
+ xor esi,ebx
+ shrd eax,eax,7
+ add edx,edi
+ add ecx,DWORD [32+esp]
+ xor esi,eax
+ vpshufb xmm2,xmm2,xmm6
+ mov ebp,edx
+ shld edx,edx,5
+ vpaddd xmm5,xmm1,xmm7
+ add ecx,esi
+ xor ebp,eax
+ shrd edi,edi,7
+ add ecx,edx
+ vmovdqa [16+esp],xmm5
+ add ebx,DWORD [36+esp]
+ xor ebp,edi
+ mov esi,ecx
+ shld ecx,ecx,5
+ add ebx,ebp
+ xor esi,edi
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD [40+esp]
+ xor esi,edx
+ mov ebp,ebx
+ shld ebx,ebx,5
+ add eax,esi
+ xor ebp,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ add edi,DWORD [44+esp]
+ xor ebp,ecx
+ mov esi,eax
+ shld eax,eax,5
+ add edi,ebp
+ xor esi,ecx
+ shrd ebx,ebx,7
+ add edi,eax
+ add edx,DWORD [48+esp]
+ xor esi,ebx
+ vpshufb xmm3,xmm3,xmm6
+ mov ebp,edi
+ shld edi,edi,5
+ vpaddd xmm6,xmm2,xmm7
+ add edx,esi
+ xor ebp,ebx
+ shrd eax,eax,7
+ add edx,edi
+ vmovdqa [32+esp],xmm6
+ add ecx,DWORD [52+esp]
+ xor ebp,eax
+ mov esi,edx
+ shld edx,edx,5
+ add ecx,ebp
+ xor esi,eax
+ shrd edi,edi,7
+ add ecx,edx
+ add ebx,DWORD [56+esp]
+ xor esi,edi
+ mov ebp,ecx
+ shld ecx,ecx,5
+ add ebx,esi
+ xor ebp,edi
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD [60+esp]
+ xor ebp,edx
+ mov esi,ebx
+ shld ebx,ebx,5
+ add eax,ebp
+ shrd ecx,ecx,7
+ add eax,ebx
+ mov ebp,DWORD [192+esp]
+ add eax,DWORD [ebp]
+ add esi,DWORD [4+ebp]
+ add ecx,DWORD [8+ebp]
+ mov DWORD [ebp],eax
+ add edx,DWORD [12+ebp]
+ mov DWORD [4+ebp],esi
+ add edi,DWORD [16+ebp]
+ mov ebx,ecx
+ mov DWORD [8+ebp],ecx
+ xor ebx,edx
+ mov DWORD [12+ebp],edx
+ mov DWORD [16+ebp],edi
+ mov ebp,esi
+ and esi,ebx
+ mov ebx,ebp
+ jmp NEAR L$005loop
+align 16
+L$006done:
+ add ebx,DWORD [16+esp]
+ xor esi,edi
+ mov ebp,ecx
+ shld ecx,ecx,5
+ add ebx,esi
+ xor ebp,edi
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD [20+esp]
+ xor ebp,edx
+ mov esi,ebx
+ shld ebx,ebx,5
+ add eax,ebp
+ xor esi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ add edi,DWORD [24+esp]
+ xor esi,ecx
+ mov ebp,eax
+ shld eax,eax,5
+ add edi,esi
+ xor ebp,ecx
+ shrd ebx,ebx,7
+ add edi,eax
+ add edx,DWORD [28+esp]
+ xor ebp,ebx
+ mov esi,edi
+ shld edi,edi,5
+ add edx,ebp
+ xor esi,ebx
+ shrd eax,eax,7
+ add edx,edi
+ add ecx,DWORD [32+esp]
+ xor esi,eax
+ mov ebp,edx
+ shld edx,edx,5
+ add ecx,esi
+ xor ebp,eax
+ shrd edi,edi,7
+ add ecx,edx
+ add ebx,DWORD [36+esp]
+ xor ebp,edi
+ mov esi,ecx
+ shld ecx,ecx,5
+ add ebx,ebp
+ xor esi,edi
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD [40+esp]
+ xor esi,edx
+ mov ebp,ebx
+ shld ebx,ebx,5
+ add eax,esi
+ xor ebp,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ add edi,DWORD [44+esp]
+ xor ebp,ecx
+ mov esi,eax
+ shld eax,eax,5
+ add edi,ebp
+ xor esi,ecx
+ shrd ebx,ebx,7
+ add edi,eax
+ add edx,DWORD [48+esp]
+ xor esi,ebx
+ mov ebp,edi
+ shld edi,edi,5
+ add edx,esi
+ xor ebp,ebx
+ shrd eax,eax,7
+ add edx,edi
+ add ecx,DWORD [52+esp]
+ xor ebp,eax
+ mov esi,edx
+ shld edx,edx,5
+ add ecx,ebp
+ xor esi,eax
+ shrd edi,edi,7
+ add ecx,edx
+ add ebx,DWORD [56+esp]
+ xor esi,edi
+ mov ebp,ecx
+ shld ecx,ecx,5
+ add ebx,esi
+ xor ebp,edi
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD [60+esp]
+ xor ebp,edx
+ mov esi,ebx
+ shld ebx,ebx,5
+ add eax,ebp
+ shrd ecx,ecx,7
+ add eax,ebx
+ vzeroall
+ mov ebp,DWORD [192+esp]
+ add eax,DWORD [ebp]
+ mov esp,DWORD [204+esp]
+ add esi,DWORD [4+ebp]
+ add ecx,DWORD [8+ebp]
+ mov DWORD [ebp],eax
+ add edx,DWORD [12+ebp]
+ mov DWORD [4+ebp],esi
+ add edi,DWORD [16+ebp]
+ mov DWORD [8+ebp],ecx
+ mov DWORD [12+ebp],edx
+ mov DWORD [16+ebp],edi
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+align 64
+L$K_XX_XX:
+dd 1518500249,1518500249,1518500249,1518500249
+dd 1859775393,1859775393,1859775393,1859775393
+dd 2400959708,2400959708,2400959708,2400959708
+dd 3395469782,3395469782,3395469782,3395469782
+dd 66051,67438087,134810123,202182159
+db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+db 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
+db 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82
+db 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
+db 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/sha1-armv4-large-linux.S b/gen/bcm/sha1-armv4-large-linux.S
new file mode 100644
index 0000000..323e6e6
--- /dev/null
+++ b/gen/bcm/sha1-armv4-large-linux.S
@@ -0,0 +1,1481 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+.globl sha1_block_data_order_nohw
+.hidden sha1_block_data_order_nohw
+.type sha1_block_data_order_nohw,%function
+
+.align 5
+sha1_block_data_order_nohw:
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+ add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
+ ldmia r0,{r3,r4,r5,r6,r7}
+.Lloop:
+ ldr r8,.LK_00_19
+ mov r14,sp
+ sub sp,sp,#15*4
+ mov r5,r5,ror#30
+ mov r6,r6,ror#30
+ mov r7,r7,ror#30 @ [6]
+.L_00_15:
+#if __ARM_ARCH<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r7,r8,r7,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r5,r6 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r7,r8,r7,ror#2 @ E+=K_00_19
+ eor r10,r5,r6 @ F_xx_xx
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r4,r10,ror#2
+ add r7,r7,r9 @ E+=X[i]
+ eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r7,r7,r10 @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r6,r8,r6,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r4,r5 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r6,r8,r6,ror#2 @ E+=K_00_19
+ eor r10,r4,r5 @ F_xx_xx
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r3,r10,ror#2
+ add r6,r6,r9 @ E+=X[i]
+ eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r6,r6,r10 @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r5,r8,r5,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r3,r4 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r5,r8,r5,ror#2 @ E+=K_00_19
+ eor r10,r3,r4 @ F_xx_xx
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r7,r10,ror#2
+ add r5,r5,r9 @ E+=X[i]
+ eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r5,r5,r10 @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r4,r8,r4,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r7,r3 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r4,r8,r4,ror#2 @ E+=K_00_19
+ eor r10,r7,r3 @ F_xx_xx
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r6,r10,ror#2
+ add r4,r4,r9 @ E+=X[i]
+ eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r4,r4,r10 @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r3,r8,r3,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r6,r7 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r3,r8,r3,ror#2 @ E+=K_00_19
+ eor r10,r6,r7 @ F_xx_xx
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r5,r10,ror#2
+ add r3,r3,r9 @ E+=X[i]
+ eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r3,r3,r10 @ E+=F_00_19(B,C,D)
+#if defined(__thumb2__)
+ mov r12,sp
+ teq r14,r12
+#else
+ teq r14,sp
+#endif
+ bne .L_00_15 @ [((11+4)*5+2)*3]
+ sub sp,sp,#25*4
+#if __ARM_ARCH<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r7,r8,r7,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r5,r6 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r7,r8,r7,ror#2 @ E+=K_00_19
+ eor r10,r5,r6 @ F_xx_xx
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r4,r10,ror#2
+ add r7,r7,r9 @ E+=X[i]
+ eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r7,r7,r10 @ E+=F_00_19(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r6,r8,r6,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r4,r5 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r3,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r6,r6,r9 @ E+=X[i]
+ eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
+ add r6,r6,r10 @ E+=F_00_19(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r5,r8,r5,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r3,r4 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r7,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r5,r5,r9 @ E+=X[i]
+ eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
+ add r5,r5,r10 @ E+=F_00_19(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r4,r8,r4,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r7,r3 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r6,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r4,r4,r9 @ E+=X[i]
+ eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
+ add r4,r4,r10 @ E+=F_00_19(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r3,r8,r3,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r6,r7 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r5,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r3,r3,r9 @ E+=X[i]
+ eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
+ add r3,r3,r10 @ E+=F_00_19(B,C,D)
+
+ ldr r8,.LK_20_39 @ [+15+16*4]
+ cmn sp,#0 @ [+3], clear carry to denote 20_39
+.L_20_39_or_60_79:
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r7,r8,r7,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r5,r6 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r4,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r7,r7,r9 @ E+=X[i]
+ add r7,r7,r10 @ E+=F_20_39(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r6,r8,r6,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r4,r5 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r3,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r6,r6,r9 @ E+=X[i]
+ add r6,r6,r10 @ E+=F_20_39(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r5,r8,r5,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r3,r4 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r7,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r5,r5,r9 @ E+=X[i]
+ add r5,r5,r10 @ E+=F_20_39(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r4,r8,r4,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r7,r3 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r6,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r4,r4,r9 @ E+=X[i]
+ add r4,r4,r10 @ E+=F_20_39(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r3,r8,r3,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r6,r7 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r5,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r3,r3,r9 @ E+=X[i]
+ add r3,r3,r10 @ E+=F_20_39(B,C,D)
+#if defined(__thumb2__)
+ mov r12,sp
+ teq r14,r12
+#else
+ teq r14,sp @ preserve carry
+#endif
+ bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
+ bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
+
+ ldr r8,.LK_40_59
+ sub sp,sp,#20*4 @ [+2]
+.L_40_59:
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r7,r8,r7,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r5,r6 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r4,r10,ror#2 @ F_xx_xx
+ and r11,r5,r6 @ F_xx_xx
+ add r7,r7,r9 @ E+=X[i]
+ add r7,r7,r10 @ E+=F_40_59(B,C,D)
+ add r7,r7,r11,ror#2
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r6,r8,r6,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r4,r5 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r3,r10,ror#2 @ F_xx_xx
+ and r11,r4,r5 @ F_xx_xx
+ add r6,r6,r9 @ E+=X[i]
+ add r6,r6,r10 @ E+=F_40_59(B,C,D)
+ add r6,r6,r11,ror#2
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r5,r8,r5,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r3,r4 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r7,r10,ror#2 @ F_xx_xx
+ and r11,r3,r4 @ F_xx_xx
+ add r5,r5,r9 @ E+=X[i]
+ add r5,r5,r10 @ E+=F_40_59(B,C,D)
+ add r5,r5,r11,ror#2
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r4,r8,r4,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r7,r3 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r6,r10,ror#2 @ F_xx_xx
+ and r11,r7,r3 @ F_xx_xx
+ add r4,r4,r9 @ E+=X[i]
+ add r4,r4,r10 @ E+=F_40_59(B,C,D)
+ add r4,r4,r11,ror#2
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r3,r8,r3,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r6,r7 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r5,r10,ror#2 @ F_xx_xx
+ and r11,r6,r7 @ F_xx_xx
+ add r3,r3,r9 @ E+=X[i]
+ add r3,r3,r10 @ E+=F_40_59(B,C,D)
+ add r3,r3,r11,ror#2
+#if defined(__thumb2__)
+ mov r12,sp
+ teq r14,r12
+#else
+ teq r14,sp
+#endif
+ bne .L_40_59 @ [+((12+5)*5+2)*4]
+
+ ldr r8,.LK_60_79
+ sub sp,sp,#20*4
+ cmp sp,#0 @ set carry to denote 60_79
+ b .L_20_39_or_60_79 @ [+4], spare 300 bytes
+.L_done:
+ add sp,sp,#80*4 @ "deallocate" stack frame
+ ldmia r0,{r8,r9,r10,r11,r12}
+ add r3,r8,r3
+ add r4,r9,r4
+ add r5,r10,r5,ror#2
+ add r6,r11,r6,ror#2
+ add r7,r12,r7,ror#2
+ stmia r0,{r3,r4,r5,r6,r7}
+ teq r1,r2
+ bne .Lloop @ [+18], total 1307
+
+#if __ARM_ARCH>=5
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+#else
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
+
+.align 5
+.LK_00_19:.word 0x5a827999
+.LK_20_39:.word 0x6ed9eba1
+.LK_40_59:.word 0x8f1bbcdc
+.LK_60_79:.word 0xca62c1d6
+.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 5
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.globl sha1_block_data_order_neon
+.hidden sha1_block_data_order_neon
+.type sha1_block_data_order_neon,%function
+.align 4
+sha1_block_data_order_neon:
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+ add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
+ @ dmb @ errata #451034 on early Cortex A8
+ @ vstmdb sp!,{d8-d15} @ ABI specification says so
+ mov r14,sp
+ sub r12,sp,#64
+ adr r8,.LK_00_19
+ bic r12,r12,#15 @ align for 128-bit stores
+
+ ldmia r0,{r3,r4,r5,r6,r7} @ load context
+ mov sp,r12 @ alloca
+
+ vld1.8 {q0,q1},[r1]! @ handles unaligned
+ veor q15,q15,q15
+ vld1.8 {q2,q3},[r1]!
+ vld1.32 {d28[],d29[]},[r8,:32]! @ load K_00_19
+ vrev32.8 q0,q0 @ yes, even on
+ vrev32.8 q1,q1 @ big-endian...
+ vrev32.8 q2,q2
+ vadd.i32 q8,q0,q14
+ vrev32.8 q3,q3
+ vadd.i32 q9,q1,q14
+ vst1.32 {q8},[r12,:128]!
+ vadd.i32 q10,q2,q14
+ vst1.32 {q9},[r12,:128]!
+ vst1.32 {q10},[r12,:128]!
+ ldr r9,[sp] @ big RAW stall
+
+.Loop_neon:
+ vext.8 q8,q0,q1,#8
+ bic r10,r6,r4
+ add r7,r7,r9
+ and r11,r5,r4
+ vadd.i32 q13,q3,q14
+ ldr r9,[sp,#4]
+ add r7,r7,r3,ror#27
+ vext.8 q12,q3,q15,#4
+ eor r11,r11,r10
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ veor q8,q8,q0
+ bic r10,r5,r3
+ add r6,r6,r9
+ veor q12,q12,q2
+ and r11,r4,r3
+ ldr r9,[sp,#8]
+ veor q12,q12,q8
+ add r6,r6,r7,ror#27
+ eor r11,r11,r10
+ vst1.32 {q13},[r12,:128]!
+ sub r12,r12,#64
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ vext.8 q13,q15,q12,#4
+ bic r10,r4,r7
+ add r5,r5,r9
+ vadd.i32 q8,q12,q12
+ and r11,r3,r7
+ ldr r9,[sp,#12]
+ vsri.32 q8,q12,#31
+ add r5,r5,r6,ror#27
+ eor r11,r11,r10
+ mov r7,r7,ror#2
+ vshr.u32 q12,q13,#30
+ add r5,r5,r11
+ bic r10,r3,r6
+ vshl.u32 q13,q13,#2
+ add r4,r4,r9
+ and r11,r7,r6
+ veor q8,q8,q12
+ ldr r9,[sp,#16]
+ add r4,r4,r5,ror#27
+ veor q8,q8,q13
+ eor r11,r11,r10
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ vext.8 q9,q1,q2,#8
+ bic r10,r7,r5
+ add r3,r3,r9
+ and r11,r6,r5
+ vadd.i32 q13,q8,q14
+ ldr r9,[sp,#20]
+ vld1.32 {d28[],d29[]},[r8,:32]!
+ add r3,r3,r4,ror#27
+ vext.8 q12,q8,q15,#4
+ eor r11,r11,r10
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ veor q9,q9,q1
+ bic r10,r6,r4
+ add r7,r7,r9
+ veor q12,q12,q3
+ and r11,r5,r4
+ ldr r9,[sp,#24]
+ veor q12,q12,q9
+ add r7,r7,r3,ror#27
+ eor r11,r11,r10
+ vst1.32 {q13},[r12,:128]!
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vext.8 q13,q15,q12,#4
+ bic r10,r5,r3
+ add r6,r6,r9
+ vadd.i32 q9,q12,q12
+ and r11,r4,r3
+ ldr r9,[sp,#28]
+ vsri.32 q9,q12,#31
+ add r6,r6,r7,ror#27
+ eor r11,r11,r10
+ mov r3,r3,ror#2
+ vshr.u32 q12,q13,#30
+ add r6,r6,r11
+ bic r10,r4,r7
+ vshl.u32 q13,q13,#2
+ add r5,r5,r9
+ and r11,r3,r7
+ veor q9,q9,q12
+ ldr r9,[sp,#32]
+ add r5,r5,r6,ror#27
+ veor q9,q9,q13
+ eor r11,r11,r10
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ vext.8 q10,q2,q3,#8
+ bic r10,r3,r6
+ add r4,r4,r9
+ and r11,r7,r6
+ vadd.i32 q13,q9,q14
+ ldr r9,[sp,#36]
+ add r4,r4,r5,ror#27
+ vext.8 q12,q9,q15,#4
+ eor r11,r11,r10
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ veor q10,q10,q2
+ bic r10,r7,r5
+ add r3,r3,r9
+ veor q12,q12,q8
+ and r11,r6,r5
+ ldr r9,[sp,#40]
+ veor q12,q12,q10
+ add r3,r3,r4,ror#27
+ eor r11,r11,r10
+ vst1.32 {q13},[r12,:128]!
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ vext.8 q13,q15,q12,#4
+ bic r10,r6,r4
+ add r7,r7,r9
+ vadd.i32 q10,q12,q12
+ and r11,r5,r4
+ ldr r9,[sp,#44]
+ vsri.32 q10,q12,#31
+ add r7,r7,r3,ror#27
+ eor r11,r11,r10
+ mov r4,r4,ror#2
+ vshr.u32 q12,q13,#30
+ add r7,r7,r11
+ bic r10,r5,r3
+ vshl.u32 q13,q13,#2
+ add r6,r6,r9
+ and r11,r4,r3
+ veor q10,q10,q12
+ ldr r9,[sp,#48]
+ add r6,r6,r7,ror#27
+ veor q10,q10,q13
+ eor r11,r11,r10
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ vext.8 q11,q3,q8,#8
+ bic r10,r4,r7
+ add r5,r5,r9
+ and r11,r3,r7
+ vadd.i32 q13,q10,q14
+ ldr r9,[sp,#52]
+ add r5,r5,r6,ror#27
+ vext.8 q12,q10,q15,#4
+ eor r11,r11,r10
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ veor q11,q11,q3
+ bic r10,r3,r6
+ add r4,r4,r9
+ veor q12,q12,q9
+ and r11,r7,r6
+ ldr r9,[sp,#56]
+ veor q12,q12,q11
+ add r4,r4,r5,ror#27
+ eor r11,r11,r10
+ vst1.32 {q13},[r12,:128]!
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ vext.8 q13,q15,q12,#4
+ bic r10,r7,r5
+ add r3,r3,r9
+ vadd.i32 q11,q12,q12
+ and r11,r6,r5
+ ldr r9,[sp,#60]
+ vsri.32 q11,q12,#31
+ add r3,r3,r4,ror#27
+ eor r11,r11,r10
+ mov r5,r5,ror#2
+ vshr.u32 q12,q13,#30
+ add r3,r3,r11
+ bic r10,r6,r4
+ vshl.u32 q13,q13,#2
+ add r7,r7,r9
+ and r11,r5,r4
+ veor q11,q11,q12
+ ldr r9,[sp,#0]
+ add r7,r7,r3,ror#27
+ veor q11,q11,q13
+ eor r11,r11,r10
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vext.8 q12,q10,q11,#8
+ bic r10,r5,r3
+ add r6,r6,r9
+ and r11,r4,r3
+ veor q0,q0,q8
+ ldr r9,[sp,#4]
+ add r6,r6,r7,ror#27
+ veor q0,q0,q1
+ eor r11,r11,r10
+ mov r3,r3,ror#2
+ vadd.i32 q13,q11,q14
+ add r6,r6,r11
+ bic r10,r4,r7
+ veor q12,q12,q0
+ add r5,r5,r9
+ and r11,r3,r7
+ vshr.u32 q0,q12,#30
+ ldr r9,[sp,#8]
+ add r5,r5,r6,ror#27
+ vst1.32 {q13},[r12,:128]!
+ sub r12,r12,#64
+ eor r11,r11,r10
+ mov r7,r7,ror#2
+ vsli.32 q0,q12,#2
+ add r5,r5,r11
+ bic r10,r3,r6
+ add r4,r4,r9
+ and r11,r7,r6
+ ldr r9,[sp,#12]
+ add r4,r4,r5,ror#27
+ eor r11,r11,r10
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ bic r10,r7,r5
+ add r3,r3,r9
+ and r11,r6,r5
+ ldr r9,[sp,#16]
+ add r3,r3,r4,ror#27
+ eor r11,r11,r10
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ vext.8 q12,q11,q0,#8
+ eor r10,r4,r6
+ add r7,r7,r9
+ ldr r9,[sp,#20]
+ veor q1,q1,q9
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ veor q1,q1,q2
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vadd.i32 q13,q0,q14
+ eor r10,r3,r5
+ add r6,r6,r9
+ veor q12,q12,q1
+ ldr r9,[sp,#24]
+ eor r11,r10,r4
+ vshr.u32 q1,q12,#30
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ vst1.32 {q13},[r12,:128]!
+ add r6,r6,r11
+ eor r10,r7,r4
+ vsli.32 q1,q12,#2
+ add r5,r5,r9
+ ldr r9,[sp,#28]
+ eor r11,r10,r3
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ eor r10,r6,r3
+ add r4,r4,r9
+ ldr r9,[sp,#32]
+ eor r11,r10,r7
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ vext.8 q12,q0,q1,#8
+ eor r10,r5,r7
+ add r3,r3,r9
+ ldr r9,[sp,#36]
+ veor q2,q2,q10
+ eor r11,r10,r6
+ add r3,r3,r4,ror#27
+ veor q2,q2,q3
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ vadd.i32 q13,q1,q14
+ eor r10,r4,r6
+ vld1.32 {d28[],d29[]},[r8,:32]!
+ add r7,r7,r9
+ veor q12,q12,q2
+ ldr r9,[sp,#40]
+ eor r11,r10,r5
+ vshr.u32 q2,q12,#30
+ add r7,r7,r3,ror#27
+ mov r4,r4,ror#2
+ vst1.32 {q13},[r12,:128]!
+ add r7,r7,r11
+ eor r10,r3,r5
+ vsli.32 q2,q12,#2
+ add r6,r6,r9
+ ldr r9,[sp,#44]
+ eor r11,r10,r4
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ eor r10,r7,r4
+ add r5,r5,r9
+ ldr r9,[sp,#48]
+ eor r11,r10,r3
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ vext.8 q12,q1,q2,#8
+ eor r10,r6,r3
+ add r4,r4,r9
+ ldr r9,[sp,#52]
+ veor q3,q3,q11
+ eor r11,r10,r7
+ add r4,r4,r5,ror#27
+ veor q3,q3,q8
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ vadd.i32 q13,q2,q14
+ eor r10,r5,r7
+ add r3,r3,r9
+ veor q12,q12,q3
+ ldr r9,[sp,#56]
+ eor r11,r10,r6
+ vshr.u32 q3,q12,#30
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ vst1.32 {q13},[r12,:128]!
+ add r3,r3,r11
+ eor r10,r4,r6
+ vsli.32 q3,q12,#2
+ add r7,r7,r9
+ ldr r9,[sp,#60]
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ eor r10,r3,r5
+ add r6,r6,r9
+ ldr r9,[sp,#0]
+ eor r11,r10,r4
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ vext.8 q12,q2,q3,#8
+ eor r10,r7,r4
+ add r5,r5,r9
+ ldr r9,[sp,#4]
+ veor q8,q8,q0
+ eor r11,r10,r3
+ add r5,r5,r6,ror#27
+ veor q8,q8,q9
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ vadd.i32 q13,q3,q14
+ eor r10,r6,r3
+ add r4,r4,r9
+ veor q12,q12,q8
+ ldr r9,[sp,#8]
+ eor r11,r10,r7
+ vshr.u32 q8,q12,#30
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ vst1.32 {q13},[r12,:128]!
+ sub r12,r12,#64
+ add r4,r4,r11
+ eor r10,r5,r7
+ vsli.32 q8,q12,#2
+ add r3,r3,r9
+ ldr r9,[sp,#12]
+ eor r11,r10,r6
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ eor r10,r4,r6
+ add r7,r7,r9
+ ldr r9,[sp,#16]
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vext.8 q12,q3,q8,#8
+ eor r10,r3,r5
+ add r6,r6,r9
+ ldr r9,[sp,#20]
+ veor q9,q9,q1
+ eor r11,r10,r4
+ add r6,r6,r7,ror#27
+ veor q9,q9,q10
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ vadd.i32 q13,q8,q14
+ eor r10,r7,r4
+ add r5,r5,r9
+ veor q12,q12,q9
+ ldr r9,[sp,#24]
+ eor r11,r10,r3
+ vshr.u32 q9,q12,#30
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ vst1.32 {q13},[r12,:128]!
+ add r5,r5,r11
+ eor r10,r6,r3
+ vsli.32 q9,q12,#2
+ add r4,r4,r9
+ ldr r9,[sp,#28]
+ eor r11,r10,r7
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ eor r10,r5,r7
+ add r3,r3,r9
+ ldr r9,[sp,#32]
+ eor r11,r10,r6
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ vext.8 q12,q8,q9,#8
+ add r7,r7,r9
+ and r10,r5,r6
+ ldr r9,[sp,#36]
+ veor q10,q10,q2
+ add r7,r7,r3,ror#27
+ eor r11,r5,r6
+ veor q10,q10,q11
+ add r7,r7,r10
+ and r11,r11,r4
+ vadd.i32 q13,q9,q14
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ veor q12,q12,q10
+ add r6,r6,r9
+ and r10,r4,r5
+ vshr.u32 q10,q12,#30
+ ldr r9,[sp,#40]
+ add r6,r6,r7,ror#27
+ vst1.32 {q13},[r12,:128]!
+ eor r11,r4,r5
+ add r6,r6,r10
+ vsli.32 q10,q12,#2
+ and r11,r11,r3
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ add r5,r5,r9
+ and r10,r3,r4
+ ldr r9,[sp,#44]
+ add r5,r5,r6,ror#27
+ eor r11,r3,r4
+ add r5,r5,r10
+ and r11,r11,r7
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ add r4,r4,r9
+ and r10,r7,r3
+ ldr r9,[sp,#48]
+ add r4,r4,r5,ror#27
+ eor r11,r7,r3
+ add r4,r4,r10
+ and r11,r11,r6
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ vext.8 q12,q9,q10,#8
+ add r3,r3,r9
+ and r10,r6,r7
+ ldr r9,[sp,#52]
+ veor q11,q11,q3
+ add r3,r3,r4,ror#27
+ eor r11,r6,r7
+ veor q11,q11,q0
+ add r3,r3,r10
+ and r11,r11,r5
+ vadd.i32 q13,q10,q14
+ mov r5,r5,ror#2
+ vld1.32 {d28[],d29[]},[r8,:32]!
+ add r3,r3,r11
+ veor q12,q12,q11
+ add r7,r7,r9
+ and r10,r5,r6
+ vshr.u32 q11,q12,#30
+ ldr r9,[sp,#56]
+ add r7,r7,r3,ror#27
+ vst1.32 {q13},[r12,:128]!
+ eor r11,r5,r6
+ add r7,r7,r10
+ vsli.32 q11,q12,#2
+ and r11,r11,r4
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ add r6,r6,r9
+ and r10,r4,r5
+ ldr r9,[sp,#60]
+ add r6,r6,r7,ror#27
+ eor r11,r4,r5
+ add r6,r6,r10
+ and r11,r11,r3
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ add r5,r5,r9
+ and r10,r3,r4
+ ldr r9,[sp,#0]
+ add r5,r5,r6,ror#27
+ eor r11,r3,r4
+ add r5,r5,r10
+ and r11,r11,r7
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ vext.8 q12,q10,q11,#8
+ add r4,r4,r9
+ and r10,r7,r3
+ ldr r9,[sp,#4]
+ veor q0,q0,q8
+ add r4,r4,r5,ror#27
+ eor r11,r7,r3
+ veor q0,q0,q1
+ add r4,r4,r10
+ and r11,r11,r6
+ vadd.i32 q13,q11,q14
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ veor q12,q12,q0
+ add r3,r3,r9
+ and r10,r6,r7
+ vshr.u32 q0,q12,#30
+ ldr r9,[sp,#8]
+ add r3,r3,r4,ror#27
+ vst1.32 {q13},[r12,:128]!
+ sub r12,r12,#64
+ eor r11,r6,r7
+ add r3,r3,r10
+ vsli.32 q0,q12,#2
+ and r11,r11,r5
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ add r7,r7,r9
+ and r10,r5,r6
+ ldr r9,[sp,#12]
+ add r7,r7,r3,ror#27
+ eor r11,r5,r6
+ add r7,r7,r10
+ and r11,r11,r4
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ add r6,r6,r9
+ and r10,r4,r5
+ ldr r9,[sp,#16]
+ add r6,r6,r7,ror#27
+ eor r11,r4,r5
+ add r6,r6,r10
+ and r11,r11,r3
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ vext.8 q12,q11,q0,#8
+ add r5,r5,r9
+ and r10,r3,r4
+ ldr r9,[sp,#20]
+ veor q1,q1,q9
+ add r5,r5,r6,ror#27
+ eor r11,r3,r4
+ veor q1,q1,q2
+ add r5,r5,r10
+ and r11,r11,r7
+ vadd.i32 q13,q0,q14
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ veor q12,q12,q1
+ add r4,r4,r9
+ and r10,r7,r3
+ vshr.u32 q1,q12,#30
+ ldr r9,[sp,#24]
+ add r4,r4,r5,ror#27
+ vst1.32 {q13},[r12,:128]!
+ eor r11,r7,r3
+ add r4,r4,r10
+ vsli.32 q1,q12,#2
+ and r11,r11,r6
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ add r3,r3,r9
+ and r10,r6,r7
+ ldr r9,[sp,#28]
+ add r3,r3,r4,ror#27
+ eor r11,r6,r7
+ add r3,r3,r10
+ and r11,r11,r5
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ add r7,r7,r9
+ and r10,r5,r6
+ ldr r9,[sp,#32]
+ add r7,r7,r3,ror#27
+ eor r11,r5,r6
+ add r7,r7,r10
+ and r11,r11,r4
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vext.8 q12,q0,q1,#8
+ add r6,r6,r9
+ and r10,r4,r5
+ ldr r9,[sp,#36]
+ veor q2,q2,q10
+ add r6,r6,r7,ror#27
+ eor r11,r4,r5
+ veor q2,q2,q3
+ add r6,r6,r10
+ and r11,r11,r3
+ vadd.i32 q13,q1,q14
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ veor q12,q12,q2
+ add r5,r5,r9
+ and r10,r3,r4
+ vshr.u32 q2,q12,#30
+ ldr r9,[sp,#40]
+ add r5,r5,r6,ror#27
+ vst1.32 {q13},[r12,:128]!
+ eor r11,r3,r4
+ add r5,r5,r10
+ vsli.32 q2,q12,#2
+ and r11,r11,r7
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ add r4,r4,r9
+ and r10,r7,r3
+ ldr r9,[sp,#44]
+ add r4,r4,r5,ror#27
+ eor r11,r7,r3
+ add r4,r4,r10
+ and r11,r11,r6
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ add r3,r3,r9
+ and r10,r6,r7
+ ldr r9,[sp,#48]
+ add r3,r3,r4,ror#27
+ eor r11,r6,r7
+ add r3,r3,r10
+ and r11,r11,r5
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ vext.8 q12,q1,q2,#8
+ eor r10,r4,r6
+ add r7,r7,r9
+ ldr r9,[sp,#52]
+ veor q3,q3,q11
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ veor q3,q3,q8
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vadd.i32 q13,q2,q14
+ eor r10,r3,r5
+ add r6,r6,r9
+ veor q12,q12,q3
+ ldr r9,[sp,#56]
+ eor r11,r10,r4
+ vshr.u32 q3,q12,#30
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ vst1.32 {q13},[r12,:128]!
+ add r6,r6,r11
+ eor r10,r7,r4
+ vsli.32 q3,q12,#2
+ add r5,r5,r9
+ ldr r9,[sp,#60]
+ eor r11,r10,r3
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ eor r10,r6,r3
+ add r4,r4,r9
+ ldr r9,[sp,#0]
+ eor r11,r10,r7
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ vadd.i32 q13,q3,q14
+ eor r10,r5,r7
+ add r3,r3,r9
+ vst1.32 {q13},[r12,:128]!
+ sub r12,r12,#64
+ teq r1,r2
+ sub r8,r8,#16
+ it eq
+ subeq r1,r1,#64
+ vld1.8 {q0,q1},[r1]!
+ ldr r9,[sp,#4]
+ eor r11,r10,r6
+ vld1.8 {q2,q3},[r1]!
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ vld1.32 {d28[],d29[]},[r8,:32]!
+ add r3,r3,r11
+ eor r10,r4,r6
+ vrev32.8 q0,q0
+ add r7,r7,r9
+ ldr r9,[sp,#8]
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ eor r10,r3,r5
+ add r6,r6,r9
+ ldr r9,[sp,#12]
+ eor r11,r10,r4
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ eor r10,r7,r4
+ add r5,r5,r9
+ ldr r9,[sp,#16]
+ eor r11,r10,r3
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ vrev32.8 q1,q1
+ eor r10,r6,r3
+ add r4,r4,r9
+ vadd.i32 q8,q0,q14
+ ldr r9,[sp,#20]
+ eor r11,r10,r7
+ vst1.32 {q8},[r12,:128]!
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ eor r10,r5,r7
+ add r3,r3,r9
+ ldr r9,[sp,#24]
+ eor r11,r10,r6
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ eor r10,r4,r6
+ add r7,r7,r9
+ ldr r9,[sp,#28]
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ eor r10,r3,r5
+ add r6,r6,r9
+ ldr r9,[sp,#32]
+ eor r11,r10,r4
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ vrev32.8 q2,q2
+ eor r10,r7,r4
+ add r5,r5,r9
+ vadd.i32 q9,q1,q14
+ ldr r9,[sp,#36]
+ eor r11,r10,r3
+ vst1.32 {q9},[r12,:128]!
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ eor r10,r6,r3
+ add r4,r4,r9
+ ldr r9,[sp,#40]
+ eor r11,r10,r7
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ eor r10,r5,r7
+ add r3,r3,r9
+ ldr r9,[sp,#44]
+ eor r11,r10,r6
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ eor r10,r4,r6
+ add r7,r7,r9
+ ldr r9,[sp,#48]
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vrev32.8 q3,q3
+ eor r10,r3,r5
+ add r6,r6,r9
+ vadd.i32 q10,q2,q14
+ ldr r9,[sp,#52]
+ eor r11,r10,r4
+ vst1.32 {q10},[r12,:128]!
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ eor r10,r7,r4
+ add r5,r5,r9
+ ldr r9,[sp,#56]
+ eor r11,r10,r3
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ eor r10,r6,r3
+ add r4,r4,r9
+ ldr r9,[sp,#60]
+ eor r11,r10,r7
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ eor r10,r5,r7
+ add r3,r3,r9
+ eor r11,r10,r6
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ ldmia r0,{r9,r10,r11,r12} @ accumulate context
+ add r3,r3,r9
+ ldr r9,[r0,#16]
+ add r4,r4,r10
+ add r5,r5,r11
+ add r6,r6,r12
+ it eq
+ moveq sp,r14
+ add r7,r7,r9
+ it ne
+ ldrne r9,[sp]
+ stmia r0,{r3,r4,r5,r6,r7}
+ itt ne
+ addne r12,sp,#3*16
+ bne .Loop_neon
+
+ @ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+.size sha1_block_data_order_neon,.-sha1_block_data_order_neon
+#endif
+#if __ARM_MAX_ARCH__>=7
+
+# if defined(__thumb2__)
+# define INST(a,b,c,d) .byte c,d|0xf,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d|0x10
+# endif
+
+.globl sha1_block_data_order_hw
+.hidden sha1_block_data_order_hw
+.type sha1_block_data_order_hw,%function
+.align 5
+sha1_block_data_order_hw:
+ vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
+
+ veor q1,q1,q1
+ adr r3,.LK_00_19
+ vld1.32 {q0},[r0]!
+ vld1.32 {d2[0]},[r0]
+ sub r0,r0,#16
+ vld1.32 {d16[],d17[]},[r3,:32]!
+ vld1.32 {d18[],d19[]},[r3,:32]!
+ vld1.32 {d20[],d21[]},[r3,:32]!
+ vld1.32 {d22[],d23[]},[r3,:32]
+
+.Loop_v8:
+ vld1.8 {q4,q5},[r1]!
+ vld1.8 {q6,q7},[r1]!
+ vrev32.8 q4,q4
+ vrev32.8 q5,q5
+
+ vadd.i32 q12,q8,q4
+ vrev32.8 q6,q6
+ vmov q14,q0 @ offload
+ subs r2,r2,#1
+
+ vadd.i32 q13,q8,q5
+ vrev32.8 q7,q7
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 0
+ INST(0x68,0x0c,0x02,0xe2) @ sha1c q0,q1,q12
+ vadd.i32 q12,q8,q6
+ INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 1
+ INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13
+ vadd.i32 q13,q8,q7
+ INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7
+ INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 2
+ INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12
+ vadd.i32 q12,q8,q4
+ INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4
+ INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 3
+ INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13
+ vadd.i32 q13,q9,q5
+ INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5
+ INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 4
+ INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12
+ vadd.i32 q12,q9,q6
+ INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6
+ INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 5
+ INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
+ vadd.i32 q13,q9,q7
+ INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7
+ INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 6
+ INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12
+ vadd.i32 q12,q9,q4
+ INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4
+ INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 7
+ INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
+ vadd.i32 q13,q9,q5
+ INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5
+ INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 8
+ INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12
+ vadd.i32 q12,q10,q6
+ INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6
+ INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 9
+ INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
+ vadd.i32 q13,q10,q7
+ INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7
+ INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 10
+ INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12
+ vadd.i32 q12,q10,q4
+ INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4
+ INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 11
+ INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13
+ vadd.i32 q13,q10,q5
+ INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5
+ INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 12
+ INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12
+ vadd.i32 q12,q10,q6
+ INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6
+ INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 13
+ INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13
+ vadd.i32 q13,q11,q7
+ INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7
+ INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 14
+ INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12
+ vadd.i32 q12,q11,q4
+ INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4
+ INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 15
+ INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
+ vadd.i32 q13,q11,q5
+ INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5
+ INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 16
+ INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12
+ vadd.i32 q12,q11,q6
+ INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 17
+ INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
+ vadd.i32 q13,q11,q7
+
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 18
+ INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12
+
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 19
+ INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
+
+ vadd.i32 q1,q1,q2
+ vadd.i32 q0,q0,q14
+ bne .Loop_v8
+
+ vst1.32 {q0},[r0]!
+ vst1.32 {d2[0]},[r0]
+
+ vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+ bx lr @ bx lr
+.size sha1_block_data_order_hw,.-sha1_block_data_order_hw
+#endif
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/bcm/sha1-armv8-apple.S b/gen/bcm/sha1-armv8-apple.S
new file mode 100644
index 0000000..8f84774
--- /dev/null
+++ b/gen/bcm/sha1-armv8-apple.S
@@ -0,0 +1,1218 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl _sha1_block_data_order_nohw
+.private_extern _sha1_block_data_order_nohw
+
+.align 6
+_sha1_block_data_order_nohw:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+ ldp w20,w21,[x0]
+ ldp w22,w23,[x0,#8]
+ ldr w24,[x0,#16]
+
+Loop:
+ ldr x3,[x1],#64
+ movz w28,#0x7999
+ sub x2,x2,#1
+ movk w28,#0x5a82,lsl#16
+#ifdef __AARCH64EB__
+ ror x3,x3,#32
+#else
+ rev32 x3,x3
+#endif
+ add w24,w24,w28 // warm it up
+ add w24,w24,w3
+ lsr x4,x3,#32
+ ldr x5,[x1,#-56]
+ bic w25,w23,w21
+ and w26,w22,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ orr w25,w25,w26
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ add w23,w23,w4 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x5,x5,#32
+#else
+ rev32 x5,x5
+#endif
+ bic w25,w22,w20
+ and w26,w21,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ orr w25,w25,w26
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ add w22,w22,w5 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ lsr x6,x5,#32
+ ldr x7,[x1,#-48]
+ bic w25,w21,w24
+ and w26,w20,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ orr w25,w25,w26
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ add w21,w21,w6 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x7,x7,#32
+#else
+ rev32 x7,x7
+#endif
+ bic w25,w20,w23
+ and w26,w24,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ orr w25,w25,w26
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ add w20,w20,w7 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ lsr x8,x7,#32
+ ldr x9,[x1,#-40]
+ bic w25,w24,w22
+ and w26,w23,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ orr w25,w25,w26
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ add w24,w24,w8 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x9,x9,#32
+#else
+ rev32 x9,x9
+#endif
+ bic w25,w23,w21
+ and w26,w22,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ orr w25,w25,w26
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ add w23,w23,w9 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ lsr x10,x9,#32
+ ldr x11,[x1,#-32]
+ bic w25,w22,w20
+ and w26,w21,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ orr w25,w25,w26
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ add w22,w22,w10 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x11,x11,#32
+#else
+ rev32 x11,x11
+#endif
+ bic w25,w21,w24
+ and w26,w20,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ orr w25,w25,w26
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ add w21,w21,w11 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ lsr x12,x11,#32
+ ldr x13,[x1,#-24]
+ bic w25,w20,w23
+ and w26,w24,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ orr w25,w25,w26
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ add w20,w20,w12 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x13,x13,#32
+#else
+ rev32 x13,x13
+#endif
+ bic w25,w24,w22
+ and w26,w23,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ orr w25,w25,w26
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ add w24,w24,w13 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ lsr x14,x13,#32
+ ldr x15,[x1,#-16]
+ bic w25,w23,w21
+ and w26,w22,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ orr w25,w25,w26
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ add w23,w23,w14 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x15,x15,#32
+#else
+ rev32 x15,x15
+#endif
+ bic w25,w22,w20
+ and w26,w21,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ orr w25,w25,w26
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ add w22,w22,w15 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ lsr x16,x15,#32
+ ldr x17,[x1,#-8]
+ bic w25,w21,w24
+ and w26,w20,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ orr w25,w25,w26
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ add w21,w21,w16 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x17,x17,#32
+#else
+ rev32 x17,x17
+#endif
+ bic w25,w20,w23
+ and w26,w24,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ orr w25,w25,w26
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ add w20,w20,w17 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ lsr x19,x17,#32
+ eor w3,w3,w5
+ bic w25,w24,w22
+ and w26,w23,w22
+ ror w27,w21,#27
+ eor w3,w3,w11
+ add w24,w24,w28 // future e+=K
+ orr w25,w25,w26
+ add w20,w20,w27 // e+=rot(a,5)
+ eor w3,w3,w16
+ ror w22,w22,#2
+ add w24,w24,w19 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w3,w3,#31
+ eor w4,w4,w6
+ bic w25,w23,w21
+ and w26,w22,w21
+ ror w27,w20,#27
+ eor w4,w4,w12
+ add w23,w23,w28 // future e+=K
+ orr w25,w25,w26
+ add w24,w24,w27 // e+=rot(a,5)
+ eor w4,w4,w17
+ ror w21,w21,#2
+ add w23,w23,w3 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w4,w4,#31
+ eor w5,w5,w7
+ bic w25,w22,w20
+ and w26,w21,w20
+ ror w27,w24,#27
+ eor w5,w5,w13
+ add w22,w22,w28 // future e+=K
+ orr w25,w25,w26
+ add w23,w23,w27 // e+=rot(a,5)
+ eor w5,w5,w19
+ ror w20,w20,#2
+ add w22,w22,w4 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w5,w5,#31
+ eor w6,w6,w8
+ bic w25,w21,w24
+ and w26,w20,w24
+ ror w27,w23,#27
+ eor w6,w6,w14
+ add w21,w21,w28 // future e+=K
+ orr w25,w25,w26
+ add w22,w22,w27 // e+=rot(a,5)
+ eor w6,w6,w3
+ ror w24,w24,#2
+ add w21,w21,w5 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w6,w6,#31
+ eor w7,w7,w9
+ bic w25,w20,w23
+ and w26,w24,w23
+ ror w27,w22,#27
+ eor w7,w7,w15
+ add w20,w20,w28 // future e+=K
+ orr w25,w25,w26
+ add w21,w21,w27 // e+=rot(a,5)
+ eor w7,w7,w4
+ ror w23,w23,#2
+ add w20,w20,w6 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w7,w7,#31
+ movz w28,#0xeba1
+ movk w28,#0x6ed9,lsl#16
+ eor w8,w8,w10
+ bic w25,w24,w22
+ and w26,w23,w22
+ ror w27,w21,#27
+ eor w8,w8,w16
+ add w24,w24,w28 // future e+=K
+ orr w25,w25,w26
+ add w20,w20,w27 // e+=rot(a,5)
+ eor w8,w8,w5
+ ror w22,w22,#2
+ add w24,w24,w7 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w8,w8,#31
+ eor w9,w9,w11
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w9,w9,w17
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w9,w9,w6
+ add w23,w23,w8 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w9,w9,#31
+ eor w10,w10,w12
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w10,w10,w19
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w10,w10,w7
+ add w22,w22,w9 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w10,w10,#31
+ eor w11,w11,w13
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w11,w11,w3
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w11,w11,w8
+ add w21,w21,w10 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w11,w11,#31
+ eor w12,w12,w14
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w12,w12,w4
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w12,w12,w9
+ add w20,w20,w11 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w12,w12,#31
+ eor w13,w13,w15
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w13,w13,w5
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w13,w13,w10
+ add w24,w24,w12 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w13,w13,#31
+ eor w14,w14,w16
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w14,w14,w6
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w14,w14,w11
+ add w23,w23,w13 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w14,w14,#31
+ eor w15,w15,w17
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w15,w15,w7
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w15,w15,w12
+ add w22,w22,w14 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w15,w15,#31
+ eor w16,w16,w19
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w16,w16,w8
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w16,w16,w13
+ add w21,w21,w15 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w16,w16,#31
+ eor w17,w17,w3
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w17,w17,w9
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w17,w17,w14
+ add w20,w20,w16 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w17,w17,#31
+ eor w19,w19,w4
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w19,w19,w10
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w19,w19,w15
+ add w24,w24,w17 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w19,w19,#31
+ eor w3,w3,w5
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w3,w3,w11
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w3,w3,w16
+ add w23,w23,w19 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w3,w3,#31
+ eor w4,w4,w6
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w4,w4,w12
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w4,w4,w17
+ add w22,w22,w3 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w4,w4,#31
+ eor w5,w5,w7
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w5,w5,w13
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w5,w5,w19
+ add w21,w21,w4 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w5,w5,#31
+ eor w6,w6,w8
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w6,w6,w14
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w6,w6,w3
+ add w20,w20,w5 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w6,w6,#31
+ eor w7,w7,w9
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w7,w7,w15
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w7,w7,w4
+ add w24,w24,w6 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w7,w7,#31
+ eor w8,w8,w10
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w8,w8,w16
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w8,w8,w5
+ add w23,w23,w7 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w8,w8,#31
+ eor w9,w9,w11
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w9,w9,w17
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w9,w9,w6
+ add w22,w22,w8 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w9,w9,#31
+ eor w10,w10,w12
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w10,w10,w19
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w10,w10,w7
+ add w21,w21,w9 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w10,w10,#31
+ eor w11,w11,w13
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w11,w11,w3
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w11,w11,w8
+ add w20,w20,w10 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w11,w11,#31
+ movz w28,#0xbcdc
+ movk w28,#0x8f1b,lsl#16
+ eor w12,w12,w14
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w12,w12,w4
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w12,w12,w9
+ add w24,w24,w11 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w12,w12,#31
+ orr w25,w21,w22
+ and w26,w21,w22
+ eor w13,w13,w15
+ ror w27,w20,#27
+ and w25,w25,w23
+ add w23,w23,w28 // future e+=K
+ eor w13,w13,w5
+ add w24,w24,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w21,w21,#2
+ eor w13,w13,w10
+ add w23,w23,w12 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w13,w13,#31
+ orr w25,w20,w21
+ and w26,w20,w21
+ eor w14,w14,w16
+ ror w27,w24,#27
+ and w25,w25,w22
+ add w22,w22,w28 // future e+=K
+ eor w14,w14,w6
+ add w23,w23,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w20,w20,#2
+ eor w14,w14,w11
+ add w22,w22,w13 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w14,w14,#31
+ orr w25,w24,w20
+ and w26,w24,w20
+ eor w15,w15,w17
+ ror w27,w23,#27
+ and w25,w25,w21
+ add w21,w21,w28 // future e+=K
+ eor w15,w15,w7
+ add w22,w22,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w24,w24,#2
+ eor w15,w15,w12
+ add w21,w21,w14 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w15,w15,#31
+ orr w25,w23,w24
+ and w26,w23,w24
+ eor w16,w16,w19
+ ror w27,w22,#27
+ and w25,w25,w20
+ add w20,w20,w28 // future e+=K
+ eor w16,w16,w8
+ add w21,w21,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w23,w23,#2
+ eor w16,w16,w13
+ add w20,w20,w15 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w16,w16,#31
+ orr w25,w22,w23
+ and w26,w22,w23
+ eor w17,w17,w3
+ ror w27,w21,#27
+ and w25,w25,w24
+ add w24,w24,w28 // future e+=K
+ eor w17,w17,w9
+ add w20,w20,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w22,w22,#2
+ eor w17,w17,w14
+ add w24,w24,w16 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w17,w17,#31
+ orr w25,w21,w22
+ and w26,w21,w22
+ eor w19,w19,w4
+ ror w27,w20,#27
+ and w25,w25,w23
+ add w23,w23,w28 // future e+=K
+ eor w19,w19,w10
+ add w24,w24,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w21,w21,#2
+ eor w19,w19,w15
+ add w23,w23,w17 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w19,w19,#31
+ orr w25,w20,w21
+ and w26,w20,w21
+ eor w3,w3,w5
+ ror w27,w24,#27
+ and w25,w25,w22
+ add w22,w22,w28 // future e+=K
+ eor w3,w3,w11
+ add w23,w23,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w20,w20,#2
+ eor w3,w3,w16
+ add w22,w22,w19 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w3,w3,#31
+ orr w25,w24,w20
+ and w26,w24,w20
+ eor w4,w4,w6
+ ror w27,w23,#27
+ and w25,w25,w21
+ add w21,w21,w28 // future e+=K
+ eor w4,w4,w12
+ add w22,w22,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w24,w24,#2
+ eor w4,w4,w17
+ add w21,w21,w3 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w4,w4,#31
+ orr w25,w23,w24
+ and w26,w23,w24
+ eor w5,w5,w7
+ ror w27,w22,#27
+ and w25,w25,w20
+ add w20,w20,w28 // future e+=K
+ eor w5,w5,w13
+ add w21,w21,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w23,w23,#2
+ eor w5,w5,w19
+ add w20,w20,w4 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w5,w5,#31
+ orr w25,w22,w23
+ and w26,w22,w23
+ eor w6,w6,w8
+ ror w27,w21,#27
+ and w25,w25,w24
+ add w24,w24,w28 // future e+=K
+ eor w6,w6,w14
+ add w20,w20,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w22,w22,#2
+ eor w6,w6,w3
+ add w24,w24,w5 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w6,w6,#31
+ orr w25,w21,w22
+ and w26,w21,w22
+ eor w7,w7,w9
+ ror w27,w20,#27
+ and w25,w25,w23
+ add w23,w23,w28 // future e+=K
+ eor w7,w7,w15
+ add w24,w24,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w21,w21,#2
+ eor w7,w7,w4
+ add w23,w23,w6 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w7,w7,#31
+ orr w25,w20,w21
+ and w26,w20,w21
+ eor w8,w8,w10
+ ror w27,w24,#27
+ and w25,w25,w22
+ add w22,w22,w28 // future e+=K
+ eor w8,w8,w16
+ add w23,w23,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w20,w20,#2
+ eor w8,w8,w5
+ add w22,w22,w7 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w8,w8,#31
+ orr w25,w24,w20
+ and w26,w24,w20
+ eor w9,w9,w11
+ ror w27,w23,#27
+ and w25,w25,w21
+ add w21,w21,w28 // future e+=K
+ eor w9,w9,w17
+ add w22,w22,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w24,w24,#2
+ eor w9,w9,w6
+ add w21,w21,w8 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w9,w9,#31
+ orr w25,w23,w24
+ and w26,w23,w24
+ eor w10,w10,w12
+ ror w27,w22,#27
+ and w25,w25,w20
+ add w20,w20,w28 // future e+=K
+ eor w10,w10,w19
+ add w21,w21,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w23,w23,#2
+ eor w10,w10,w7
+ add w20,w20,w9 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w10,w10,#31
+ orr w25,w22,w23
+ and w26,w22,w23
+ eor w11,w11,w13
+ ror w27,w21,#27
+ and w25,w25,w24
+ add w24,w24,w28 // future e+=K
+ eor w11,w11,w3
+ add w20,w20,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w22,w22,#2
+ eor w11,w11,w8
+ add w24,w24,w10 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w11,w11,#31
+ orr w25,w21,w22
+ and w26,w21,w22
+ eor w12,w12,w14
+ ror w27,w20,#27
+ and w25,w25,w23
+ add w23,w23,w28 // future e+=K
+ eor w12,w12,w4
+ add w24,w24,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w21,w21,#2
+ eor w12,w12,w9
+ add w23,w23,w11 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w12,w12,#31
+ orr w25,w20,w21
+ and w26,w20,w21
+ eor w13,w13,w15
+ ror w27,w24,#27
+ and w25,w25,w22
+ add w22,w22,w28 // future e+=K
+ eor w13,w13,w5
+ add w23,w23,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w20,w20,#2
+ eor w13,w13,w10
+ add w22,w22,w12 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w13,w13,#31
+ orr w25,w24,w20
+ and w26,w24,w20
+ eor w14,w14,w16
+ ror w27,w23,#27
+ and w25,w25,w21
+ add w21,w21,w28 // future e+=K
+ eor w14,w14,w6
+ add w22,w22,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w24,w24,#2
+ eor w14,w14,w11
+ add w21,w21,w13 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w14,w14,#31
+ orr w25,w23,w24
+ and w26,w23,w24
+ eor w15,w15,w17
+ ror w27,w22,#27
+ and w25,w25,w20
+ add w20,w20,w28 // future e+=K
+ eor w15,w15,w7
+ add w21,w21,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w23,w23,#2
+ eor w15,w15,w12
+ add w20,w20,w14 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w15,w15,#31
+ movz w28,#0xc1d6
+ movk w28,#0xca62,lsl#16
+ orr w25,w22,w23
+ and w26,w22,w23
+ eor w16,w16,w19
+ ror w27,w21,#27
+ and w25,w25,w24
+ add w24,w24,w28 // future e+=K
+ eor w16,w16,w8
+ add w20,w20,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w22,w22,#2
+ eor w16,w16,w13
+ add w24,w24,w15 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w16,w16,#31
+ eor w17,w17,w3
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w17,w17,w9
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w17,w17,w14
+ add w23,w23,w16 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w17,w17,#31
+ eor w19,w19,w4
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w19,w19,w10
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w19,w19,w15
+ add w22,w22,w17 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w19,w19,#31
+ eor w3,w3,w5
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w3,w3,w11
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w3,w3,w16
+ add w21,w21,w19 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w3,w3,#31
+ eor w4,w4,w6
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w4,w4,w12
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w4,w4,w17
+ add w20,w20,w3 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w4,w4,#31
+ eor w5,w5,w7
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w5,w5,w13
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w5,w5,w19
+ add w24,w24,w4 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w5,w5,#31
+ eor w6,w6,w8
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w6,w6,w14
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w6,w6,w3
+ add w23,w23,w5 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w6,w6,#31
+ eor w7,w7,w9
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w7,w7,w15
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w7,w7,w4
+ add w22,w22,w6 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w7,w7,#31
+ eor w8,w8,w10
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w8,w8,w16
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w8,w8,w5
+ add w21,w21,w7 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w8,w8,#31
+ eor w9,w9,w11
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w9,w9,w17
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w9,w9,w6
+ add w20,w20,w8 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w9,w9,#31
+ eor w10,w10,w12
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w10,w10,w19
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w10,w10,w7
+ add w24,w24,w9 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w10,w10,#31
+ eor w11,w11,w13
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w11,w11,w3
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w11,w11,w8
+ add w23,w23,w10 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w11,w11,#31
+ eor w12,w12,w14
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w12,w12,w4
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w12,w12,w9
+ add w22,w22,w11 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w12,w12,#31
+ eor w13,w13,w15
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w13,w13,w5
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w13,w13,w10
+ add w21,w21,w12 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w13,w13,#31
+ eor w14,w14,w16
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w14,w14,w6
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w14,w14,w11
+ add w20,w20,w13 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w14,w14,#31
+ eor w15,w15,w17
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w15,w15,w7
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w15,w15,w12
+ add w24,w24,w14 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w15,w15,#31
+ eor w16,w16,w19
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w16,w16,w8
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w16,w16,w13
+ add w23,w23,w15 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w16,w16,#31
+ eor w17,w17,w3
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w17,w17,w9
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w17,w17,w14
+ add w22,w22,w16 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w17,w17,#31
+ eor w19,w19,w4
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w19,w19,w10
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w19,w19,w15
+ add w21,w21,w17 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w19,w19,#31
+ ldp w4,w5,[x0]
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ add w20,w20,w19 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ldp w6,w7,[x0,#8]
+ eor w25,w24,w22
+ ror w27,w21,#27
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ ldr w8,[x0,#16]
+ add w20,w20,w25 // e+=F(b,c,d)
+ add w21,w21,w5
+ add w22,w22,w6
+ add w20,w20,w4
+ add w23,w23,w7
+ add w24,w24,w8
+ stp w20,w21,[x0]
+ stp w22,w23,[x0,#8]
+ str w24,[x0,#16]
+ cbnz x2,Loop
+
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldp x25,x26,[sp,#64]
+ ldp x27,x28,[sp,#80]
+ ldr x29,[sp],#96
+ ret
+
+.globl _sha1_block_data_order_hw
+.private_extern _sha1_block_data_order_hw
+
+.align 6
+_sha1_block_data_order_hw:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ adrp x4,Lconst@PAGE
+ add x4,x4,Lconst@PAGEOFF
+ eor v1.16b,v1.16b,v1.16b
+ ld1 {v0.4s},[x0],#16
+ ld1 {v1.s}[0],[x0]
+ sub x0,x0,#16
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x4]
+
+Loop_hw:
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+ sub x2,x2,#1
+ rev32 v4.16b,v4.16b
+ rev32 v5.16b,v5.16b
+
+ add v20.4s,v16.4s,v4.4s
+ rev32 v6.16b,v6.16b
+ orr v22.16b,v0.16b,v0.16b // offload
+
+ add v21.4s,v16.4s,v5.4s
+ rev32 v7.16b,v7.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b
+.long 0x5e140020 //sha1c v0.16b,v1.16b,v20.4s // 0
+ add v20.4s,v16.4s,v6.4s
+.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 1
+.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s
+ add v21.4s,v16.4s,v7.4s
+.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b
+.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 2
+.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s
+ add v20.4s,v16.4s,v4.4s
+.long 0x5e281885 //sha1su1 v5.16b,v4.16b
+.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 3
+.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s
+ add v21.4s,v17.4s,v5.4s
+.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b
+.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 4
+.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s
+ add v20.4s,v17.4s,v6.4s
+.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b
+.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 5
+.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v17.4s,v7.4s
+.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b
+.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 6
+.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
+ add v20.4s,v17.4s,v4.4s
+.long 0x5e281885 //sha1su1 v5.16b,v4.16b
+.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 7
+.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v17.4s,v5.4s
+.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b
+.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 8
+.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
+ add v20.4s,v18.4s,v6.4s
+.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b
+.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 9
+.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v18.4s,v7.4s
+.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b
+.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 10
+.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s
+ add v20.4s,v18.4s,v4.4s
+.long 0x5e281885 //sha1su1 v5.16b,v4.16b
+.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 11
+.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s
+ add v21.4s,v18.4s,v5.4s
+.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b
+.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 12
+.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s
+ add v20.4s,v18.4s,v6.4s
+.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b
+.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 13
+.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s
+ add v21.4s,v19.4s,v7.4s
+.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b
+.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 14
+.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s
+ add v20.4s,v19.4s,v4.4s
+.long 0x5e281885 //sha1su1 v5.16b,v4.16b
+.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 15
+.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v19.4s,v5.4s
+.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b
+.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 16
+.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
+ add v20.4s,v19.4s,v6.4s
+.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 17
+.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v19.4s,v7.4s
+
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 18
+.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
+
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 19
+.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+
+ add v1.4s,v1.4s,v2.4s
+ add v0.4s,v0.4s,v22.4s
+
+ cbnz x2,Loop_hw
+
+ st1 {v0.4s},[x0],#16
+ st1 {v1.s}[0],[x0]
+
+ ldr x29,[sp],#16
+ ret
+
+.section __TEXT,__const
+.align 6
+Lconst:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79
+.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/bcm/sha1-armv8-linux.S b/gen/bcm/sha1-armv8-linux.S
new file mode 100644
index 0000000..f2df2dd
--- /dev/null
+++ b/gen/bcm/sha1-armv8-linux.S
@@ -0,0 +1,1218 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl sha1_block_data_order_nohw
+.hidden sha1_block_data_order_nohw
+.type sha1_block_data_order_nohw,%function
+.align 6
+sha1_block_data_order_nohw:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+ ldp w20,w21,[x0]
+ ldp w22,w23,[x0,#8]
+ ldr w24,[x0,#16]
+
+.Loop:
+ ldr x3,[x1],#64
+ movz w28,#0x7999
+ sub x2,x2,#1
+ movk w28,#0x5a82,lsl#16
+#ifdef __AARCH64EB__
+ ror x3,x3,#32
+#else
+ rev32 x3,x3
+#endif
+ add w24,w24,w28 // warm it up
+ add w24,w24,w3
+ lsr x4,x3,#32
+ ldr x5,[x1,#-56]
+ bic w25,w23,w21
+ and w26,w22,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ orr w25,w25,w26
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ add w23,w23,w4 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x5,x5,#32
+#else
+ rev32 x5,x5
+#endif
+ bic w25,w22,w20
+ and w26,w21,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ orr w25,w25,w26
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ add w22,w22,w5 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ lsr x6,x5,#32
+ ldr x7,[x1,#-48]
+ bic w25,w21,w24
+ and w26,w20,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ orr w25,w25,w26
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ add w21,w21,w6 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x7,x7,#32
+#else
+ rev32 x7,x7
+#endif
+ bic w25,w20,w23
+ and w26,w24,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ orr w25,w25,w26
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ add w20,w20,w7 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ lsr x8,x7,#32
+ ldr x9,[x1,#-40]
+ bic w25,w24,w22
+ and w26,w23,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ orr w25,w25,w26
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ add w24,w24,w8 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x9,x9,#32
+#else
+ rev32 x9,x9
+#endif
+ bic w25,w23,w21
+ and w26,w22,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ orr w25,w25,w26
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ add w23,w23,w9 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ lsr x10,x9,#32
+ ldr x11,[x1,#-32]
+ bic w25,w22,w20
+ and w26,w21,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ orr w25,w25,w26
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ add w22,w22,w10 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x11,x11,#32
+#else
+ rev32 x11,x11
+#endif
+ bic w25,w21,w24
+ and w26,w20,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ orr w25,w25,w26
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ add w21,w21,w11 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ lsr x12,x11,#32
+ ldr x13,[x1,#-24]
+ bic w25,w20,w23
+ and w26,w24,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ orr w25,w25,w26
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ add w20,w20,w12 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x13,x13,#32
+#else
+ rev32 x13,x13
+#endif
+ bic w25,w24,w22
+ and w26,w23,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ orr w25,w25,w26
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ add w24,w24,w13 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ lsr x14,x13,#32
+ ldr x15,[x1,#-16]
+ bic w25,w23,w21
+ and w26,w22,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ orr w25,w25,w26
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ add w23,w23,w14 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x15,x15,#32
+#else
+ rev32 x15,x15
+#endif
+ bic w25,w22,w20
+ and w26,w21,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ orr w25,w25,w26
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ add w22,w22,w15 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ lsr x16,x15,#32
+ ldr x17,[x1,#-8]
+ bic w25,w21,w24
+ and w26,w20,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ orr w25,w25,w26
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ add w21,w21,w16 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x17,x17,#32
+#else
+ rev32 x17,x17
+#endif
+ bic w25,w20,w23
+ and w26,w24,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ orr w25,w25,w26
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ add w20,w20,w17 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ lsr x19,x17,#32
+ eor w3,w3,w5
+ bic w25,w24,w22
+ and w26,w23,w22
+ ror w27,w21,#27
+ eor w3,w3,w11
+ add w24,w24,w28 // future e+=K
+ orr w25,w25,w26
+ add w20,w20,w27 // e+=rot(a,5)
+ eor w3,w3,w16
+ ror w22,w22,#2
+ add w24,w24,w19 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w3,w3,#31
+ eor w4,w4,w6
+ bic w25,w23,w21
+ and w26,w22,w21
+ ror w27,w20,#27
+ eor w4,w4,w12
+ add w23,w23,w28 // future e+=K
+ orr w25,w25,w26
+ add w24,w24,w27 // e+=rot(a,5)
+ eor w4,w4,w17
+ ror w21,w21,#2
+ add w23,w23,w3 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w4,w4,#31
+ eor w5,w5,w7
+ bic w25,w22,w20
+ and w26,w21,w20
+ ror w27,w24,#27
+ eor w5,w5,w13
+ add w22,w22,w28 // future e+=K
+ orr w25,w25,w26
+ add w23,w23,w27 // e+=rot(a,5)
+ eor w5,w5,w19
+ ror w20,w20,#2
+ add w22,w22,w4 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w5,w5,#31
+ eor w6,w6,w8
+ bic w25,w21,w24
+ and w26,w20,w24
+ ror w27,w23,#27
+ eor w6,w6,w14
+ add w21,w21,w28 // future e+=K
+ orr w25,w25,w26
+ add w22,w22,w27 // e+=rot(a,5)
+ eor w6,w6,w3
+ ror w24,w24,#2
+ add w21,w21,w5 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w6,w6,#31
+ eor w7,w7,w9
+ bic w25,w20,w23
+ and w26,w24,w23
+ ror w27,w22,#27
+ eor w7,w7,w15
+ add w20,w20,w28 // future e+=K
+ orr w25,w25,w26
+ add w21,w21,w27 // e+=rot(a,5)
+ eor w7,w7,w4
+ ror w23,w23,#2
+ add w20,w20,w6 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w7,w7,#31
+ movz w28,#0xeba1
+ movk w28,#0x6ed9,lsl#16
+ eor w8,w8,w10
+ bic w25,w24,w22
+ and w26,w23,w22
+ ror w27,w21,#27
+ eor w8,w8,w16
+ add w24,w24,w28 // future e+=K
+ orr w25,w25,w26
+ add w20,w20,w27 // e+=rot(a,5)
+ eor w8,w8,w5
+ ror w22,w22,#2
+ add w24,w24,w7 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w8,w8,#31
+ eor w9,w9,w11
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w9,w9,w17
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w9,w9,w6
+ add w23,w23,w8 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w9,w9,#31
+ eor w10,w10,w12
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w10,w10,w19
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w10,w10,w7
+ add w22,w22,w9 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w10,w10,#31
+ eor w11,w11,w13
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w11,w11,w3
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w11,w11,w8
+ add w21,w21,w10 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w11,w11,#31
+ eor w12,w12,w14
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w12,w12,w4
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w12,w12,w9
+ add w20,w20,w11 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w12,w12,#31
+ eor w13,w13,w15
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w13,w13,w5
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w13,w13,w10
+ add w24,w24,w12 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w13,w13,#31
+ eor w14,w14,w16
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w14,w14,w6
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w14,w14,w11
+ add w23,w23,w13 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w14,w14,#31
+ eor w15,w15,w17
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w15,w15,w7
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w15,w15,w12
+ add w22,w22,w14 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w15,w15,#31
+ eor w16,w16,w19
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w16,w16,w8
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w16,w16,w13
+ add w21,w21,w15 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w16,w16,#31
+ eor w17,w17,w3
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w17,w17,w9
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w17,w17,w14
+ add w20,w20,w16 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w17,w17,#31
+ eor w19,w19,w4
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w19,w19,w10
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w19,w19,w15
+ add w24,w24,w17 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w19,w19,#31
+ eor w3,w3,w5
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w3,w3,w11
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w3,w3,w16
+ add w23,w23,w19 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w3,w3,#31
+ eor w4,w4,w6
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w4,w4,w12
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w4,w4,w17
+ add w22,w22,w3 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w4,w4,#31
+ eor w5,w5,w7
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w5,w5,w13
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w5,w5,w19
+ add w21,w21,w4 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w5,w5,#31
+ eor w6,w6,w8
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w6,w6,w14
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w6,w6,w3
+ add w20,w20,w5 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w6,w6,#31
+ eor w7,w7,w9
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w7,w7,w15
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w7,w7,w4
+ add w24,w24,w6 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w7,w7,#31
+ eor w8,w8,w10
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w8,w8,w16
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w8,w8,w5
+ add w23,w23,w7 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w8,w8,#31
+ eor w9,w9,w11
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w9,w9,w17
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w9,w9,w6
+ add w22,w22,w8 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w9,w9,#31
+ eor w10,w10,w12
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w10,w10,w19
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w10,w10,w7
+ add w21,w21,w9 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w10,w10,#31
+ eor w11,w11,w13
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w11,w11,w3
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w11,w11,w8
+ add w20,w20,w10 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w11,w11,#31
+ movz w28,#0xbcdc
+ movk w28,#0x8f1b,lsl#16
+ eor w12,w12,w14
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w12,w12,w4
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w12,w12,w9
+ add w24,w24,w11 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w12,w12,#31
+ orr w25,w21,w22
+ and w26,w21,w22
+ eor w13,w13,w15
+ ror w27,w20,#27
+ and w25,w25,w23
+ add w23,w23,w28 // future e+=K
+ eor w13,w13,w5
+ add w24,w24,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w21,w21,#2
+ eor w13,w13,w10
+ add w23,w23,w12 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w13,w13,#31
+ orr w25,w20,w21
+ and w26,w20,w21
+ eor w14,w14,w16
+ ror w27,w24,#27
+ and w25,w25,w22
+ add w22,w22,w28 // future e+=K
+ eor w14,w14,w6
+ add w23,w23,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w20,w20,#2
+ eor w14,w14,w11
+ add w22,w22,w13 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w14,w14,#31
+ orr w25,w24,w20
+ and w26,w24,w20
+ eor w15,w15,w17
+ ror w27,w23,#27
+ and w25,w25,w21
+ add w21,w21,w28 // future e+=K
+ eor w15,w15,w7
+ add w22,w22,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w24,w24,#2
+ eor w15,w15,w12
+ add w21,w21,w14 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w15,w15,#31
+ orr w25,w23,w24
+ and w26,w23,w24
+ eor w16,w16,w19
+ ror w27,w22,#27
+ and w25,w25,w20
+ add w20,w20,w28 // future e+=K
+ eor w16,w16,w8
+ add w21,w21,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w23,w23,#2
+ eor w16,w16,w13
+ add w20,w20,w15 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w16,w16,#31
+ orr w25,w22,w23
+ and w26,w22,w23
+ eor w17,w17,w3
+ ror w27,w21,#27
+ and w25,w25,w24
+ add w24,w24,w28 // future e+=K
+ eor w17,w17,w9
+ add w20,w20,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w22,w22,#2
+ eor w17,w17,w14
+ add w24,w24,w16 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w17,w17,#31
+ orr w25,w21,w22
+ and w26,w21,w22
+ eor w19,w19,w4
+ ror w27,w20,#27
+ and w25,w25,w23
+ add w23,w23,w28 // future e+=K
+ eor w19,w19,w10
+ add w24,w24,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w21,w21,#2
+ eor w19,w19,w15
+ add w23,w23,w17 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w19,w19,#31
+ orr w25,w20,w21
+ and w26,w20,w21
+ eor w3,w3,w5
+ ror w27,w24,#27
+ and w25,w25,w22
+ add w22,w22,w28 // future e+=K
+ eor w3,w3,w11
+ add w23,w23,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w20,w20,#2
+ eor w3,w3,w16
+ add w22,w22,w19 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w3,w3,#31
+ orr w25,w24,w20
+ and w26,w24,w20
+ eor w4,w4,w6
+ ror w27,w23,#27
+ and w25,w25,w21
+ add w21,w21,w28 // future e+=K
+ eor w4,w4,w12
+ add w22,w22,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w24,w24,#2
+ eor w4,w4,w17
+ add w21,w21,w3 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w4,w4,#31
+ orr w25,w23,w24
+ and w26,w23,w24
+ eor w5,w5,w7
+ ror w27,w22,#27
+ and w25,w25,w20
+ add w20,w20,w28 // future e+=K
+ eor w5,w5,w13
+ add w21,w21,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w23,w23,#2
+ eor w5,w5,w19
+ add w20,w20,w4 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w5,w5,#31
+ orr w25,w22,w23
+ and w26,w22,w23
+ eor w6,w6,w8
+ ror w27,w21,#27
+ and w25,w25,w24
+ add w24,w24,w28 // future e+=K
+ eor w6,w6,w14
+ add w20,w20,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w22,w22,#2
+ eor w6,w6,w3
+ add w24,w24,w5 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w6,w6,#31
+ orr w25,w21,w22
+ and w26,w21,w22
+ eor w7,w7,w9
+ ror w27,w20,#27
+ and w25,w25,w23
+ add w23,w23,w28 // future e+=K
+ eor w7,w7,w15
+ add w24,w24,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w21,w21,#2
+ eor w7,w7,w4
+ add w23,w23,w6 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w7,w7,#31
+ orr w25,w20,w21
+ and w26,w20,w21
+ eor w8,w8,w10
+ ror w27,w24,#27
+ and w25,w25,w22
+ add w22,w22,w28 // future e+=K
+ eor w8,w8,w16
+ add w23,w23,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w20,w20,#2
+ eor w8,w8,w5
+ add w22,w22,w7 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w8,w8,#31
+ orr w25,w24,w20
+ and w26,w24,w20
+ eor w9,w9,w11
+ ror w27,w23,#27
+ and w25,w25,w21
+ add w21,w21,w28 // future e+=K
+ eor w9,w9,w17
+ add w22,w22,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w24,w24,#2
+ eor w9,w9,w6
+ add w21,w21,w8 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w9,w9,#31
+ orr w25,w23,w24
+ and w26,w23,w24
+ eor w10,w10,w12
+ ror w27,w22,#27
+ and w25,w25,w20
+ add w20,w20,w28 // future e+=K
+ eor w10,w10,w19
+ add w21,w21,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w23,w23,#2
+ eor w10,w10,w7
+ add w20,w20,w9 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w10,w10,#31
+ orr w25,w22,w23
+ and w26,w22,w23
+ eor w11,w11,w13
+ ror w27,w21,#27
+ and w25,w25,w24
+ add w24,w24,w28 // future e+=K
+ eor w11,w11,w3
+ add w20,w20,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w22,w22,#2
+ eor w11,w11,w8
+ add w24,w24,w10 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w11,w11,#31
+ orr w25,w21,w22
+ and w26,w21,w22
+ eor w12,w12,w14
+ ror w27,w20,#27
+ and w25,w25,w23
+ add w23,w23,w28 // future e+=K
+ eor w12,w12,w4
+ add w24,w24,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w21,w21,#2
+ eor w12,w12,w9
+ add w23,w23,w11 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w12,w12,#31
+ orr w25,w20,w21
+ and w26,w20,w21
+ eor w13,w13,w15
+ ror w27,w24,#27
+ and w25,w25,w22
+ add w22,w22,w28 // future e+=K
+ eor w13,w13,w5
+ add w23,w23,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w20,w20,#2
+ eor w13,w13,w10
+ add w22,w22,w12 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w13,w13,#31
+ orr w25,w24,w20
+ and w26,w24,w20
+ eor w14,w14,w16
+ ror w27,w23,#27
+ and w25,w25,w21
+ add w21,w21,w28 // future e+=K
+ eor w14,w14,w6
+ add w22,w22,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w24,w24,#2
+ eor w14,w14,w11
+ add w21,w21,w13 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w14,w14,#31
+ orr w25,w23,w24
+ and w26,w23,w24
+ eor w15,w15,w17
+ ror w27,w22,#27
+ and w25,w25,w20
+ add w20,w20,w28 // future e+=K
+ eor w15,w15,w7
+ add w21,w21,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w23,w23,#2
+ eor w15,w15,w12
+ add w20,w20,w14 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w15,w15,#31
+ movz w28,#0xc1d6
+ movk w28,#0xca62,lsl#16
+ orr w25,w22,w23
+ and w26,w22,w23
+ eor w16,w16,w19
+ ror w27,w21,#27
+ and w25,w25,w24
+ add w24,w24,w28 // future e+=K
+ eor w16,w16,w8
+ add w20,w20,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w22,w22,#2
+ eor w16,w16,w13
+ add w24,w24,w15 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w16,w16,#31
+ eor w17,w17,w3
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w17,w17,w9
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w17,w17,w14
+ add w23,w23,w16 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w17,w17,#31
+ eor w19,w19,w4
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w19,w19,w10
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w19,w19,w15
+ add w22,w22,w17 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w19,w19,#31
+ eor w3,w3,w5
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w3,w3,w11
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w3,w3,w16
+ add w21,w21,w19 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w3,w3,#31
+ eor w4,w4,w6
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w4,w4,w12
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w4,w4,w17
+ add w20,w20,w3 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w4,w4,#31
+ eor w5,w5,w7
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w5,w5,w13
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w5,w5,w19
+ add w24,w24,w4 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w5,w5,#31
+ eor w6,w6,w8
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w6,w6,w14
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w6,w6,w3
+ add w23,w23,w5 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w6,w6,#31
+ eor w7,w7,w9
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w7,w7,w15
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w7,w7,w4
+ add w22,w22,w6 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w7,w7,#31
+ eor w8,w8,w10
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w8,w8,w16
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w8,w8,w5
+ add w21,w21,w7 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w8,w8,#31
+ eor w9,w9,w11
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w9,w9,w17
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w9,w9,w6
+ add w20,w20,w8 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w9,w9,#31
+ eor w10,w10,w12
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w10,w10,w19
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w10,w10,w7
+ add w24,w24,w9 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w10,w10,#31
+ eor w11,w11,w13
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w11,w11,w3
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w11,w11,w8
+ add w23,w23,w10 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w11,w11,#31
+ eor w12,w12,w14
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w12,w12,w4
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w12,w12,w9
+ add w22,w22,w11 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w12,w12,#31
+ eor w13,w13,w15
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w13,w13,w5
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w13,w13,w10
+ add w21,w21,w12 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w13,w13,#31
+ eor w14,w14,w16
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w14,w14,w6
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w14,w14,w11
+ add w20,w20,w13 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w14,w14,#31
+ eor w15,w15,w17
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w15,w15,w7
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w15,w15,w12
+ add w24,w24,w14 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w15,w15,#31
+ eor w16,w16,w19
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w16,w16,w8
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w16,w16,w13
+ add w23,w23,w15 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w16,w16,#31
+ eor w17,w17,w3
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w17,w17,w9
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w17,w17,w14
+ add w22,w22,w16 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w17,w17,#31
+ eor w19,w19,w4
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w19,w19,w10
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w19,w19,w15
+ add w21,w21,w17 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w19,w19,#31
+ ldp w4,w5,[x0]
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ add w20,w20,w19 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ldp w6,w7,[x0,#8]
+ eor w25,w24,w22
+ ror w27,w21,#27
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ ldr w8,[x0,#16]
+ add w20,w20,w25 // e+=F(b,c,d)
+ add w21,w21,w5
+ add w22,w22,w6
+ add w20,w20,w4
+ add w23,w23,w7
+ add w24,w24,w8
+ stp w20,w21,[x0]
+ stp w22,w23,[x0,#8]
+ str w24,[x0,#16]
+ cbnz x2,.Loop
+
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldp x25,x26,[sp,#64]
+ ldp x27,x28,[sp,#80]
+ ldr x29,[sp],#96
+ ret
+.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
+.globl sha1_block_data_order_hw
+.hidden sha1_block_data_order_hw
+.type sha1_block_data_order_hw,%function
+.align 6
+sha1_block_data_order_hw:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ adrp x4,.Lconst
+ add x4,x4,:lo12:.Lconst
+ eor v1.16b,v1.16b,v1.16b
+ ld1 {v0.4s},[x0],#16
+ ld1 {v1.s}[0],[x0]
+ sub x0,x0,#16
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x4]
+
+.Loop_hw:
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+ sub x2,x2,#1
+ rev32 v4.16b,v4.16b
+ rev32 v5.16b,v5.16b
+
+ add v20.4s,v16.4s,v4.4s
+ rev32 v6.16b,v6.16b
+ orr v22.16b,v0.16b,v0.16b // offload
+
+ add v21.4s,v16.4s,v5.4s
+ rev32 v7.16b,v7.16b
+.inst 0x5e280803 //sha1h v3.16b,v0.16b
+.inst 0x5e140020 //sha1c v0.16b,v1.16b,v20.4s // 0
+ add v20.4s,v16.4s,v6.4s
+.inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
+.inst 0x5e280802 //sha1h v2.16b,v0.16b // 1
+.inst 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s
+ add v21.4s,v16.4s,v7.4s
+.inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b
+.inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
+.inst 0x5e280803 //sha1h v3.16b,v0.16b // 2
+.inst 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s
+ add v20.4s,v16.4s,v4.4s
+.inst 0x5e281885 //sha1su1 v5.16b,v4.16b
+.inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
+.inst 0x5e280802 //sha1h v2.16b,v0.16b // 3
+.inst 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s
+ add v21.4s,v17.4s,v5.4s
+.inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b
+.inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
+.inst 0x5e280803 //sha1h v3.16b,v0.16b // 4
+.inst 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s
+ add v20.4s,v17.4s,v6.4s
+.inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b
+.inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
+.inst 0x5e280802 //sha1h v2.16b,v0.16b // 5
+.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v17.4s,v7.4s
+.inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b
+.inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
+.inst 0x5e280803 //sha1h v3.16b,v0.16b // 6
+.inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
+ add v20.4s,v17.4s,v4.4s
+.inst 0x5e281885 //sha1su1 v5.16b,v4.16b
+.inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
+.inst 0x5e280802 //sha1h v2.16b,v0.16b // 7
+.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v17.4s,v5.4s
+.inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b
+.inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
+.inst 0x5e280803 //sha1h v3.16b,v0.16b // 8
+.inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
+ add v20.4s,v18.4s,v6.4s
+.inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b
+.inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
+.inst 0x5e280802 //sha1h v2.16b,v0.16b // 9
+.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v18.4s,v7.4s
+.inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b
+.inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
+.inst 0x5e280803 //sha1h v3.16b,v0.16b // 10
+.inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s
+ add v20.4s,v18.4s,v4.4s
+.inst 0x5e281885 //sha1su1 v5.16b,v4.16b
+.inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
+.inst 0x5e280802 //sha1h v2.16b,v0.16b // 11
+.inst 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s
+ add v21.4s,v18.4s,v5.4s
+.inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b
+.inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
+.inst 0x5e280803 //sha1h v3.16b,v0.16b // 12
+.inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s
+ add v20.4s,v18.4s,v6.4s
+.inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b
+.inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
+.inst 0x5e280802 //sha1h v2.16b,v0.16b // 13
+.inst 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s
+ add v21.4s,v19.4s,v7.4s
+.inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b
+.inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
+.inst 0x5e280803 //sha1h v3.16b,v0.16b // 14
+.inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s
+ add v20.4s,v19.4s,v4.4s
+.inst 0x5e281885 //sha1su1 v5.16b,v4.16b
+.inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
+.inst 0x5e280802 //sha1h v2.16b,v0.16b // 15
+.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v19.4s,v5.4s
+.inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b
+.inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
+.inst 0x5e280803 //sha1h v3.16b,v0.16b // 16
+.inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
+ add v20.4s,v19.4s,v6.4s
+.inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b
+.inst 0x5e280802 //sha1h v2.16b,v0.16b // 17
+.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v19.4s,v7.4s
+
+.inst 0x5e280803 //sha1h v3.16b,v0.16b // 18
+.inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
+
+.inst 0x5e280802 //sha1h v2.16b,v0.16b // 19
+.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+
+ add v1.4s,v1.4s,v2.4s
+ add v0.4s,v0.4s,v22.4s
+
+ cbnz x2,.Loop_hw
+
+ st1 {v0.4s},[x0],#16
+ st1 {v1.s}[0],[x0]
+
+ ldr x29,[sp],#16
+ ret
+.size sha1_block_data_order_hw,.-sha1_block_data_order_hw
+.section .rodata
+.align 6
+.Lconst:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79
+.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
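For readers skimming the generated output above: both entry points implement the standard SHA-1 compression function. The `_nohw` path is a fully unrolled scalar version, where the `// e+=rot(a,5)`, `// e+=F(b,c,d)` and `movz`/`movk` round-constant comments mark the per-round update, and the `_hw` path drives the ARMv8 SHA-1 crypto instructions (sha1h/sha1c/sha1p/sha1m/sha1su0/sha1su1), emitted as raw `.inst`/`.long` encodings with the mnemonic preserved in a comment. As a reading aid only, here is a minimal C sketch of the per-block computation the scalar code unrolls; the function and variable names (`sha1_block`, `rotl`, `w`) are illustrative and are not BoringSSL API.

```c
#include <stdint.h>

static uint32_t rotl(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

// Illustrative sketch of one SHA-1 block (FIPS 180-4), matching the structure
// the generated assembly unrolls. h[5] is the running state, block is 64 bytes.
static void sha1_block(uint32_t h[5], const uint8_t block[64]) {
  uint32_t w[80];
  for (int i = 0; i < 16; i++) {
    w[i] = (uint32_t)block[4 * i] << 24 | (uint32_t)block[4 * i + 1] << 16 |
           (uint32_t)block[4 * i + 2] << 8 | block[4 * i + 3];
  }
  // Message schedule. The assembly computes this in place, interleaved with
  // the rounds (the eor/eor/eor + "ror wN,wN,#31" sequences, i.e. rotate-left
  // by 1).
  for (int i = 16; i < 80; i++) {
    w[i] = rotl(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1);
  }
  uint32_t a = h[0], b = h[1], c = h[2], d = h[3], e = h[4];
  for (int i = 0; i < 80; i++) {
    uint32_t f, k;
    if (i < 20) {        // Ch, K_00_19 (the bic/and/orr pattern)
      f = (b & c) | (~b & d);          k = 0x5a827999;
    } else if (i < 40) { // Parity, K_20_39 (the eor/eor pattern)
      f = b ^ c ^ d;                   k = 0x6ed9eba1;
    } else if (i < 60) { // Maj, K_40_59 (the orr/and/and/orr pattern)
      f = (b & c) | (b & d) | (c & d); k = 0x8f1bbcdc;
    } else {             // Parity, K_60_79
      f = b ^ c ^ d;                   k = 0xca62c1d6;
    }
    // "e += rot(a,5) + F(b,c,d) + K + X[i]" per the assembly comments.
    uint32_t t = rotl(a, 5) + f + e + k + w[i];
    e = d; d = c; c = rotl(b, 30); b = a; a = t;
  }
  h[0] += a; h[1] += b; h[2] += c; h[3] += d; h[4] += e;
}
```

The round constants here are exactly the four values in the `.Lconst` table (K_00_19 through K_60_79); the generated code materializes them with `movz`/`movk` in the scalar path and loads them as vectors in the `_hw` path.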
diff --git a/gen/bcm/sha1-armv8-win.S b/gen/bcm/sha1-armv8-win.S
new file mode 100644
index 0000000..f8c8b86
--- /dev/null
+++ b/gen/bcm/sha1-armv8-win.S
@@ -0,0 +1,1222 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl sha1_block_data_order_nohw
+
+.def sha1_block_data_order_nohw
+ .type 32
+.endef
+.align 6
+sha1_block_data_order_nohw:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+ ldp w20,w21,[x0]
+ ldp w22,w23,[x0,#8]
+ ldr w24,[x0,#16]
+
+Loop:
+ ldr x3,[x1],#64
+ movz w28,#0x7999
+ sub x2,x2,#1
+ movk w28,#0x5a82,lsl#16
+#ifdef __AARCH64EB__
+ ror x3,x3,#32
+#else
+ rev32 x3,x3
+#endif
+ add w24,w24,w28 // warm it up
+ add w24,w24,w3
+ lsr x4,x3,#32
+ ldr x5,[x1,#-56]
+ bic w25,w23,w21
+ and w26,w22,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ orr w25,w25,w26
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ add w23,w23,w4 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x5,x5,#32
+#else
+ rev32 x5,x5
+#endif
+ bic w25,w22,w20
+ and w26,w21,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ orr w25,w25,w26
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ add w22,w22,w5 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ lsr x6,x5,#32
+ ldr x7,[x1,#-48]
+ bic w25,w21,w24
+ and w26,w20,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ orr w25,w25,w26
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ add w21,w21,w6 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x7,x7,#32
+#else
+ rev32 x7,x7
+#endif
+ bic w25,w20,w23
+ and w26,w24,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ orr w25,w25,w26
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ add w20,w20,w7 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ lsr x8,x7,#32
+ ldr x9,[x1,#-40]
+ bic w25,w24,w22
+ and w26,w23,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ orr w25,w25,w26
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ add w24,w24,w8 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x9,x9,#32
+#else
+ rev32 x9,x9
+#endif
+ bic w25,w23,w21
+ and w26,w22,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ orr w25,w25,w26
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ add w23,w23,w9 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ lsr x10,x9,#32
+ ldr x11,[x1,#-32]
+ bic w25,w22,w20
+ and w26,w21,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ orr w25,w25,w26
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ add w22,w22,w10 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x11,x11,#32
+#else
+ rev32 x11,x11
+#endif
+ bic w25,w21,w24
+ and w26,w20,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ orr w25,w25,w26
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ add w21,w21,w11 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ lsr x12,x11,#32
+ ldr x13,[x1,#-24]
+ bic w25,w20,w23
+ and w26,w24,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ orr w25,w25,w26
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ add w20,w20,w12 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x13,x13,#32
+#else
+ rev32 x13,x13
+#endif
+ bic w25,w24,w22
+ and w26,w23,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ orr w25,w25,w26
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ add w24,w24,w13 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ lsr x14,x13,#32
+ ldr x15,[x1,#-16]
+ bic w25,w23,w21
+ and w26,w22,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ orr w25,w25,w26
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ add w23,w23,w14 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x15,x15,#32
+#else
+ rev32 x15,x15
+#endif
+ bic w25,w22,w20
+ and w26,w21,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ orr w25,w25,w26
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ add w22,w22,w15 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ lsr x16,x15,#32
+ ldr x17,[x1,#-8]
+ bic w25,w21,w24
+ and w26,w20,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ orr w25,w25,w26
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ add w21,w21,w16 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x17,x17,#32
+#else
+ rev32 x17,x17
+#endif
+ bic w25,w20,w23
+ and w26,w24,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ orr w25,w25,w26
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ add w20,w20,w17 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ lsr x19,x17,#32
+ eor w3,w3,w5
+ bic w25,w24,w22
+ and w26,w23,w22
+ ror w27,w21,#27
+ eor w3,w3,w11
+ add w24,w24,w28 // future e+=K
+ orr w25,w25,w26
+ add w20,w20,w27 // e+=rot(a,5)
+ eor w3,w3,w16
+ ror w22,w22,#2
+ add w24,w24,w19 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w3,w3,#31
+ eor w4,w4,w6
+ bic w25,w23,w21
+ and w26,w22,w21
+ ror w27,w20,#27
+ eor w4,w4,w12
+ add w23,w23,w28 // future e+=K
+ orr w25,w25,w26
+ add w24,w24,w27 // e+=rot(a,5)
+ eor w4,w4,w17
+ ror w21,w21,#2
+ add w23,w23,w3 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w4,w4,#31
+ eor w5,w5,w7
+ bic w25,w22,w20
+ and w26,w21,w20
+ ror w27,w24,#27
+ eor w5,w5,w13
+ add w22,w22,w28 // future e+=K
+ orr w25,w25,w26
+ add w23,w23,w27 // e+=rot(a,5)
+ eor w5,w5,w19
+ ror w20,w20,#2
+ add w22,w22,w4 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w5,w5,#31
+ eor w6,w6,w8
+ bic w25,w21,w24
+ and w26,w20,w24
+ ror w27,w23,#27
+ eor w6,w6,w14
+ add w21,w21,w28 // future e+=K
+ orr w25,w25,w26
+ add w22,w22,w27 // e+=rot(a,5)
+ eor w6,w6,w3
+ ror w24,w24,#2
+ add w21,w21,w5 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w6,w6,#31
+ eor w7,w7,w9
+ bic w25,w20,w23
+ and w26,w24,w23
+ ror w27,w22,#27
+ eor w7,w7,w15
+ add w20,w20,w28 // future e+=K
+ orr w25,w25,w26
+ add w21,w21,w27 // e+=rot(a,5)
+ eor w7,w7,w4
+ ror w23,w23,#2
+ add w20,w20,w6 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w7,w7,#31
+ movz w28,#0xeba1
+ movk w28,#0x6ed9,lsl#16
+ eor w8,w8,w10
+ bic w25,w24,w22
+ and w26,w23,w22
+ ror w27,w21,#27
+ eor w8,w8,w16
+ add w24,w24,w28 // future e+=K
+ orr w25,w25,w26
+ add w20,w20,w27 // e+=rot(a,5)
+ eor w8,w8,w5
+ ror w22,w22,#2
+ add w24,w24,w7 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w8,w8,#31
+ eor w9,w9,w11
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w9,w9,w17
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w9,w9,w6
+ add w23,w23,w8 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w9,w9,#31
+ eor w10,w10,w12
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w10,w10,w19
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w10,w10,w7
+ add w22,w22,w9 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w10,w10,#31
+ eor w11,w11,w13
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w11,w11,w3
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w11,w11,w8
+ add w21,w21,w10 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w11,w11,#31
+ eor w12,w12,w14
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w12,w12,w4
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w12,w12,w9
+ add w20,w20,w11 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w12,w12,#31
+ eor w13,w13,w15
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w13,w13,w5
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w13,w13,w10
+ add w24,w24,w12 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w13,w13,#31
+ eor w14,w14,w16
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w14,w14,w6
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w14,w14,w11
+ add w23,w23,w13 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w14,w14,#31
+ eor w15,w15,w17
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w15,w15,w7
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w15,w15,w12
+ add w22,w22,w14 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w15,w15,#31
+ eor w16,w16,w19
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w16,w16,w8
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w16,w16,w13
+ add w21,w21,w15 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w16,w16,#31
+ eor w17,w17,w3
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w17,w17,w9
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w17,w17,w14
+ add w20,w20,w16 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w17,w17,#31
+ eor w19,w19,w4
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w19,w19,w10
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w19,w19,w15
+ add w24,w24,w17 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w19,w19,#31
+ eor w3,w3,w5
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w3,w3,w11
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w3,w3,w16
+ add w23,w23,w19 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w3,w3,#31
+ eor w4,w4,w6
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w4,w4,w12
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w4,w4,w17
+ add w22,w22,w3 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w4,w4,#31
+ eor w5,w5,w7
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w5,w5,w13
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w5,w5,w19
+ add w21,w21,w4 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w5,w5,#31
+ eor w6,w6,w8
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w6,w6,w14
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w6,w6,w3
+ add w20,w20,w5 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w6,w6,#31
+ eor w7,w7,w9
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w7,w7,w15
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w7,w7,w4
+ add w24,w24,w6 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w7,w7,#31
+ eor w8,w8,w10
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w8,w8,w16
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w8,w8,w5
+ add w23,w23,w7 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w8,w8,#31
+ eor w9,w9,w11
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w9,w9,w17
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w9,w9,w6
+ add w22,w22,w8 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w9,w9,#31
+ eor w10,w10,w12
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w10,w10,w19
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w10,w10,w7
+ add w21,w21,w9 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w10,w10,#31
+ eor w11,w11,w13
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w11,w11,w3
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w11,w11,w8
+ add w20,w20,w10 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w11,w11,#31
+ movz w28,#0xbcdc
+ movk w28,#0x8f1b,lsl#16
+ eor w12,w12,w14
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w12,w12,w4
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w12,w12,w9
+ add w24,w24,w11 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w12,w12,#31
+ orr w25,w21,w22
+ and w26,w21,w22
+ eor w13,w13,w15
+ ror w27,w20,#27
+ and w25,w25,w23
+ add w23,w23,w28 // future e+=K
+ eor w13,w13,w5
+ add w24,w24,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w21,w21,#2
+ eor w13,w13,w10
+ add w23,w23,w12 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w13,w13,#31
+ orr w25,w20,w21
+ and w26,w20,w21
+ eor w14,w14,w16
+ ror w27,w24,#27
+ and w25,w25,w22
+ add w22,w22,w28 // future e+=K
+ eor w14,w14,w6
+ add w23,w23,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w20,w20,#2
+ eor w14,w14,w11
+ add w22,w22,w13 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w14,w14,#31
+ orr w25,w24,w20
+ and w26,w24,w20
+ eor w15,w15,w17
+ ror w27,w23,#27
+ and w25,w25,w21
+ add w21,w21,w28 // future e+=K
+ eor w15,w15,w7
+ add w22,w22,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w24,w24,#2
+ eor w15,w15,w12
+ add w21,w21,w14 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w15,w15,#31
+ orr w25,w23,w24
+ and w26,w23,w24
+ eor w16,w16,w19
+ ror w27,w22,#27
+ and w25,w25,w20
+ add w20,w20,w28 // future e+=K
+ eor w16,w16,w8
+ add w21,w21,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w23,w23,#2
+ eor w16,w16,w13
+ add w20,w20,w15 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w16,w16,#31
+ orr w25,w22,w23
+ and w26,w22,w23
+ eor w17,w17,w3
+ ror w27,w21,#27
+ and w25,w25,w24
+ add w24,w24,w28 // future e+=K
+ eor w17,w17,w9
+ add w20,w20,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w22,w22,#2
+ eor w17,w17,w14
+ add w24,w24,w16 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w17,w17,#31
+ orr w25,w21,w22
+ and w26,w21,w22
+ eor w19,w19,w4
+ ror w27,w20,#27
+ and w25,w25,w23
+ add w23,w23,w28 // future e+=K
+ eor w19,w19,w10
+ add w24,w24,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w21,w21,#2
+ eor w19,w19,w15
+ add w23,w23,w17 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w19,w19,#31
+ orr w25,w20,w21
+ and w26,w20,w21
+ eor w3,w3,w5
+ ror w27,w24,#27
+ and w25,w25,w22
+ add w22,w22,w28 // future e+=K
+ eor w3,w3,w11
+ add w23,w23,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w20,w20,#2
+ eor w3,w3,w16
+ add w22,w22,w19 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w3,w3,#31
+ orr w25,w24,w20
+ and w26,w24,w20
+ eor w4,w4,w6
+ ror w27,w23,#27
+ and w25,w25,w21
+ add w21,w21,w28 // future e+=K
+ eor w4,w4,w12
+ add w22,w22,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w24,w24,#2
+ eor w4,w4,w17
+ add w21,w21,w3 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w4,w4,#31
+ orr w25,w23,w24
+ and w26,w23,w24
+ eor w5,w5,w7
+ ror w27,w22,#27
+ and w25,w25,w20
+ add w20,w20,w28 // future e+=K
+ eor w5,w5,w13
+ add w21,w21,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w23,w23,#2
+ eor w5,w5,w19
+ add w20,w20,w4 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w5,w5,#31
+ orr w25,w22,w23
+ and w26,w22,w23
+ eor w6,w6,w8
+ ror w27,w21,#27
+ and w25,w25,w24
+ add w24,w24,w28 // future e+=K
+ eor w6,w6,w14
+ add w20,w20,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w22,w22,#2
+ eor w6,w6,w3
+ add w24,w24,w5 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w6,w6,#31
+ orr w25,w21,w22
+ and w26,w21,w22
+ eor w7,w7,w9
+ ror w27,w20,#27
+ and w25,w25,w23
+ add w23,w23,w28 // future e+=K
+ eor w7,w7,w15
+ add w24,w24,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w21,w21,#2
+ eor w7,w7,w4
+ add w23,w23,w6 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w7,w7,#31
+ orr w25,w20,w21
+ and w26,w20,w21
+ eor w8,w8,w10
+ ror w27,w24,#27
+ and w25,w25,w22
+ add w22,w22,w28 // future e+=K
+ eor w8,w8,w16
+ add w23,w23,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w20,w20,#2
+ eor w8,w8,w5
+ add w22,w22,w7 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w8,w8,#31
+ orr w25,w24,w20
+ and w26,w24,w20
+ eor w9,w9,w11
+ ror w27,w23,#27
+ and w25,w25,w21
+ add w21,w21,w28 // future e+=K
+ eor w9,w9,w17
+ add w22,w22,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w24,w24,#2
+ eor w9,w9,w6
+ add w21,w21,w8 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w9,w9,#31
+ orr w25,w23,w24
+ and w26,w23,w24
+ eor w10,w10,w12
+ ror w27,w22,#27
+ and w25,w25,w20
+ add w20,w20,w28 // future e+=K
+ eor w10,w10,w19
+ add w21,w21,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w23,w23,#2
+ eor w10,w10,w7
+ add w20,w20,w9 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w10,w10,#31
+ orr w25,w22,w23
+ and w26,w22,w23
+ eor w11,w11,w13
+ ror w27,w21,#27
+ and w25,w25,w24
+ add w24,w24,w28 // future e+=K
+ eor w11,w11,w3
+ add w20,w20,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w22,w22,#2
+ eor w11,w11,w8
+ add w24,w24,w10 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w11,w11,#31
+ orr w25,w21,w22
+ and w26,w21,w22
+ eor w12,w12,w14
+ ror w27,w20,#27
+ and w25,w25,w23
+ add w23,w23,w28 // future e+=K
+ eor w12,w12,w4
+ add w24,w24,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w21,w21,#2
+ eor w12,w12,w9
+ add w23,w23,w11 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w12,w12,#31
+ orr w25,w20,w21
+ and w26,w20,w21
+ eor w13,w13,w15
+ ror w27,w24,#27
+ and w25,w25,w22
+ add w22,w22,w28 // future e+=K
+ eor w13,w13,w5
+ add w23,w23,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w20,w20,#2
+ eor w13,w13,w10
+ add w22,w22,w12 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w13,w13,#31
+ orr w25,w24,w20
+ and w26,w24,w20
+ eor w14,w14,w16
+ ror w27,w23,#27
+ and w25,w25,w21
+ add w21,w21,w28 // future e+=K
+ eor w14,w14,w6
+ add w22,w22,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w24,w24,#2
+ eor w14,w14,w11
+ add w21,w21,w13 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w14,w14,#31
+ orr w25,w23,w24
+ and w26,w23,w24
+ eor w15,w15,w17
+ ror w27,w22,#27
+ and w25,w25,w20
+ add w20,w20,w28 // future e+=K
+ eor w15,w15,w7
+ add w21,w21,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w23,w23,#2
+ eor w15,w15,w12
+ add w20,w20,w14 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w15,w15,#31
+ movz w28,#0xc1d6
+ movk w28,#0xca62,lsl#16
+ orr w25,w22,w23
+ and w26,w22,w23
+ eor w16,w16,w19
+ ror w27,w21,#27
+ and w25,w25,w24
+ add w24,w24,w28 // future e+=K
+ eor w16,w16,w8
+ add w20,w20,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w22,w22,#2
+ eor w16,w16,w13
+ add w24,w24,w15 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w16,w16,#31
+ eor w17,w17,w3
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w17,w17,w9
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w17,w17,w14
+ add w23,w23,w16 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w17,w17,#31
+ eor w19,w19,w4
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w19,w19,w10
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w19,w19,w15
+ add w22,w22,w17 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w19,w19,#31
+ eor w3,w3,w5
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w3,w3,w11
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w3,w3,w16
+ add w21,w21,w19 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w3,w3,#31
+ eor w4,w4,w6
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w4,w4,w12
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w4,w4,w17
+ add w20,w20,w3 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w4,w4,#31
+ eor w5,w5,w7
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w5,w5,w13
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w5,w5,w19
+ add w24,w24,w4 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w5,w5,#31
+ eor w6,w6,w8
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w6,w6,w14
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w6,w6,w3
+ add w23,w23,w5 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w6,w6,#31
+ eor w7,w7,w9
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w7,w7,w15
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w7,w7,w4
+ add w22,w22,w6 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w7,w7,#31
+ eor w8,w8,w10
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w8,w8,w16
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w8,w8,w5
+ add w21,w21,w7 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w8,w8,#31
+ eor w9,w9,w11
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w9,w9,w17
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w9,w9,w6
+ add w20,w20,w8 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w9,w9,#31
+ eor w10,w10,w12
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w10,w10,w19
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w10,w10,w7
+ add w24,w24,w9 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w10,w10,#31
+ eor w11,w11,w13
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w11,w11,w3
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w11,w11,w8
+ add w23,w23,w10 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w11,w11,#31
+ eor w12,w12,w14
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w12,w12,w4
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w12,w12,w9
+ add w22,w22,w11 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w12,w12,#31
+ eor w13,w13,w15
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w13,w13,w5
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w13,w13,w10
+ add w21,w21,w12 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w13,w13,#31
+ eor w14,w14,w16
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w14,w14,w6
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w14,w14,w11
+ add w20,w20,w13 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w14,w14,#31
+ eor w15,w15,w17
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w15,w15,w7
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w15,w15,w12
+ add w24,w24,w14 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w15,w15,#31
+ eor w16,w16,w19
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w16,w16,w8
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w16,w16,w13
+ add w23,w23,w15 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w16,w16,#31
+ eor w17,w17,w3
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w17,w17,w9
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w17,w17,w14
+ add w22,w22,w16 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w17,w17,#31
+ eor w19,w19,w4
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w19,w19,w10
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w19,w19,w15
+ add w21,w21,w17 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w19,w19,#31
+ ldp w4,w5,[x0]
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ add w20,w20,w19 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ldp w6,w7,[x0,#8]
+ eor w25,w24,w22
+ ror w27,w21,#27
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ ldr w8,[x0,#16]
+ add w20,w20,w25 // e+=F(b,c,d)
+ add w21,w21,w5
+ add w22,w22,w6
+ add w20,w20,w4
+ add w23,w23,w7
+ add w24,w24,w8
+ stp w20,w21,[x0]
+ stp w22,w23,[x0,#8]
+ str w24,[x0,#16]
+ cbnz x2,Loop
+
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldp x25,x26,[sp,#64]
+ ldp x27,x28,[sp,#80]
+ ldr x29,[sp],#96
+ ret
+
+.globl sha1_block_data_order_hw
+
+.def sha1_block_data_order_hw
+ .type 32
+.endef
+.align 6
+sha1_block_data_order_hw:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ adrp x4,Lconst
+ add x4,x4,:lo12:Lconst
+ eor v1.16b,v1.16b,v1.16b
+ ld1 {v0.4s},[x0],#16
+ ld1 {v1.s}[0],[x0]
+ sub x0,x0,#16
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x4]
+
+Loop_hw:
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+ sub x2,x2,#1
+ rev32 v4.16b,v4.16b
+ rev32 v5.16b,v5.16b
+
+ add v20.4s,v16.4s,v4.4s
+ rev32 v6.16b,v6.16b
+ orr v22.16b,v0.16b,v0.16b // offload
+
+ add v21.4s,v16.4s,v5.4s
+ rev32 v7.16b,v7.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b
+.long 0x5e140020 //sha1c v0.16b,v1.16b,v20.4s // 0
+ add v20.4s,v16.4s,v6.4s
+.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 1
+.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s
+ add v21.4s,v16.4s,v7.4s
+.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b
+.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 2
+.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s
+ add v20.4s,v16.4s,v4.4s
+.long 0x5e281885 //sha1su1 v5.16b,v4.16b
+.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 3
+.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s
+ add v21.4s,v17.4s,v5.4s
+.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b
+.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 4
+.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s
+ add v20.4s,v17.4s,v6.4s
+.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b
+.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 5
+.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v17.4s,v7.4s
+.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b
+.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 6
+.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
+ add v20.4s,v17.4s,v4.4s
+.long 0x5e281885 //sha1su1 v5.16b,v4.16b
+.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 7
+.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v17.4s,v5.4s
+.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b
+.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 8
+.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
+ add v20.4s,v18.4s,v6.4s
+.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b
+.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 9
+.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v18.4s,v7.4s
+.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b
+.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 10
+.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s
+ add v20.4s,v18.4s,v4.4s
+.long 0x5e281885 //sha1su1 v5.16b,v4.16b
+.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 11
+.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s
+ add v21.4s,v18.4s,v5.4s
+.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b
+.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 12
+.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s
+ add v20.4s,v18.4s,v6.4s
+.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b
+.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 13
+.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s
+ add v21.4s,v19.4s,v7.4s
+.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b
+.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 14
+.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s
+ add v20.4s,v19.4s,v4.4s
+.long 0x5e281885 //sha1su1 v5.16b,v4.16b
+.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 15
+.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v19.4s,v5.4s
+.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b
+.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 16
+.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
+ add v20.4s,v19.4s,v6.4s
+.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 17
+.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v19.4s,v7.4s
+
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 18
+.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
+
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 19
+.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+
+ add v1.4s,v1.4s,v2.4s
+ add v0.4s,v0.4s,v22.4s
+
+ cbnz x2,Loop_hw
+
+ st1 {v0.4s},[x0],#16
+ st1 {v1.s}[0],[x0]
+
+ ldr x29,[sp],#16
+ ret
+
+.section .rodata
+.align 6
+Lconst:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79
+.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/sha1-x86_64-apple.S b/gen/bcm/sha1-x86_64-apple.S
new file mode 100644
index 0000000..a1ea1e6
--- /dev/null
+++ b/gen/bcm/sha1-x86_64-apple.S
@@ -0,0 +1,5450 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text
+
+.globl _sha1_block_data_order_nohw
+.private_extern _sha1_block_data_order_nohw
+
+.p2align 4
+_sha1_block_data_order_nohw:
+
+_CET_ENDBR
+ movq %rsp,%rax
+
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ movq %rdi,%r8
+ subq $72,%rsp
+ movq %rsi,%r9
+ andq $-64,%rsp
+ movq %rdx,%r10
+ movq %rax,64(%rsp)
+
+L$prologue:
+
+ movl 0(%r8),%esi
+ movl 4(%r8),%edi
+ movl 8(%r8),%r11d
+ movl 12(%r8),%r12d
+ movl 16(%r8),%r13d
+ jmp L$loop
+
+.p2align 4
+L$loop:
+ movl 0(%r9),%edx
+ bswapl %edx
+ movl 4(%r9),%ebp
+ movl %r12d,%eax
+ movl %edx,0(%rsp)
+ movl %esi,%ecx
+ bswapl %ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ andl %edi,%eax
+ leal 1518500249(%rdx,%r13,1),%r13d
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl 8(%r9),%r14d
+ movl %r11d,%eax
+ movl %ebp,4(%rsp)
+ movl %r13d,%ecx
+ bswapl %r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ andl %esi,%eax
+ leal 1518500249(%rbp,%r12,1),%r12d
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl 12(%r9),%edx
+ movl %edi,%eax
+ movl %r14d,8(%rsp)
+ movl %r12d,%ecx
+ bswapl %edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ andl %r13d,%eax
+ leal 1518500249(%r14,%r11,1),%r11d
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl 16(%r9),%ebp
+ movl %esi,%eax
+ movl %edx,12(%rsp)
+ movl %r11d,%ecx
+ bswapl %ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ andl %r12d,%eax
+ leal 1518500249(%rdx,%rdi,1),%edi
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl 20(%r9),%r14d
+ movl %r13d,%eax
+ movl %ebp,16(%rsp)
+ movl %edi,%ecx
+ bswapl %r14d
+ xorl %r12d,%eax
+ roll $5,%ecx
+ andl %r11d,%eax
+ leal 1518500249(%rbp,%rsi,1),%esi
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
+ movl 24(%r9),%edx
+ movl %r12d,%eax
+ movl %r14d,20(%rsp)
+ movl %esi,%ecx
+ bswapl %edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ andl %edi,%eax
+ leal 1518500249(%r14,%r13,1),%r13d
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl 28(%r9),%ebp
+ movl %r11d,%eax
+ movl %edx,24(%rsp)
+ movl %r13d,%ecx
+ bswapl %ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ andl %esi,%eax
+ leal 1518500249(%rdx,%r12,1),%r12d
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl 32(%r9),%r14d
+ movl %edi,%eax
+ movl %ebp,28(%rsp)
+ movl %r12d,%ecx
+ bswapl %r14d
+ xorl %esi,%eax
+ roll $5,%ecx
+ andl %r13d,%eax
+ leal 1518500249(%rbp,%r11,1),%r11d
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl 36(%r9),%edx
+ movl %esi,%eax
+ movl %r14d,32(%rsp)
+ movl %r11d,%ecx
+ bswapl %edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ andl %r12d,%eax
+ leal 1518500249(%r14,%rdi,1),%edi
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl 40(%r9),%ebp
+ movl %r13d,%eax
+ movl %edx,36(%rsp)
+ movl %edi,%ecx
+ bswapl %ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ andl %r11d,%eax
+ leal 1518500249(%rdx,%rsi,1),%esi
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
+ movl 44(%r9),%r14d
+ movl %r12d,%eax
+ movl %ebp,40(%rsp)
+ movl %esi,%ecx
+ bswapl %r14d
+ xorl %r11d,%eax
+ roll $5,%ecx
+ andl %edi,%eax
+ leal 1518500249(%rbp,%r13,1),%r13d
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl 48(%r9),%edx
+ movl %r11d,%eax
+ movl %r14d,44(%rsp)
+ movl %r13d,%ecx
+ bswapl %edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ andl %esi,%eax
+ leal 1518500249(%r14,%r12,1),%r12d
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl 52(%r9),%ebp
+ movl %edi,%eax
+ movl %edx,48(%rsp)
+ movl %r12d,%ecx
+ bswapl %ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ andl %r13d,%eax
+ leal 1518500249(%rdx,%r11,1),%r11d
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl 56(%r9),%r14d
+ movl %esi,%eax
+ movl %ebp,52(%rsp)
+ movl %r11d,%ecx
+ bswapl %r14d
+ xorl %r13d,%eax
+ roll $5,%ecx
+ andl %r12d,%eax
+ leal 1518500249(%rbp,%rdi,1),%edi
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl 60(%r9),%edx
+ movl %r13d,%eax
+ movl %r14d,56(%rsp)
+ movl %edi,%ecx
+ bswapl %edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ andl %r11d,%eax
+ leal 1518500249(%r14,%rsi,1),%esi
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
+ xorl 0(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,60(%rsp)
+ movl %esi,%ecx
+ xorl 8(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 32(%rsp),%ebp
+ andl %edi,%eax
+ leal 1518500249(%rdx,%r13,1),%r13d
+ roll $30,%edi
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ roll $1,%ebp
+ addl %eax,%r13d
+ xorl 4(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,0(%rsp)
+ movl %r13d,%ecx
+ xorl 12(%rsp),%r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%r14d
+ andl %esi,%eax
+ leal 1518500249(%rbp,%r12,1),%r12d
+ roll $30,%esi
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ roll $1,%r14d
+ addl %eax,%r12d
+ xorl 8(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,4(%rsp)
+ movl %r12d,%ecx
+ xorl 16(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%edx
+ andl %r13d,%eax
+ leal 1518500249(%r14,%r11,1),%r11d
+ roll $30,%r13d
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ roll $1,%edx
+ addl %eax,%r11d
+ xorl 12(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,8(%rsp)
+ movl %r11d,%ecx
+ xorl 20(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%ebp
+ andl %r12d,%eax
+ leal 1518500249(%rdx,%rdi,1),%edi
+ roll $30,%r12d
+ xorl %esi,%eax
+ addl %ecx,%edi
+ roll $1,%ebp
+ addl %eax,%edi
+ xorl 16(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,12(%rsp)
+ movl %edi,%ecx
+ xorl 24(%rsp),%r14d
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 48(%rsp),%r14d
+ andl %r11d,%eax
+ leal 1518500249(%rbp,%rsi,1),%esi
+ roll $30,%r11d
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ roll $1,%r14d
+ addl %eax,%esi
+ xorl 20(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,16(%rsp)
+ movl %esi,%ecx
+ xorl 28(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%edx
+ leal 1859775393(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ xorl 24(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,20(%rsp)
+ movl %r13d,%ecx
+ xorl 32(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%ebp
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ xorl 28(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,24(%rsp)
+ movl %r12d,%ecx
+ xorl 36(%rsp),%r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%r14d
+ leal 1859775393(%rbp,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%r14d
+ xorl 32(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,28(%rsp)
+ movl %r11d,%ecx
+ xorl 40(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 0(%rsp),%edx
+ leal 1859775393(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ xorl 36(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,32(%rsp)
+ movl %edi,%ecx
+ xorl 44(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 4(%rsp),%ebp
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ xorl 40(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,36(%rsp)
+ movl %esi,%ecx
+ xorl 48(%rsp),%r14d
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 8(%rsp),%r14d
+ leal 1859775393(%rbp,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%r14d
+ xorl 44(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,40(%rsp)
+ movl %r13d,%ecx
+ xorl 52(%rsp),%edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 12(%rsp),%edx
+ leal 1859775393(%r14,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%edx
+ xorl 48(%rsp),%ebp
+ movl %r13d,%eax
+ movl %edx,44(%rsp)
+ movl %r12d,%ecx
+ xorl 56(%rsp),%ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 16(%rsp),%ebp
+ leal 1859775393(%rdx,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%ebp
+ xorl 52(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,48(%rsp)
+ movl %r11d,%ecx
+ xorl 60(%rsp),%r14d
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%r14d
+ leal 1859775393(%rbp,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%r14d
+ xorl 56(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,52(%rsp)
+ movl %edi,%ecx
+ xorl 0(%rsp),%edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 24(%rsp),%edx
+ leal 1859775393(%r14,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%edx
+ xorl 60(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,56(%rsp)
+ movl %esi,%ecx
+ xorl 4(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 28(%rsp),%ebp
+ leal 1859775393(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ xorl 0(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,60(%rsp)
+ movl %r13d,%ecx
+ xorl 8(%rsp),%r14d
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 32(%rsp),%r14d
+ leal 1859775393(%rbp,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%r14d
+ xorl 4(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,0(%rsp)
+ movl %r12d,%ecx
+ xorl 12(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%edx
+ leal 1859775393(%r14,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ xorl 8(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,4(%rsp)
+ movl %r11d,%ecx
+ xorl 16(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%ebp
+ leal 1859775393(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ xorl 12(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,8(%rsp)
+ movl %edi,%ecx
+ xorl 20(%rsp),%r14d
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%r14d
+ leal 1859775393(%rbp,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%r14d
+ xorl 16(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,12(%rsp)
+ movl %esi,%ecx
+ xorl 24(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 48(%rsp),%edx
+ leal 1859775393(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ xorl 20(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,16(%rsp)
+ movl %r13d,%ecx
+ xorl 28(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%ebp
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ xorl 24(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,20(%rsp)
+ movl %r12d,%ecx
+ xorl 32(%rsp),%r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%r14d
+ leal 1859775393(%rbp,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%r14d
+ xorl 28(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,24(%rsp)
+ movl %r11d,%ecx
+ xorl 36(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%edx
+ leal 1859775393(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ xorl 32(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,28(%rsp)
+ movl %edi,%ecx
+ xorl 40(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 0(%rsp),%ebp
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ xorl 36(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,32(%rsp)
+ movl %r12d,%ebx
+ xorl 44(%rsp),%r14d
+ andl %r11d,%eax
+ movl %esi,%ecx
+ xorl 4(%rsp),%r14d
+ leal -1894007588(%rbp,%r13,1),%r13d
+ xorl %r11d,%ebx
+ roll $5,%ecx
+ addl %eax,%r13d
+ roll $1,%r14d
+ andl %edi,%ebx
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 40(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,36(%rsp)
+ movl %r11d,%ebx
+ xorl 48(%rsp),%edx
+ andl %edi,%eax
+ movl %r13d,%ecx
+ xorl 8(%rsp),%edx
+ leal -1894007588(%r14,%r12,1),%r12d
+ xorl %edi,%ebx
+ roll $5,%ecx
+ addl %eax,%r12d
+ roll $1,%edx
+ andl %esi,%ebx
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 44(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,40(%rsp)
+ movl %edi,%ebx
+ xorl 52(%rsp),%ebp
+ andl %esi,%eax
+ movl %r12d,%ecx
+ xorl 12(%rsp),%ebp
+ leal -1894007588(%rdx,%r11,1),%r11d
+ xorl %esi,%ebx
+ roll $5,%ecx
+ addl %eax,%r11d
+ roll $1,%ebp
+ andl %r13d,%ebx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 48(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,44(%rsp)
+ movl %esi,%ebx
+ xorl 56(%rsp),%r14d
+ andl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 16(%rsp),%r14d
+ leal -1894007588(%rbp,%rdi,1),%edi
+ xorl %r13d,%ebx
+ roll $5,%ecx
+ addl %eax,%edi
+ roll $1,%r14d
+ andl %r12d,%ebx
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 52(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,48(%rsp)
+ movl %r13d,%ebx
+ xorl 60(%rsp),%edx
+ andl %r12d,%eax
+ movl %edi,%ecx
+ xorl 20(%rsp),%edx
+ leal -1894007588(%r14,%rsi,1),%esi
+ xorl %r12d,%ebx
+ roll $5,%ecx
+ addl %eax,%esi
+ roll $1,%edx
+ andl %r11d,%ebx
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 56(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,52(%rsp)
+ movl %r12d,%ebx
+ xorl 0(%rsp),%ebp
+ andl %r11d,%eax
+ movl %esi,%ecx
+ xorl 24(%rsp),%ebp
+ leal -1894007588(%rdx,%r13,1),%r13d
+ xorl %r11d,%ebx
+ roll $5,%ecx
+ addl %eax,%r13d
+ roll $1,%ebp
+ andl %edi,%ebx
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 60(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,56(%rsp)
+ movl %r11d,%ebx
+ xorl 4(%rsp),%r14d
+ andl %edi,%eax
+ movl %r13d,%ecx
+ xorl 28(%rsp),%r14d
+ leal -1894007588(%rbp,%r12,1),%r12d
+ xorl %edi,%ebx
+ roll $5,%ecx
+ addl %eax,%r12d
+ roll $1,%r14d
+ andl %esi,%ebx
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 0(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,60(%rsp)
+ movl %edi,%ebx
+ xorl 8(%rsp),%edx
+ andl %esi,%eax
+ movl %r12d,%ecx
+ xorl 32(%rsp),%edx
+ leal -1894007588(%r14,%r11,1),%r11d
+ xorl %esi,%ebx
+ roll $5,%ecx
+ addl %eax,%r11d
+ roll $1,%edx
+ andl %r13d,%ebx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 4(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,0(%rsp)
+ movl %esi,%ebx
+ xorl 12(%rsp),%ebp
+ andl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 36(%rsp),%ebp
+ leal -1894007588(%rdx,%rdi,1),%edi
+ xorl %r13d,%ebx
+ roll $5,%ecx
+ addl %eax,%edi
+ roll $1,%ebp
+ andl %r12d,%ebx
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 8(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,4(%rsp)
+ movl %r13d,%ebx
+ xorl 16(%rsp),%r14d
+ andl %r12d,%eax
+ movl %edi,%ecx
+ xorl 40(%rsp),%r14d
+ leal -1894007588(%rbp,%rsi,1),%esi
+ xorl %r12d,%ebx
+ roll $5,%ecx
+ addl %eax,%esi
+ roll $1,%r14d
+ andl %r11d,%ebx
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 12(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,8(%rsp)
+ movl %r12d,%ebx
+ xorl 20(%rsp),%edx
+ andl %r11d,%eax
+ movl %esi,%ecx
+ xorl 44(%rsp),%edx
+ leal -1894007588(%r14,%r13,1),%r13d
+ xorl %r11d,%ebx
+ roll $5,%ecx
+ addl %eax,%r13d
+ roll $1,%edx
+ andl %edi,%ebx
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 16(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,12(%rsp)
+ movl %r11d,%ebx
+ xorl 24(%rsp),%ebp
+ andl %edi,%eax
+ movl %r13d,%ecx
+ xorl 48(%rsp),%ebp
+ leal -1894007588(%rdx,%r12,1),%r12d
+ xorl %edi,%ebx
+ roll $5,%ecx
+ addl %eax,%r12d
+ roll $1,%ebp
+ andl %esi,%ebx
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 20(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,16(%rsp)
+ movl %edi,%ebx
+ xorl 28(%rsp),%r14d
+ andl %esi,%eax
+ movl %r12d,%ecx
+ xorl 52(%rsp),%r14d
+ leal -1894007588(%rbp,%r11,1),%r11d
+ xorl %esi,%ebx
+ roll $5,%ecx
+ addl %eax,%r11d
+ roll $1,%r14d
+ andl %r13d,%ebx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 24(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,20(%rsp)
+ movl %esi,%ebx
+ xorl 32(%rsp),%edx
+ andl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 56(%rsp),%edx
+ leal -1894007588(%r14,%rdi,1),%edi
+ xorl %r13d,%ebx
+ roll $5,%ecx
+ addl %eax,%edi
+ roll $1,%edx
+ andl %r12d,%ebx
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 28(%rsp),%ebp
+ movl %r13d,%eax
+ movl %edx,24(%rsp)
+ movl %r13d,%ebx
+ xorl 36(%rsp),%ebp
+ andl %r12d,%eax
+ movl %edi,%ecx
+ xorl 60(%rsp),%ebp
+ leal -1894007588(%rdx,%rsi,1),%esi
+ xorl %r12d,%ebx
+ roll $5,%ecx
+ addl %eax,%esi
+ roll $1,%ebp
+ andl %r11d,%ebx
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 32(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,28(%rsp)
+ movl %r12d,%ebx
+ xorl 40(%rsp),%r14d
+ andl %r11d,%eax
+ movl %esi,%ecx
+ xorl 0(%rsp),%r14d
+ leal -1894007588(%rbp,%r13,1),%r13d
+ xorl %r11d,%ebx
+ roll $5,%ecx
+ addl %eax,%r13d
+ roll $1,%r14d
+ andl %edi,%ebx
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 36(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,32(%rsp)
+ movl %r11d,%ebx
+ xorl 44(%rsp),%edx
+ andl %edi,%eax
+ movl %r13d,%ecx
+ xorl 4(%rsp),%edx
+ leal -1894007588(%r14,%r12,1),%r12d
+ xorl %edi,%ebx
+ roll $5,%ecx
+ addl %eax,%r12d
+ roll $1,%edx
+ andl %esi,%ebx
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 40(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,36(%rsp)
+ movl %edi,%ebx
+ xorl 48(%rsp),%ebp
+ andl %esi,%eax
+ movl %r12d,%ecx
+ xorl 8(%rsp),%ebp
+ leal -1894007588(%rdx,%r11,1),%r11d
+ xorl %esi,%ebx
+ roll $5,%ecx
+ addl %eax,%r11d
+ roll $1,%ebp
+ andl %r13d,%ebx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 44(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,40(%rsp)
+ movl %esi,%ebx
+ xorl 52(%rsp),%r14d
+ andl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 12(%rsp),%r14d
+ leal -1894007588(%rbp,%rdi,1),%edi
+ xorl %r13d,%ebx
+ roll $5,%ecx
+ addl %eax,%edi
+ roll $1,%r14d
+ andl %r12d,%ebx
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 48(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,44(%rsp)
+ movl %r13d,%ebx
+ xorl 56(%rsp),%edx
+ andl %r12d,%eax
+ movl %edi,%ecx
+ xorl 16(%rsp),%edx
+ leal -1894007588(%r14,%rsi,1),%esi
+ xorl %r12d,%ebx
+ roll $5,%ecx
+ addl %eax,%esi
+ roll $1,%edx
+ andl %r11d,%ebx
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 52(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,48(%rsp)
+ movl %esi,%ecx
+ xorl 60(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%ebp
+ leal -899497514(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ xorl 56(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,52(%rsp)
+ movl %r13d,%ecx
+ xorl 0(%rsp),%r14d
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 24(%rsp),%r14d
+ leal -899497514(%rbp,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%r14d
+ xorl 60(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,56(%rsp)
+ movl %r12d,%ecx
+ xorl 4(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 28(%rsp),%edx
+ leal -899497514(%r14,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ xorl 0(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,60(%rsp)
+ movl %r11d,%ecx
+ xorl 8(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 32(%rsp),%ebp
+ leal -899497514(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ xorl 4(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,0(%rsp)
+ movl %edi,%ecx
+ xorl 12(%rsp),%r14d
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%r14d
+ leal -899497514(%rbp,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%r14d
+ xorl 8(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,4(%rsp)
+ movl %esi,%ecx
+ xorl 16(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%edx
+ leal -899497514(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ xorl 12(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,8(%rsp)
+ movl %r13d,%ecx
+ xorl 20(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%ebp
+ leal -899497514(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ xorl 16(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,12(%rsp)
+ movl %r12d,%ecx
+ xorl 24(%rsp),%r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 48(%rsp),%r14d
+ leal -899497514(%rbp,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%r14d
+ xorl 20(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,16(%rsp)
+ movl %r11d,%ecx
+ xorl 28(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%edx
+ leal -899497514(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ xorl 24(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,20(%rsp)
+ movl %edi,%ecx
+ xorl 32(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%ebp
+ leal -899497514(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ xorl 28(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,24(%rsp)
+ movl %esi,%ecx
+ xorl 36(%rsp),%r14d
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%r14d
+ leal -899497514(%rbp,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%r14d
+ xorl 32(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,28(%rsp)
+ movl %r13d,%ecx
+ xorl 40(%rsp),%edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 0(%rsp),%edx
+ leal -899497514(%r14,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%edx
+ xorl 36(%rsp),%ebp
+ movl %r13d,%eax
+
+ movl %r12d,%ecx
+ xorl 44(%rsp),%ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 4(%rsp),%ebp
+ leal -899497514(%rdx,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%ebp
+ xorl 40(%rsp),%r14d
+ movl %r12d,%eax
+
+ movl %r11d,%ecx
+ xorl 48(%rsp),%r14d
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 8(%rsp),%r14d
+ leal -899497514(%rbp,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%r14d
+ xorl 44(%rsp),%edx
+ movl %r11d,%eax
+
+ movl %edi,%ecx
+ xorl 52(%rsp),%edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 12(%rsp),%edx
+ leal -899497514(%r14,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%edx
+ xorl 48(%rsp),%ebp
+ movl %edi,%eax
+
+ movl %esi,%ecx
+ xorl 56(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 16(%rsp),%ebp
+ leal -899497514(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ xorl 52(%rsp),%r14d
+ movl %esi,%eax
+
+ movl %r13d,%ecx
+ xorl 60(%rsp),%r14d
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%r14d
+ leal -899497514(%rbp,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%r14d
+ xorl 56(%rsp),%edx
+ movl %r13d,%eax
+
+ movl %r12d,%ecx
+ xorl 0(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 24(%rsp),%edx
+ leal -899497514(%r14,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ xorl 60(%rsp),%ebp
+ movl %r12d,%eax
+
+ movl %r11d,%ecx
+ xorl 4(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 28(%rsp),%ebp
+ leal -899497514(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ movl %r11d,%eax
+ movl %edi,%ecx
+ xorl %r13d,%eax
+ leal -899497514(%rbp,%rsi,1),%esi
+ roll $5,%ecx
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ addl 0(%r8),%esi
+ addl 4(%r8),%edi
+ addl 8(%r8),%r11d
+ addl 12(%r8),%r12d
+ addl 16(%r8),%r13d
+ movl %esi,0(%r8)
+ movl %edi,4(%r8)
+ movl %r11d,8(%r8)
+ movl %r12d,12(%r8)
+ movl %r13d,16(%r8)
+
+ subq $1,%r10
+ leaq 64(%r9),%r9
+ jnz L$loop
+
+ movq 64(%rsp),%rsi
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
+L$epilogue:
+ ret
+
+
+.globl _sha1_block_data_order_hw
+.private_extern _sha1_block_data_order_hw
+
+.p2align 5
+_sha1_block_data_order_hw:
+
+_CET_ENDBR
+ movdqu (%rdi),%xmm0
+ movd 16(%rdi),%xmm1
+ movdqa K_XX_XX+160(%rip),%xmm3
+
+ movdqu (%rsi),%xmm4
+ pshufd $27,%xmm0,%xmm0
+ movdqu 16(%rsi),%xmm5
+ pshufd $27,%xmm1,%xmm1
+ movdqu 32(%rsi),%xmm6
+.byte 102,15,56,0,227
+ movdqu 48(%rsi),%xmm7
+.byte 102,15,56,0,235
+.byte 102,15,56,0,243
+ movdqa %xmm1,%xmm9
+.byte 102,15,56,0,251
+ jmp L$oop_shaext
+
+.p2align 4
+L$oop_shaext:
+ decq %rdx
+ leaq 64(%rsi),%r8
+ paddd %xmm4,%xmm1
+ cmovneq %r8,%rsi
+ prefetcht0 512(%rsi)
+ movdqa %xmm0,%xmm8
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+ movdqu (%rsi),%xmm4
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,213
+ movdqu 16(%rsi),%xmm5
+.byte 102,15,56,0,227
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,206
+ movdqu 32(%rsi),%xmm6
+.byte 102,15,56,0,235
+
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,215
+ movdqu 48(%rsi),%xmm7
+.byte 102,15,56,0,243
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 65,15,56,200,201
+.byte 102,15,56,0,251
+
+ paddd %xmm8,%xmm0
+ movdqa %xmm1,%xmm9
+
+ jnz L$oop_shaext
+
+ pshufd $27,%xmm0,%xmm0
+ pshufd $27,%xmm1,%xmm1
+ movdqu %xmm0,(%rdi)
+ movd %xmm1,16(%rdi)
+ ret
+
+
+.globl _sha1_block_data_order_ssse3
+.private_extern _sha1_block_data_order_ssse3
+
+.p2align 4
+_sha1_block_data_order_ssse3:
+
+_CET_ENDBR
+ movq %rsp,%r11
+
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ leaq -64(%rsp),%rsp
+ andq $-64,%rsp
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ shlq $6,%r10
+ addq %r9,%r10
+ leaq K_XX_XX+64(%rip),%r14
+
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl %ebx,%esi
+ movl 16(%r8),%ebp
+ movl %ecx,%edi
+ xorl %edx,%edi
+ andl %edi,%esi
+
+ movdqa 64(%r14),%xmm6
+ movdqa -64(%r14),%xmm9
+ movdqu 0(%r9),%xmm0
+ movdqu 16(%r9),%xmm1
+ movdqu 32(%r9),%xmm2
+ movdqu 48(%r9),%xmm3
+.byte 102,15,56,0,198
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+ addq $64,%r9
+ paddd %xmm9,%xmm0
+.byte 102,15,56,0,222
+ paddd %xmm9,%xmm1
+ paddd %xmm9,%xmm2
+ movdqa %xmm0,0(%rsp)
+ psubd %xmm9,%xmm0
+ movdqa %xmm1,16(%rsp)
+ psubd %xmm9,%xmm1
+ movdqa %xmm2,32(%rsp)
+ psubd %xmm9,%xmm2
+ jmp L$oop_ssse3
+.p2align 4
+L$oop_ssse3:
+ rorl $2,%ebx
+ pshufd $238,%xmm0,%xmm4
+ xorl %edx,%esi
+ movdqa %xmm3,%xmm8
+ paddd %xmm3,%xmm9
+ movl %eax,%edi
+ addl 0(%rsp),%ebp
+ punpcklqdq %xmm1,%xmm4
+ xorl %ecx,%ebx
+ roll $5,%eax
+ addl %esi,%ebp
+ psrldq $4,%xmm8
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ pxor %xmm0,%xmm4
+ addl %eax,%ebp
+ rorl $7,%eax
+ pxor %xmm2,%xmm8
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 4(%rsp),%edx
+ pxor %xmm8,%xmm4
+ xorl %ebx,%eax
+ roll $5,%ebp
+ movdqa %xmm9,48(%rsp)
+ addl %edi,%edx
+ andl %eax,%esi
+ movdqa %xmm4,%xmm10
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ rorl $7,%ebp
+ movdqa %xmm4,%xmm8
+ xorl %ebx,%esi
+ pslldq $12,%xmm10
+ paddd %xmm4,%xmm4
+ movl %edx,%edi
+ addl 8(%rsp),%ecx
+ psrld $31,%xmm8
+ xorl %eax,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ movdqa %xmm10,%xmm9
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ psrld $30,%xmm10
+ addl %edx,%ecx
+ rorl $7,%edx
+ por %xmm8,%xmm4
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 12(%rsp),%ebx
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm4
+ xorl %ebp,%edx
+ movdqa -64(%r14),%xmm10
+ roll $5,%ecx
+ addl %edi,%ebx
+ andl %edx,%esi
+ pxor %xmm9,%xmm4
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ pshufd $238,%xmm1,%xmm5
+ xorl %ebp,%esi
+ movdqa %xmm4,%xmm9
+ paddd %xmm4,%xmm10
+ movl %ebx,%edi
+ addl 16(%rsp),%eax
+ punpcklqdq %xmm2,%xmm5
+ xorl %edx,%ecx
+ roll $5,%ebx
+ addl %esi,%eax
+ psrldq $4,%xmm9
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ pxor %xmm1,%xmm5
+ addl %ebx,%eax
+ rorl $7,%ebx
+ pxor %xmm3,%xmm9
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 20(%rsp),%ebp
+ pxor %xmm9,%xmm5
+ xorl %ecx,%ebx
+ roll $5,%eax
+ movdqa %xmm10,0(%rsp)
+ addl %edi,%ebp
+ andl %ebx,%esi
+ movdqa %xmm5,%xmm8
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ rorl $7,%eax
+ movdqa %xmm5,%xmm9
+ xorl %ecx,%esi
+ pslldq $12,%xmm8
+ paddd %xmm5,%xmm5
+ movl %ebp,%edi
+ addl 24(%rsp),%edx
+ psrld $31,%xmm9
+ xorl %ebx,%eax
+ roll $5,%ebp
+ addl %esi,%edx
+ movdqa %xmm8,%xmm10
+ andl %eax,%edi
+ xorl %ebx,%eax
+ psrld $30,%xmm8
+ addl %ebp,%edx
+ rorl $7,%ebp
+ por %xmm9,%xmm5
+ xorl %ebx,%edi
+ movl %edx,%esi
+ addl 28(%rsp),%ecx
+ pslld $2,%xmm10
+ pxor %xmm8,%xmm5
+ xorl %eax,%ebp
+ movdqa -32(%r14),%xmm8
+ roll $5,%edx
+ addl %edi,%ecx
+ andl %ebp,%esi
+ pxor %xmm10,%xmm5
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ rorl $7,%edx
+ pshufd $238,%xmm2,%xmm6
+ xorl %eax,%esi
+ movdqa %xmm5,%xmm10
+ paddd %xmm5,%xmm8
+ movl %ecx,%edi
+ addl 32(%rsp),%ebx
+ punpcklqdq %xmm3,%xmm6
+ xorl %ebp,%edx
+ roll $5,%ecx
+ addl %esi,%ebx
+ psrldq $4,%xmm10
+ andl %edx,%edi
+ xorl %ebp,%edx
+ pxor %xmm2,%xmm6
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ pxor %xmm4,%xmm10
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ addl 36(%rsp),%eax
+ pxor %xmm10,%xmm6
+ xorl %edx,%ecx
+ roll $5,%ebx
+ movdqa %xmm8,16(%rsp)
+ addl %edi,%eax
+ andl %ecx,%esi
+ movdqa %xmm6,%xmm9
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ rorl $7,%ebx
+ movdqa %xmm6,%xmm10
+ xorl %edx,%esi
+ pslldq $12,%xmm9
+ paddd %xmm6,%xmm6
+ movl %eax,%edi
+ addl 40(%rsp),%ebp
+ psrld $31,%xmm10
+ xorl %ecx,%ebx
+ roll $5,%eax
+ addl %esi,%ebp
+ movdqa %xmm9,%xmm8
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ psrld $30,%xmm9
+ addl %eax,%ebp
+ rorl $7,%eax
+ por %xmm10,%xmm6
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 44(%rsp),%edx
+ pslld $2,%xmm8
+ pxor %xmm9,%xmm6
+ xorl %ebx,%eax
+ movdqa -32(%r14),%xmm9
+ roll $5,%ebp
+ addl %edi,%edx
+ andl %eax,%esi
+ pxor %xmm8,%xmm6
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ rorl $7,%ebp
+ pshufd $238,%xmm3,%xmm7
+ xorl %ebx,%esi
+ movdqa %xmm6,%xmm8
+ paddd %xmm6,%xmm9
+ movl %edx,%edi
+ addl 48(%rsp),%ecx
+ punpcklqdq %xmm4,%xmm7
+ xorl %eax,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ psrldq $4,%xmm8
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ pxor %xmm3,%xmm7
+ addl %edx,%ecx
+ rorl $7,%edx
+ pxor %xmm5,%xmm8
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 52(%rsp),%ebx
+ pxor %xmm8,%xmm7
+ xorl %ebp,%edx
+ roll $5,%ecx
+ movdqa %xmm9,32(%rsp)
+ addl %edi,%ebx
+ andl %edx,%esi
+ movdqa %xmm7,%xmm10
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ movdqa %xmm7,%xmm8
+ xorl %ebp,%esi
+ pslldq $12,%xmm10
+ paddd %xmm7,%xmm7
+ movl %ebx,%edi
+ addl 56(%rsp),%eax
+ psrld $31,%xmm8
+ xorl %edx,%ecx
+ roll $5,%ebx
+ addl %esi,%eax
+ movdqa %xmm10,%xmm9
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ psrld $30,%xmm10
+ addl %ebx,%eax
+ rorl $7,%ebx
+ por %xmm8,%xmm7
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 60(%rsp),%ebp
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm7
+ xorl %ecx,%ebx
+ movdqa -32(%r14),%xmm10
+ roll $5,%eax
+ addl %edi,%ebp
+ andl %ebx,%esi
+ pxor %xmm9,%xmm7
+ pshufd $238,%xmm6,%xmm9
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ rorl $7,%eax
+ pxor %xmm4,%xmm0
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ addl 0(%rsp),%edx
+ punpcklqdq %xmm7,%xmm9
+ xorl %ebx,%eax
+ roll $5,%ebp
+ pxor %xmm1,%xmm0
+ addl %esi,%edx
+ andl %eax,%edi
+ movdqa %xmm10,%xmm8
+ xorl %ebx,%eax
+ paddd %xmm7,%xmm10
+ addl %ebp,%edx
+ pxor %xmm9,%xmm0
+ rorl $7,%ebp
+ xorl %ebx,%edi
+ movl %edx,%esi
+ addl 4(%rsp),%ecx
+ movdqa %xmm0,%xmm9
+ xorl %eax,%ebp
+ roll $5,%edx
+ movdqa %xmm10,48(%rsp)
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ pslld $2,%xmm0
+ addl %edx,%ecx
+ rorl $7,%edx
+ psrld $30,%xmm9
+ xorl %eax,%esi
+ movl %ecx,%edi
+ addl 8(%rsp),%ebx
+ por %xmm9,%xmm0
+ xorl %ebp,%edx
+ roll $5,%ecx
+ pshufd $238,%xmm7,%xmm10
+ addl %esi,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ pxor %xmm5,%xmm1
+ addl 16(%rsp),%ebp
+ xorl %ecx,%esi
+ punpcklqdq %xmm0,%xmm10
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm2,%xmm1
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ movdqa %xmm8,%xmm9
+ rorl $7,%ebx
+ paddd %xmm0,%xmm8
+ addl %eax,%ebp
+ pxor %xmm10,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm1,%xmm10
+ addl %edi,%edx
+ xorl %ebx,%esi
+ movdqa %xmm8,0(%rsp)
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 24(%rsp),%ecx
+ pslld $2,%xmm1
+ xorl %eax,%esi
+ movl %edx,%edi
+ psrld $30,%xmm10
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
+ por %xmm10,%xmm1
+ addl %edx,%ecx
+ addl 28(%rsp),%ebx
+ pshufd $238,%xmm0,%xmm8
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ pxor %xmm6,%xmm2
+ addl 32(%rsp),%eax
+ xorl %edx,%esi
+ punpcklqdq %xmm1,%xmm8
+ movl %ebx,%edi
+ roll $5,%ebx
+ pxor %xmm3,%xmm2
+ addl %esi,%eax
+ xorl %edx,%edi
+ movdqa 0(%r14),%xmm10
+ rorl $7,%ecx
+ paddd %xmm1,%xmm9
+ addl %ebx,%eax
+ pxor %xmm8,%xmm2
+ addl 36(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ movdqa %xmm2,%xmm8
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ movdqa %xmm9,16(%rsp)
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 40(%rsp),%edx
+ pslld $2,%xmm2
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ psrld $30,%xmm8
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
+ por %xmm8,%xmm2
+ addl %ebp,%edx
+ addl 44(%rsp),%ecx
+ pshufd $238,%xmm1,%xmm9
+ xorl %eax,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ pxor %xmm7,%xmm3
+ addl 48(%rsp),%ebx
+ xorl %ebp,%esi
+ punpcklqdq %xmm2,%xmm9
+ movl %ecx,%edi
+ roll $5,%ecx
+ pxor %xmm4,%xmm3
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ movdqa %xmm10,%xmm8
+ rorl $7,%edx
+ paddd %xmm2,%xmm10
+ addl %ecx,%ebx
+ pxor %xmm9,%xmm3
+ addl 52(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ movdqa %xmm3,%xmm9
+ addl %edi,%eax
+ xorl %edx,%esi
+ movdqa %xmm10,32(%rsp)
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 56(%rsp),%ebp
+ pslld $2,%xmm3
+ xorl %ecx,%esi
+ movl %eax,%edi
+ psrld $30,%xmm9
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ rorl $7,%ebx
+ por %xmm9,%xmm3
+ addl %eax,%ebp
+ addl 60(%rsp),%edx
+ pshufd $238,%xmm2,%xmm10
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ pxor %xmm0,%xmm4
+ addl 0(%rsp),%ecx
+ xorl %eax,%esi
+ punpcklqdq %xmm3,%xmm10
+ movl %edx,%edi
+ roll $5,%edx
+ pxor %xmm5,%xmm4
+ addl %esi,%ecx
+ xorl %eax,%edi
+ movdqa %xmm8,%xmm9
+ rorl $7,%ebp
+ paddd %xmm3,%xmm8
+ addl %edx,%ecx
+ pxor %xmm10,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ movdqa %xmm4,%xmm10
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ movdqa %xmm8,48(%rsp)
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 8(%rsp),%eax
+ pslld $2,%xmm4
+ xorl %edx,%esi
+ movl %ebx,%edi
+ psrld $30,%xmm10
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ rorl $7,%ecx
+ por %xmm10,%xmm4
+ addl %ebx,%eax
+ addl 12(%rsp),%ebp
+ pshufd $238,%xmm3,%xmm8
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ pxor %xmm1,%xmm5
+ addl 16(%rsp),%edx
+ xorl %ebx,%esi
+ punpcklqdq %xmm4,%xmm8
+ movl %ebp,%edi
+ roll $5,%ebp
+ pxor %xmm6,%xmm5
+ addl %esi,%edx
+ xorl %ebx,%edi
+ movdqa %xmm9,%xmm10
+ rorl $7,%eax
+ paddd %xmm4,%xmm9
+ addl %ebp,%edx
+ pxor %xmm8,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ movdqa %xmm5,%xmm8
+ addl %edi,%ecx
+ xorl %eax,%esi
+ movdqa %xmm9,0(%rsp)
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 24(%rsp),%ebx
+ pslld $2,%xmm5
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ psrld $30,%xmm8
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ por %xmm8,%xmm5
+ addl %ecx,%ebx
+ addl 28(%rsp),%eax
+ pshufd $238,%xmm4,%xmm9
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ pxor %xmm2,%xmm6
+ addl 32(%rsp),%ebp
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ punpcklqdq %xmm5,%xmm9
+ movl %eax,%edi
+ xorl %ecx,%esi
+ pxor %xmm7,%xmm6
+ roll $5,%eax
+ addl %esi,%ebp
+ movdqa %xmm10,%xmm8
+ xorl %ebx,%edi
+ paddd %xmm5,%xmm10
+ xorl %ecx,%ebx
+ pxor %xmm9,%xmm6
+ addl %eax,%ebp
+ addl 36(%rsp),%edx
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movdqa %xmm6,%xmm9
+ movl %ebp,%esi
+ xorl %ebx,%edi
+ movdqa %xmm10,16(%rsp)
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ pslld $2,%xmm6
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ psrld $30,%xmm9
+ addl 40(%rsp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ por %xmm9,%xmm6
+ rorl $7,%ebp
+ movl %edx,%edi
+ xorl %eax,%esi
+ roll $5,%edx
+ pshufd $238,%xmm5,%xmm10
+ addl %esi,%ecx
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 44(%rsp),%ebx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ rorl $7,%edx
+ movl %ecx,%esi
+ xorl %ebp,%edi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ pxor %xmm3,%xmm7
+ addl 48(%rsp),%eax
+ andl %edx,%esi
+ xorl %ebp,%edx
+ rorl $7,%ecx
+ punpcklqdq %xmm6,%xmm10
+ movl %ebx,%edi
+ xorl %edx,%esi
+ pxor %xmm0,%xmm7
+ roll $5,%ebx
+ addl %esi,%eax
+ movdqa 32(%r14),%xmm9
+ xorl %ecx,%edi
+ paddd %xmm6,%xmm8
+ xorl %edx,%ecx
+ pxor %xmm10,%xmm7
+ addl %ebx,%eax
+ addl 52(%rsp),%ebp
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movdqa %xmm7,%xmm10
+ movl %eax,%esi
+ xorl %ecx,%edi
+ movdqa %xmm8,32(%rsp)
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ pslld $2,%xmm7
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ psrld $30,%xmm10
+ addl 56(%rsp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ por %xmm10,%xmm7
+ rorl $7,%eax
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ roll $5,%ebp
+ pshufd $238,%xmm6,%xmm8
+ addl %esi,%edx
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 60(%rsp),%ecx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ rorl $7,%ebp
+ movl %edx,%esi
+ xorl %eax,%edi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ pxor %xmm4,%xmm0
+ addl 0(%rsp),%ebx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ rorl $7,%edx
+ punpcklqdq %xmm7,%xmm8
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ pxor %xmm1,%xmm0
+ roll $5,%ecx
+ addl %esi,%ebx
+ movdqa %xmm9,%xmm10
+ xorl %edx,%edi
+ paddd %xmm7,%xmm9
+ xorl %ebp,%edx
+ pxor %xmm8,%xmm0
+ addl %ecx,%ebx
+ addl 4(%rsp),%eax
+ andl %edx,%edi
+ xorl %ebp,%edx
+ rorl $7,%ecx
+ movdqa %xmm0,%xmm8
+ movl %ebx,%esi
+ xorl %edx,%edi
+ movdqa %xmm9,48(%rsp)
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ pslld $2,%xmm0
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ psrld $30,%xmm8
+ addl 8(%rsp),%ebp
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ por %xmm8,%xmm0
+ rorl $7,%ebx
+ movl %eax,%edi
+ xorl %ecx,%esi
+ roll $5,%eax
+ pshufd $238,%xmm7,%xmm9
+ addl %esi,%ebp
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 12(%rsp),%edx
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movl %ebp,%esi
+ xorl %ebx,%edi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ pxor %xmm5,%xmm1
+ addl 16(%rsp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ rorl $7,%ebp
+ punpcklqdq %xmm0,%xmm9
+ movl %edx,%edi
+ xorl %eax,%esi
+ pxor %xmm2,%xmm1
+ roll $5,%edx
+ addl %esi,%ecx
+ movdqa %xmm10,%xmm8
+ xorl %ebp,%edi
+ paddd %xmm0,%xmm10
+ xorl %eax,%ebp
+ pxor %xmm9,%xmm1
+ addl %edx,%ecx
+ addl 20(%rsp),%ebx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ rorl $7,%edx
+ movdqa %xmm1,%xmm9
+ movl %ecx,%esi
+ xorl %ebp,%edi
+ movdqa %xmm10,0(%rsp)
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ pslld $2,%xmm1
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ psrld $30,%xmm9
+ addl 24(%rsp),%eax
+ andl %edx,%esi
+ xorl %ebp,%edx
+ por %xmm9,%xmm1
+ rorl $7,%ecx
+ movl %ebx,%edi
+ xorl %edx,%esi
+ roll $5,%ebx
+ pshufd $238,%xmm0,%xmm10
+ addl %esi,%eax
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%rsp),%ebp
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movl %eax,%esi
+ xorl %ecx,%edi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ pxor %xmm6,%xmm2
+ addl 32(%rsp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ punpcklqdq %xmm1,%xmm10
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ pxor %xmm3,%xmm2
+ roll $5,%ebp
+ addl %esi,%edx
+ movdqa %xmm8,%xmm9
+ xorl %eax,%edi
+ paddd %xmm1,%xmm8
+ xorl %ebx,%eax
+ pxor %xmm10,%xmm2
+ addl %ebp,%edx
+ addl 36(%rsp),%ecx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ rorl $7,%ebp
+ movdqa %xmm2,%xmm10
+ movl %edx,%esi
+ xorl %eax,%edi
+ movdqa %xmm8,16(%rsp)
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ pslld $2,%xmm2
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ psrld $30,%xmm10
+ addl 40(%rsp),%ebx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ por %xmm10,%xmm2
+ rorl $7,%edx
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ roll $5,%ecx
+ pshufd $238,%xmm1,%xmm8
+ addl %esi,%ebx
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 44(%rsp),%eax
+ andl %edx,%edi
+ xorl %ebp,%edx
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ pxor %xmm7,%xmm3
+ addl 48(%rsp),%ebp
+ xorl %ecx,%esi
+ punpcklqdq %xmm2,%xmm8
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm4,%xmm3
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ movdqa %xmm9,%xmm10
+ rorl $7,%ebx
+ paddd %xmm2,%xmm9
+ addl %eax,%ebp
+ pxor %xmm8,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm3,%xmm8
+ addl %edi,%edx
+ xorl %ebx,%esi
+ movdqa %xmm9,32(%rsp)
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 56(%rsp),%ecx
+ pslld $2,%xmm3
+ xorl %eax,%esi
+ movl %edx,%edi
+ psrld $30,%xmm8
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
+ por %xmm8,%xmm3
+ addl %edx,%ecx
+ addl 60(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 0(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ paddd %xmm3,%xmm10
+ addl %esi,%eax
+ xorl %edx,%edi
+ movdqa %xmm10,48(%rsp)
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 4(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 8(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 12(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ cmpq %r10,%r9
+ je L$done_ssse3
+ movdqa 64(%r14),%xmm6
+ movdqa -64(%r14),%xmm9
+ movdqu 0(%r9),%xmm0
+ movdqu 16(%r9),%xmm1
+ movdqu 32(%r9),%xmm2
+ movdqu 48(%r9),%xmm3
+.byte 102,15,56,0,198
+ addq $64,%r9
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+.byte 102,15,56,0,206
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ paddd %xmm9,%xmm0
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ movdqa %xmm0,0(%rsp)
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ psubd %xmm9,%xmm0
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+.byte 102,15,56,0,214
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
+ paddd %xmm9,%xmm1
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ movdqa %xmm1,16(%rsp)
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ rorl $7,%edx
+ psubd %xmm9,%xmm1
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+.byte 102,15,56,0,222
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
+ paddd %xmm9,%xmm2
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ movdqa %xmm2,32(%rsp)
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ psubd %xmm9,%xmm2
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ addl 12(%r8),%edx
+ movl %eax,0(%r8)
+ addl 16(%r8),%ebp
+ movl %esi,4(%r8)
+ movl %esi,%ebx
+ movl %ecx,8(%r8)
+ movl %ecx,%edi
+ movl %edx,12(%r8)
+ xorl %edx,%edi
+ movl %ebp,16(%r8)
+ andl %edi,%esi
+ jmp L$oop_ssse3
+
+.p2align 4
+L$done_ssse3:
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ movl %eax,0(%r8)
+ addl 12(%r8),%edx
+ movl %esi,4(%r8)
+ addl 16(%r8),%ebp
+ movl %ecx,8(%r8)
+ movl %edx,12(%r8)
+ movl %ebp,16(%r8)
+ movq -40(%r11),%r14
+
+ movq -32(%r11),%r13
+
+ movq -24(%r11),%r12
+
+ movq -16(%r11),%rbp
+
+ movq -8(%r11),%rbx
+
+ leaq (%r11),%rsp
+
+L$epilogue_ssse3:
+ ret
+
+
+.globl _sha1_block_data_order_avx
+.private_extern _sha1_block_data_order_avx
+
+.p2align 4
+_sha1_block_data_order_avx:
+
+_CET_ENDBR
+ movq %rsp,%r11
+
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ leaq -64(%rsp),%rsp
+ vzeroupper
+ andq $-64,%rsp
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ shlq $6,%r10
+ addq %r9,%r10
+ leaq K_XX_XX+64(%rip),%r14
+
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl %ebx,%esi
+ movl 16(%r8),%ebp
+ movl %ecx,%edi
+ xorl %edx,%edi
+ andl %edi,%esi
+
+ vmovdqa 64(%r14),%xmm6
+ vmovdqa -64(%r14),%xmm11
+ vmovdqu 0(%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r9
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm11,%xmm0,%xmm4
+ vpaddd %xmm11,%xmm1,%xmm5
+ vpaddd %xmm11,%xmm2,%xmm6
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ jmp L$oop_avx
+.p2align 4
+L$oop_avx:
+ shrdl $2,%ebx,%ebx
+ xorl %edx,%esi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ movl %eax,%edi
+ addl 0(%rsp),%ebp
+ vpaddd %xmm3,%xmm11,%xmm9
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrldq $4,%xmm3,%xmm8
+ addl %esi,%ebp
+ andl %ebx,%edi
+ vpxor %xmm0,%xmm4,%xmm4
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm2,%xmm8,%xmm8
+ shrdl $7,%eax,%eax
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 4(%rsp),%edx
+ vpxor %xmm8,%xmm4,%xmm4
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vmovdqa %xmm9,48(%rsp)
+ addl %edi,%edx
+ andl %eax,%esi
+ vpsrld $31,%xmm4,%xmm8
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%esi
+ vpslldq $12,%xmm4,%xmm10
+ vpaddd %xmm4,%xmm4,%xmm4
+ movl %edx,%edi
+ addl 8(%rsp),%ecx
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm4,%xmm4
+ addl %esi,%ecx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm4,%xmm4
+ shrdl $7,%edx,%edx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 12(%rsp),%ebx
+ vpxor %xmm10,%xmm4,%xmm4
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ andl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%esi
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ movl %ebx,%edi
+ addl 16(%rsp),%eax
+ vpaddd %xmm4,%xmm11,%xmm9
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrldq $4,%xmm4,%xmm8
+ addl %esi,%eax
+ andl %ecx,%edi
+ vpxor %xmm1,%xmm5,%xmm5
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm3,%xmm8,%xmm8
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 20(%rsp),%ebp
+ vpxor %xmm8,%xmm5,%xmm5
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%ebp
+ andl %ebx,%esi
+ vpsrld $31,%xmm5,%xmm8
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ vpslldq $12,%xmm5,%xmm10
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %ebp,%edi
+ addl 24(%rsp),%edx
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm5,%xmm5
+ addl %esi,%edx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm5,%xmm5
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%edi
+ movl %edx,%esi
+ addl 28(%rsp),%ecx
+ vpxor %xmm10,%xmm5,%xmm5
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vmovdqa -32(%r14),%xmm11
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ movl %ecx,%edi
+ addl 32(%rsp),%ebx
+ vpaddd %xmm5,%xmm11,%xmm9
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ vpsrldq $4,%xmm5,%xmm8
+ addl %esi,%ebx
+ andl %edx,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ vpxor %xmm4,%xmm8,%xmm8
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ addl 36(%rsp),%eax
+ vpxor %xmm8,%xmm6,%xmm6
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vmovdqa %xmm9,16(%rsp)
+ addl %edi,%eax
+ andl %ecx,%esi
+ vpsrld $31,%xmm6,%xmm8
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%esi
+ vpslldq $12,%xmm6,%xmm10
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%edi
+ addl 40(%rsp),%ebp
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm6,%xmm6
+ addl %esi,%ebp
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm6,%xmm6
+ shrdl $7,%eax,%eax
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 44(%rsp),%edx
+ vpxor %xmm10,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%esi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ movl %edx,%edi
+ addl 48(%rsp),%ecx
+ vpaddd %xmm6,%xmm11,%xmm9
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpsrldq $4,%xmm6,%xmm8
+ addl %esi,%ecx
+ andl %ebp,%edi
+ vpxor %xmm3,%xmm7,%xmm7
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpxor %xmm5,%xmm8,%xmm8
+ shrdl $7,%edx,%edx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 52(%rsp),%ebx
+ vpxor %xmm8,%xmm7,%xmm7
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%ebx
+ andl %edx,%esi
+ vpsrld $31,%xmm7,%xmm8
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%esi
+ vpslldq $12,%xmm7,%xmm10
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%edi
+ addl 56(%rsp),%eax
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm7,%xmm7
+ addl %esi,%eax
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm7,%xmm7
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 60(%rsp),%ebp
+ vpxor %xmm10,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ addl 0(%rsp),%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm7,%xmm11,%xmm9
+ addl %esi,%edx
+ andl %eax,%edi
+ vpxor %xmm8,%xmm0,%xmm0
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%edi
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ movl %edx,%esi
+ addl 4(%rsp),%ecx
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpslld $2,%xmm0,%xmm0
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ addl 8(%rsp),%ebx
+ vpor %xmm8,%xmm0,%xmm0
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ vpaddd %xmm0,%xmm11,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm8,%xmm1,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm1,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpor %xmm8,%xmm1,%xmm1
+ addl 28(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ addl %esi,%eax
+ xorl %edx,%edi
+ vpaddd %xmm1,%xmm11,%xmm9
+ vmovdqa 0(%r14),%xmm11
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm8,%xmm2,%xmm2
+ addl 36(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpor %xmm8,%xmm2,%xmm2
+ addl 44(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ vpaddd %xmm2,%xmm11,%xmm9
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ vpxor %xmm0,%xmm4,%xmm4
+ addl 0(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %esi,%ecx
+ xorl %eax,%edi
+ vpaddd %xmm3,%xmm11,%xmm9
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpxor %xmm8,%xmm4,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpor %xmm8,%xmm4,%xmm4
+ addl 12(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpxor %xmm6,%xmm5,%xmm5
+ addl %esi,%edx
+ xorl %ebx,%edi
+ vpaddd %xmm4,%xmm11,%xmm9
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpxor %xmm8,%xmm5,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpor %xmm8,%xmm5,%xmm5
+ addl 28(%rsp),%eax
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ vpxor %xmm2,%xmm6,%xmm6
+ addl 32(%rsp),%ebp
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %eax,%edi
+ xorl %ecx,%esi
+ vpaddd %xmm5,%xmm11,%xmm9
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ vpxor %xmm8,%xmm6,%xmm6
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 36(%rsp),%edx
+ vpsrld $30,%xmm6,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%esi
+ vpslld $2,%xmm6,%xmm6
+ xorl %ebx,%edi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 40(%rsp),%ecx
+ andl %eax,%esi
+ vpor %xmm8,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%edi
+ xorl %eax,%esi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 44(%rsp),%ebx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ xorl %ebp,%edi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ vpxor %xmm3,%xmm7,%xmm7
+ addl 48(%rsp),%eax
+ andl %edx,%esi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ vpxor %xmm0,%xmm7,%xmm7
+ movl %ebx,%edi
+ xorl %edx,%esi
+ vpaddd %xmm6,%xmm11,%xmm9
+ vmovdqa 32(%r14),%xmm11
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vpxor %xmm8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%rsp),%ebp
+ vpsrld $30,%xmm7,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ vpslld $2,%xmm7,%xmm7
+ xorl %ecx,%edi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 56(%rsp),%edx
+ andl %ebx,%esi
+ vpor %xmm8,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 60(%rsp),%ecx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%esi
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ addl 0(%rsp),%ebx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ vpaddd %xmm7,%xmm11,%xmm9
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ vpxor %xmm8,%xmm0,%xmm0
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 4(%rsp),%eax
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ andl %edx,%edi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ vpslld $2,%xmm0,%xmm0
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%rsp),%ebp
+ andl %ecx,%esi
+ vpor %xmm8,%xmm0,%xmm0
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%edi
+ xorl %ecx,%esi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 12(%rsp),%edx
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%esi
+ xorl %ebx,%edi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ vpxor %xmm2,%xmm1,%xmm1
+ movl %edx,%edi
+ xorl %eax,%esi
+ vpaddd %xmm0,%xmm11,%xmm9
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vpxor %xmm8,%xmm1,%xmm1
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 20(%rsp),%ebx
+ vpsrld $30,%xmm1,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ vpslld $2,%xmm1,%xmm1
+ xorl %ebp,%edi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 24(%rsp),%eax
+ andl %edx,%esi
+ vpor %xmm8,%xmm1,%xmm1
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%edi
+ xorl %edx,%esi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%rsp),%ebp
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ xorl %ecx,%edi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vpxor %xmm3,%xmm2,%xmm2
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ vpaddd %xmm1,%xmm11,%xmm9
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ vpxor %xmm8,%xmm2,%xmm2
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 36(%rsp),%ecx
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ andl %eax,%edi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%esi
+ vpslld $2,%xmm2,%xmm2
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 40(%rsp),%ebx
+ andl %ebp,%esi
+ vpor %xmm8,%xmm2,%xmm2
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 44(%rsp),%eax
+ andl %edx,%edi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ vpaddd %xmm2,%xmm11,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 0(%rsp),%eax
+ vpaddd %xmm3,%xmm11,%xmm9
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vmovdqa %xmm9,48(%rsp)
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 4(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 8(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 12(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ cmpq %r10,%r9
+ je L$done_avx
+ vmovdqa 64(%r14),%xmm6
+ vmovdqa -64(%r14),%xmm11
+ vmovdqu 0(%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r9
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ vpshufb %xmm6,%xmm1,%xmm1
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm11,%xmm0,%xmm4
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vmovdqa %xmm4,0(%rsp)
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ vpshufb %xmm6,%xmm2,%xmm2
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpaddd %xmm11,%xmm1,%xmm5
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vmovdqa %xmm5,16(%rsp)
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ vpshufb %xmm6,%xmm3,%xmm3
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm11,%xmm2,%xmm6
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vmovdqa %xmm6,32(%rsp)
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ addl 12(%r8),%edx
+ movl %eax,0(%r8)
+ addl 16(%r8),%ebp
+ movl %esi,4(%r8)
+ movl %esi,%ebx
+ movl %ecx,8(%r8)
+ movl %ecx,%edi
+ movl %edx,12(%r8)
+ xorl %edx,%edi
+ movl %ebp,16(%r8)
+ andl %edi,%esi
+ jmp L$oop_avx
+
+.p2align 4
+L$done_avx:
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vzeroupper
+
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ movl %eax,0(%r8)
+ addl 12(%r8),%edx
+ movl %esi,4(%r8)
+ addl 16(%r8),%ebp
+ movl %ecx,8(%r8)
+ movl %edx,12(%r8)
+ movl %ebp,16(%r8)
+ movq -40(%r11),%r14
+
+ movq -32(%r11),%r13
+
+ movq -24(%r11),%r12
+
+ movq -16(%r11),%rbp
+
+ movq -8(%r11),%rbx
+
+ leaq (%r11),%rsp
+
+L$epilogue_avx:
+ ret
+
+
+.globl _sha1_block_data_order_avx2
+.private_extern _sha1_block_data_order_avx2
+
+.p2align 4
+_sha1_block_data_order_avx2:
+
+_CET_ENDBR
+ movq %rsp,%r11
+
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ vzeroupper
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ leaq -640(%rsp),%rsp
+ shlq $6,%r10
+ leaq 64(%r9),%r13
+ andq $-128,%rsp
+ addq %r9,%r10
+ leaq K_XX_XX+64(%rip),%r14
+
+ movl 0(%r8),%eax
+ cmpq %r10,%r13
+ cmovaeq %r9,%r13
+ movl 4(%r8),%ebp
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl 16(%r8),%esi
+ vmovdqu 64(%r14),%ymm6
+
+ vmovdqu (%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ leaq 64(%r9),%r9
+ vinserti128 $1,(%r13),%ymm0,%ymm0
+ vinserti128 $1,16(%r13),%ymm1,%ymm1
+ vpshufb %ymm6,%ymm0,%ymm0
+ vinserti128 $1,32(%r13),%ymm2,%ymm2
+ vpshufb %ymm6,%ymm1,%ymm1
+ vinserti128 $1,48(%r13),%ymm3,%ymm3
+ vpshufb %ymm6,%ymm2,%ymm2
+ vmovdqu -64(%r14),%ymm11
+ vpshufb %ymm6,%ymm3,%ymm3
+
+ vpaddd %ymm11,%ymm0,%ymm4
+ vpaddd %ymm11,%ymm1,%ymm5
+ vmovdqu %ymm4,0(%rsp)
+ vpaddd %ymm11,%ymm2,%ymm6
+ vmovdqu %ymm5,32(%rsp)
+ vpaddd %ymm11,%ymm3,%ymm7
+ vmovdqu %ymm6,64(%rsp)
+ vmovdqu %ymm7,96(%rsp)
+ vpalignr $8,%ymm0,%ymm1,%ymm4
+ vpsrldq $4,%ymm3,%ymm8
+ vpxor %ymm0,%ymm4,%ymm4
+ vpxor %ymm2,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $31,%ymm4,%ymm8
+ vpslldq $12,%ymm4,%ymm10
+ vpaddd %ymm4,%ymm4,%ymm4
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm4,%ymm4
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm4,%ymm4
+ vpxor %ymm10,%ymm4,%ymm4
+ vpaddd %ymm11,%ymm4,%ymm9
+ vmovdqu %ymm9,128(%rsp)
+ vpalignr $8,%ymm1,%ymm2,%ymm5
+ vpsrldq $4,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm3,%ymm8,%ymm8
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $31,%ymm5,%ymm8
+ vmovdqu -32(%r14),%ymm11
+ vpslldq $12,%ymm5,%ymm10
+ vpaddd %ymm5,%ymm5,%ymm5
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm5,%ymm5
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm10,%ymm5,%ymm5
+ vpaddd %ymm11,%ymm5,%ymm9
+ vmovdqu %ymm9,160(%rsp)
+ vpalignr $8,%ymm2,%ymm3,%ymm6
+ vpsrldq $4,%ymm5,%ymm8
+ vpxor %ymm2,%ymm6,%ymm6
+ vpxor %ymm4,%ymm8,%ymm8
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $31,%ymm6,%ymm8
+ vpslldq $12,%ymm6,%ymm10
+ vpaddd %ymm6,%ymm6,%ymm6
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm6,%ymm6
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm6,%ymm6
+ vpxor %ymm10,%ymm6,%ymm6
+ vpaddd %ymm11,%ymm6,%ymm9
+ vmovdqu %ymm9,192(%rsp)
+ vpalignr $8,%ymm3,%ymm4,%ymm7
+ vpsrldq $4,%ymm6,%ymm8
+ vpxor %ymm3,%ymm7,%ymm7
+ vpxor %ymm5,%ymm8,%ymm8
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm7,%ymm8
+ vpslldq $12,%ymm7,%ymm10
+ vpaddd %ymm7,%ymm7,%ymm7
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm7,%ymm7
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm7,%ymm7
+ vpxor %ymm10,%ymm7,%ymm7
+ vpaddd %ymm11,%ymm7,%ymm9
+ vmovdqu %ymm9,224(%rsp)
+ leaq 128(%rsp),%r13
+ jmp L$oop_avx2
+.p2align 5
+L$oop_avx2:
+ rorxl $2,%ebp,%ebx
+ andnl %edx,%ebp,%edi
+ andl %ecx,%ebp
+ xorl %edi,%ebp
+ jmp L$align32_1
+.p2align 5
+L$align32_1:
+ vpalignr $8,%ymm6,%ymm7,%ymm8
+ vpxor %ymm4,%ymm0,%ymm0
+ addl -128(%r13),%esi
+ andnl %ecx,%eax,%edi
+ vpxor %ymm1,%ymm0,%ymm0
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpxor %ymm8,%ymm0,%ymm0
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ vpsrld $30,%ymm0,%ymm8
+ vpslld $2,%ymm0,%ymm0
+ addl -124(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ vpor %ymm8,%ymm0,%ymm0
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -120(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ vpaddd %ymm11,%ymm0,%ymm9
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ vmovdqu %ymm9,256(%rsp)
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -116(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -96(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ vpalignr $8,%ymm7,%ymm0,%ymm8
+ vpxor %ymm5,%ymm1,%ymm1
+ addl -92(%r13),%eax
+ andnl %edx,%ebp,%edi
+ vpxor %ymm2,%ymm1,%ymm1
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ vpxor %ymm8,%ymm1,%ymm1
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ vpsrld $30,%ymm1,%ymm8
+ vpslld $2,%ymm1,%ymm1
+ addl -88(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ vpor %ymm8,%ymm1,%ymm1
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -84(%r13),%edx
+ andnl %ebx,%esi,%edi
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ vmovdqu %ymm9,288(%rsp)
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -64(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -60(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ vpalignr $8,%ymm0,%ymm1,%ymm8
+ vpxor %ymm6,%ymm2,%ymm2
+ addl -56(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ vpxor %ymm3,%ymm2,%ymm2
+ vmovdqu 0(%r14),%ymm11
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpxor %ymm8,%ymm2,%ymm2
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ vpsrld $30,%ymm2,%ymm8
+ vpslld $2,%ymm2,%ymm2
+ addl -52(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ vpor %ymm8,%ymm2,%ymm2
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -32(%r13),%esi
+ andnl %ecx,%eax,%edi
+ vpaddd %ymm11,%ymm2,%ymm9
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ vmovdqu %ymm9,320(%rsp)
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -28(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -24(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ vpalignr $8,%ymm1,%ymm2,%ymm8
+ vpxor %ymm7,%ymm3,%ymm3
+ addl -20(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ vpxor %ymm4,%ymm3,%ymm3
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpxor %ymm8,%ymm3,%ymm3
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ vpsrld $30,%ymm3,%ymm8
+ vpslld $2,%ymm3,%ymm3
+ addl 0(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ vpor %ymm8,%ymm3,%ymm3
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl 4(%r13),%eax
+ andnl %edx,%ebp,%edi
+ vpaddd %ymm11,%ymm3,%ymm9
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ vmovdqu %ymm9,352(%rsp)
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl 8(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl 12(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vpalignr $8,%ymm2,%ymm3,%ymm8
+ vpxor %ymm0,%ymm4,%ymm4
+ addl 32(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpxor %ymm8,%ymm4,%ymm4
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 36(%r13),%ebx
+ vpsrld $30,%ymm4,%ymm8
+ vpslld $2,%ymm4,%ymm4
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vpor %ymm8,%ymm4,%ymm4
+ addl 40(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpaddd %ymm11,%ymm4,%ymm9
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 44(%r13),%eax
+ vmovdqu %ymm9,384(%rsp)
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpalignr $8,%ymm3,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ addl 68(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm6,%ymm5,%ymm5
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ vpxor %ymm8,%ymm5,%ymm5
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 72(%r13),%ecx
+ vpsrld $30,%ymm5,%ymm8
+ vpslld $2,%ymm5,%ymm5
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ vpor %ymm8,%ymm5,%ymm5
+ addl 76(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpaddd %ymm11,%ymm5,%ymm9
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 96(%r13),%ebp
+ vmovdqu %ymm9,416(%rsp)
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 100(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpalignr $8,%ymm4,%ymm5,%ymm8
+ vpxor %ymm2,%ymm6,%ymm6
+ addl 104(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpxor %ymm7,%ymm6,%ymm6
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ vpxor %ymm8,%ymm6,%ymm6
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 108(%r13),%edx
+ leaq 256(%r13),%r13
+ vpsrld $30,%ymm6,%ymm8
+ vpslld $2,%ymm6,%ymm6
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vpor %ymm8,%ymm6,%ymm6
+ addl -128(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpaddd %ymm11,%ymm6,%ymm9
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -124(%r13),%ebx
+ vmovdqu %ymm9,448(%rsp)
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -120(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpalignr $8,%ymm5,%ymm6,%ymm8
+ vpxor %ymm3,%ymm7,%ymm7
+ addl -116(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ vpxor %ymm0,%ymm7,%ymm7
+ vmovdqu 32(%r14),%ymm11
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ vpxor %ymm8,%ymm7,%ymm7
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -96(%r13),%esi
+ vpsrld $30,%ymm7,%ymm8
+ vpslld $2,%ymm7,%ymm7
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpor %ymm8,%ymm7,%ymm7
+ addl -92(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpaddd %ymm11,%ymm7,%ymm9
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -88(%r13),%ecx
+ vmovdqu %ymm9,480(%rsp)
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -84(%r13),%ebx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ jmp L$align32_2
+.p2align 5
+L$align32_2:
+ vpalignr $8,%ymm6,%ymm7,%ymm8
+ vpxor %ymm4,%ymm0,%ymm0
+ addl -64(%r13),%ebp
+ xorl %esi,%ecx
+ vpxor %ymm1,%ymm0,%ymm0
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ vpxor %ymm8,%ymm0,%ymm0
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ vpsrld $30,%ymm0,%ymm8
+ vpslld $2,%ymm0,%ymm0
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -60(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ vpor %ymm8,%ymm0,%ymm0
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ vpaddd %ymm11,%ymm0,%ymm9
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl -56(%r13),%esi
+ xorl %ecx,%ebp
+ vmovdqu %ymm9,512(%rsp)
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl -52(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl -32(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ vpalignr $8,%ymm7,%ymm0,%ymm8
+ vpxor %ymm5,%ymm1,%ymm1
+ addl -28(%r13),%ebx
+ xorl %eax,%edx
+ vpxor %ymm2,%ymm1,%ymm1
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ vpxor %ymm8,%ymm1,%ymm1
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vpsrld $30,%ymm1,%ymm8
+ vpslld $2,%ymm1,%ymm1
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl -24(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ vpor %ymm8,%ymm1,%ymm1
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -20(%r13),%eax
+ xorl %edx,%ebx
+ vmovdqu %ymm9,544(%rsp)
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 0(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl 4(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ vpalignr $8,%ymm0,%ymm1,%ymm8
+ vpxor %ymm6,%ymm2,%ymm2
+ addl 8(%r13),%ecx
+ xorl %ebp,%esi
+ vpxor %ymm3,%ymm2,%ymm2
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ vpxor %ymm8,%ymm2,%ymm2
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpsrld $30,%ymm2,%ymm8
+ vpslld $2,%ymm2,%ymm2
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 12(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ vpor %ymm8,%ymm2,%ymm2
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vpaddd %ymm11,%ymm2,%ymm9
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 32(%r13),%ebp
+ xorl %esi,%ecx
+ vmovdqu %ymm9,576(%rsp)
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 36(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 40(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ vpalignr $8,%ymm1,%ymm2,%ymm8
+ vpxor %ymm7,%ymm3,%ymm3
+ addl 44(%r13),%edx
+ xorl %ebx,%eax
+ vpxor %ymm4,%ymm3,%ymm3
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm8,%ymm3,%ymm3
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ vpsrld $30,%ymm3,%ymm8
+ vpslld $2,%ymm3,%ymm3
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl 64(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ vpor %ymm8,%ymm3,%ymm3
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpaddd %ymm11,%ymm3,%ymm9
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 68(%r13),%ebx
+ xorl %eax,%edx
+ vmovdqu %ymm9,608(%rsp)
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 72(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 76(%r13),%eax
+ xorl %edx,%ebx
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 100(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 104(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 108(%r13),%ebx
+ leaq 256(%r13),%r13
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -128(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -124(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -120(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -116(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -96(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -92(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -88(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -84(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -60(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -56(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -52(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -32(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -28(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -24(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -20(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ addl %r12d,%edx
+ leaq 128(%r9),%r13
+ leaq 128(%r9),%rdi
+ cmpq %r10,%r13
+ cmovaeq %r9,%r13
+
+
+ addl 0(%r8),%edx
+ addl 4(%r8),%esi
+ addl 8(%r8),%ebp
+ movl %edx,0(%r8)
+ addl 12(%r8),%ebx
+ movl %esi,4(%r8)
+ movl %edx,%eax
+ addl 16(%r8),%ecx
+ movl %ebp,%r12d
+ movl %ebp,8(%r8)
+ movl %ebx,%edx
+
+ movl %ebx,12(%r8)
+ movl %esi,%ebp
+ movl %ecx,16(%r8)
+
+ movl %ecx,%esi
+ movl %r12d,%ecx
+
+
+ cmpq %r10,%r9
+ je L$done_avx2
+ vmovdqu 64(%r14),%ymm6
+ cmpq %r10,%rdi
+ ja L$ast_avx2
+
+ vmovdqu -64(%rdi),%xmm0
+ vmovdqu -48(%rdi),%xmm1
+ vmovdqu -32(%rdi),%xmm2
+ vmovdqu -16(%rdi),%xmm3
+ vinserti128 $1,0(%r13),%ymm0,%ymm0
+ vinserti128 $1,16(%r13),%ymm1,%ymm1
+ vinserti128 $1,32(%r13),%ymm2,%ymm2
+ vinserti128 $1,48(%r13),%ymm3,%ymm3
+ jmp L$ast_avx2
+
+.p2align 5
+L$ast_avx2:
+ leaq 128+16(%rsp),%r13
+ rorxl $2,%ebp,%ebx
+ andnl %edx,%ebp,%edi
+ andl %ecx,%ebp
+ xorl %edi,%ebp
+ subq $-128,%r9
+ addl -128(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -124(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -120(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -116(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -96(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl -92(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -88(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -84(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -64(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -60(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -56(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl -52(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -32(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -28(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -24(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -20(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl 0(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl 4(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl 8(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl 12(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 32(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 36(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 40(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 44(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vmovdqu -64(%r14),%ymm11
+ vpshufb %ymm6,%ymm0,%ymm0
+ addl 68(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 72(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 76(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 96(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 100(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpshufb %ymm6,%ymm1,%ymm1
+ vpaddd %ymm11,%ymm0,%ymm8
+ addl 104(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 108(%r13),%edx
+ leaq 256(%r13),%r13
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -128(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -124(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -120(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vmovdqu %ymm8,0(%rsp)
+ vpshufb %ymm6,%ymm2,%ymm2
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl -116(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -92(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -88(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -84(%r13),%ebx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ vmovdqu %ymm9,32(%rsp)
+ vpshufb %ymm6,%ymm3,%ymm3
+ vpaddd %ymm11,%ymm2,%ymm6
+ addl -64(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -60(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl -56(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl -52(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl -32(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ jmp L$align32_3
+.p2align 5
+L$align32_3:
+ vmovdqu %ymm6,64(%rsp)
+ vpaddd %ymm11,%ymm3,%ymm7
+ addl -28(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl -24(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -20(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 0(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl 4(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ vmovdqu %ymm7,96(%rsp)
+ addl 8(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 12(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 32(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 36(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 40(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ vpalignr $8,%ymm0,%ymm1,%ymm4
+ addl 44(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ vpsrldq $4,%ymm3,%ymm8
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpxor %ymm0,%ymm4,%ymm4
+ vpxor %ymm2,%ymm8,%ymm8
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpxor %ymm8,%ymm4,%ymm4
+ andl %edi,%esi
+ addl 64(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ vpsrld $31,%ymm4,%ymm8
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ vpslldq $12,%ymm4,%ymm10
+ vpaddd %ymm4,%ymm4,%ymm4
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm4,%ymm4
+ addl %r12d,%ecx
+ andl %edi,%edx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm4,%ymm4
+ addl 68(%r13),%ebx
+ xorl %eax,%edx
+ vpxor %ymm10,%ymm4,%ymm4
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ vpaddd %ymm11,%ymm4,%ymm9
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vmovdqu %ymm9,128(%rsp)
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 72(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 76(%r13),%eax
+ xorl %edx,%ebx
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpalignr $8,%ymm1,%ymm2,%ymm5
+ addl 96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpsrldq $4,%ymm4,%ymm8
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm3,%ymm8,%ymm8
+ addl 100(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm8,%ymm5,%ymm5
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpsrld $31,%ymm5,%ymm8
+ vmovdqu -32(%r14),%ymm11
+ xorl %ebx,%esi
+ addl 104(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ vpslldq $12,%ymm5,%ymm10
+ vpaddd %ymm5,%ymm5,%ymm5
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm5,%ymm5
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm5,%ymm5
+ xorl %ebp,%edx
+ addl 108(%r13),%ebx
+ leaq 256(%r13),%r13
+ vpxor %ymm10,%ymm5,%ymm5
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpaddd %ymm11,%ymm5,%ymm9
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vmovdqu %ymm9,160(%rsp)
+ addl -128(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpalignr $8,%ymm2,%ymm3,%ymm6
+ addl -124(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ vpsrldq $4,%ymm5,%ymm8
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpxor %ymm2,%ymm6,%ymm6
+ vpxor %ymm4,%ymm8,%ymm8
+ addl -120(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpxor %ymm8,%ymm6,%ymm6
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ vpsrld $31,%ymm6,%ymm8
+ xorl %ecx,%eax
+ addl -116(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpslldq $12,%ymm6,%ymm10
+ vpaddd %ymm6,%ymm6,%ymm6
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm6,%ymm6
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm6,%ymm6
+ xorl %ebx,%esi
+ addl -96(%r13),%ecx
+ vpxor %ymm10,%ymm6,%ymm6
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpaddd %ymm11,%ymm6,%ymm9
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ vmovdqu %ymm9,192(%rsp)
+ addl -92(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vpalignr $8,%ymm3,%ymm4,%ymm7
+ addl -88(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpsrldq $4,%ymm6,%ymm8
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpxor %ymm3,%ymm7,%ymm7
+ vpxor %ymm5,%ymm8,%ymm8
+ addl -84(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ vpxor %ymm8,%ymm7,%ymm7
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ vpsrld $31,%ymm7,%ymm8
+ xorl %edx,%ebp
+ addl -64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpslldq $12,%ymm7,%ymm10
+ vpaddd %ymm7,%ymm7,%ymm7
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm7,%ymm7
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm7,%ymm7
+ xorl %ecx,%eax
+ addl -60(%r13),%edx
+ vpxor %ymm10,%ymm7,%ymm7
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpaddd %ymm11,%ymm7,%ymm9
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vmovdqu %ymm9,224(%rsp)
+ addl -56(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -52(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -32(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -28(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -24(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -20(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ addl %r12d,%edx
+ leaq 128(%rsp),%r13
+
+
+ addl 0(%r8),%edx
+ addl 4(%r8),%esi
+ addl 8(%r8),%ebp
+ movl %edx,0(%r8)
+ addl 12(%r8),%ebx
+ movl %esi,4(%r8)
+ movl %edx,%eax
+ addl 16(%r8),%ecx
+ movl %ebp,%r12d
+ movl %ebp,8(%r8)
+ movl %ebx,%edx
+
+ movl %ebx,12(%r8)
+ movl %esi,%ebp
+ movl %ecx,16(%r8)
+
+ movl %ecx,%esi
+ movl %r12d,%ecx
+
+
+ cmpq %r10,%r9
+ jbe L$oop_avx2
+
+L$done_avx2:
+ vzeroupper
+ movq -40(%r11),%r14
+
+ movq -32(%r11),%r13
+
+ movq -24(%r11),%r12
+
+ movq -16(%r11),%rbp
+
+ movq -8(%r11),%rbx
+
+ leaq (%r11),%rsp
+
+L$epilogue_avx2:
+ ret
+
+
+.section __DATA,__const
+.p2align 6
+K_XX_XX:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
+.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align 6
+.text
+#endif
diff --git a/gen/bcm/sha1-x86_64-linux.S b/gen/bcm/sha1-x86_64-linux.S
new file mode 100644
index 0000000..39d9ad3
--- /dev/null
+++ b/gen/bcm/sha1-x86_64-linux.S
@@ -0,0 +1,5450 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text
+
+.globl sha1_block_data_order_nohw
+.hidden sha1_block_data_order_nohw
+.type sha1_block_data_order_nohw,@function
+.align 16
+sha1_block_data_order_nohw:
+.cfi_startproc
+_CET_ENDBR
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ movq %rdi,%r8
+ subq $72,%rsp
+ movq %rsi,%r9
+ andq $-64,%rsp
+ movq %rdx,%r10
+ movq %rax,64(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xc0,0x00,0x06,0x23,0x08
+.Lprologue:
+
+ movl 0(%r8),%esi
+ movl 4(%r8),%edi
+ movl 8(%r8),%r11d
+ movl 12(%r8),%r12d
+ movl 16(%r8),%r13d
+ jmp .Lloop
+
+.align 16
+.Lloop:
+ movl 0(%r9),%edx
+ bswapl %edx
+ movl 4(%r9),%ebp
+ movl %r12d,%eax
+ movl %edx,0(%rsp)
+ movl %esi,%ecx
+ bswapl %ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ andl %edi,%eax
+ leal 1518500249(%rdx,%r13,1),%r13d
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl 8(%r9),%r14d
+ movl %r11d,%eax
+ movl %ebp,4(%rsp)
+ movl %r13d,%ecx
+ bswapl %r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ andl %esi,%eax
+ leal 1518500249(%rbp,%r12,1),%r12d
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl 12(%r9),%edx
+ movl %edi,%eax
+ movl %r14d,8(%rsp)
+ movl %r12d,%ecx
+ bswapl %edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ andl %r13d,%eax
+ leal 1518500249(%r14,%r11,1),%r11d
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl 16(%r9),%ebp
+ movl %esi,%eax
+ movl %edx,12(%rsp)
+ movl %r11d,%ecx
+ bswapl %ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ andl %r12d,%eax
+ leal 1518500249(%rdx,%rdi,1),%edi
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl 20(%r9),%r14d
+ movl %r13d,%eax
+ movl %ebp,16(%rsp)
+ movl %edi,%ecx
+ bswapl %r14d
+ xorl %r12d,%eax
+ roll $5,%ecx
+ andl %r11d,%eax
+ leal 1518500249(%rbp,%rsi,1),%esi
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
+ movl 24(%r9),%edx
+ movl %r12d,%eax
+ movl %r14d,20(%rsp)
+ movl %esi,%ecx
+ bswapl %edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ andl %edi,%eax
+ leal 1518500249(%r14,%r13,1),%r13d
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl 28(%r9),%ebp
+ movl %r11d,%eax
+ movl %edx,24(%rsp)
+ movl %r13d,%ecx
+ bswapl %ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ andl %esi,%eax
+ leal 1518500249(%rdx,%r12,1),%r12d
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl 32(%r9),%r14d
+ movl %edi,%eax
+ movl %ebp,28(%rsp)
+ movl %r12d,%ecx
+ bswapl %r14d
+ xorl %esi,%eax
+ roll $5,%ecx
+ andl %r13d,%eax
+ leal 1518500249(%rbp,%r11,1),%r11d
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl 36(%r9),%edx
+ movl %esi,%eax
+ movl %r14d,32(%rsp)
+ movl %r11d,%ecx
+ bswapl %edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ andl %r12d,%eax
+ leal 1518500249(%r14,%rdi,1),%edi
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl 40(%r9),%ebp
+ movl %r13d,%eax
+ movl %edx,36(%rsp)
+ movl %edi,%ecx
+ bswapl %ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ andl %r11d,%eax
+ leal 1518500249(%rdx,%rsi,1),%esi
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
+ movl 44(%r9),%r14d
+ movl %r12d,%eax
+ movl %ebp,40(%rsp)
+ movl %esi,%ecx
+ bswapl %r14d
+ xorl %r11d,%eax
+ roll $5,%ecx
+ andl %edi,%eax
+ leal 1518500249(%rbp,%r13,1),%r13d
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl 48(%r9),%edx
+ movl %r11d,%eax
+ movl %r14d,44(%rsp)
+ movl %r13d,%ecx
+ bswapl %edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ andl %esi,%eax
+ leal 1518500249(%r14,%r12,1),%r12d
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl 52(%r9),%ebp
+ movl %edi,%eax
+ movl %edx,48(%rsp)
+ movl %r12d,%ecx
+ bswapl %ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ andl %r13d,%eax
+ leal 1518500249(%rdx,%r11,1),%r11d
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl 56(%r9),%r14d
+ movl %esi,%eax
+ movl %ebp,52(%rsp)
+ movl %r11d,%ecx
+ bswapl %r14d
+ xorl %r13d,%eax
+ roll $5,%ecx
+ andl %r12d,%eax
+ leal 1518500249(%rbp,%rdi,1),%edi
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl 60(%r9),%edx
+ movl %r13d,%eax
+ movl %r14d,56(%rsp)
+ movl %edi,%ecx
+ bswapl %edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ andl %r11d,%eax
+ leal 1518500249(%r14,%rsi,1),%esi
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
+ xorl 0(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,60(%rsp)
+ movl %esi,%ecx
+ xorl 8(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 32(%rsp),%ebp
+ andl %edi,%eax
+ leal 1518500249(%rdx,%r13,1),%r13d
+ roll $30,%edi
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ roll $1,%ebp
+ addl %eax,%r13d
+ xorl 4(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,0(%rsp)
+ movl %r13d,%ecx
+ xorl 12(%rsp),%r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%r14d
+ andl %esi,%eax
+ leal 1518500249(%rbp,%r12,1),%r12d
+ roll $30,%esi
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ roll $1,%r14d
+ addl %eax,%r12d
+ xorl 8(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,4(%rsp)
+ movl %r12d,%ecx
+ xorl 16(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%edx
+ andl %r13d,%eax
+ leal 1518500249(%r14,%r11,1),%r11d
+ roll $30,%r13d
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ roll $1,%edx
+ addl %eax,%r11d
+ xorl 12(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,8(%rsp)
+ movl %r11d,%ecx
+ xorl 20(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%ebp
+ andl %r12d,%eax
+ leal 1518500249(%rdx,%rdi,1),%edi
+ roll $30,%r12d
+ xorl %esi,%eax
+ addl %ecx,%edi
+ roll $1,%ebp
+ addl %eax,%edi
+ xorl 16(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,12(%rsp)
+ movl %edi,%ecx
+ xorl 24(%rsp),%r14d
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 48(%rsp),%r14d
+ andl %r11d,%eax
+ leal 1518500249(%rbp,%rsi,1),%esi
+ roll $30,%r11d
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ roll $1,%r14d
+ addl %eax,%esi
+ xorl 20(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,16(%rsp)
+ movl %esi,%ecx
+ xorl 28(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%edx
+ leal 1859775393(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ xorl 24(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,20(%rsp)
+ movl %r13d,%ecx
+ xorl 32(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%ebp
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ xorl 28(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,24(%rsp)
+ movl %r12d,%ecx
+ xorl 36(%rsp),%r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%r14d
+ leal 1859775393(%rbp,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%r14d
+ xorl 32(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,28(%rsp)
+ movl %r11d,%ecx
+ xorl 40(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 0(%rsp),%edx
+ leal 1859775393(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ xorl 36(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,32(%rsp)
+ movl %edi,%ecx
+ xorl 44(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 4(%rsp),%ebp
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ xorl 40(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,36(%rsp)
+ movl %esi,%ecx
+ xorl 48(%rsp),%r14d
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 8(%rsp),%r14d
+ leal 1859775393(%rbp,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%r14d
+ xorl 44(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,40(%rsp)
+ movl %r13d,%ecx
+ xorl 52(%rsp),%edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 12(%rsp),%edx
+ leal 1859775393(%r14,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%edx
+ xorl 48(%rsp),%ebp
+ movl %r13d,%eax
+ movl %edx,44(%rsp)
+ movl %r12d,%ecx
+ xorl 56(%rsp),%ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 16(%rsp),%ebp
+ leal 1859775393(%rdx,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%ebp
+ xorl 52(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,48(%rsp)
+ movl %r11d,%ecx
+ xorl 60(%rsp),%r14d
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%r14d
+ leal 1859775393(%rbp,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%r14d
+ xorl 56(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,52(%rsp)
+ movl %edi,%ecx
+ xorl 0(%rsp),%edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 24(%rsp),%edx
+ leal 1859775393(%r14,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%edx
+ xorl 60(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,56(%rsp)
+ movl %esi,%ecx
+ xorl 4(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 28(%rsp),%ebp
+ leal 1859775393(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ xorl 0(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,60(%rsp)
+ movl %r13d,%ecx
+ xorl 8(%rsp),%r14d
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 32(%rsp),%r14d
+ leal 1859775393(%rbp,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%r14d
+ xorl 4(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,0(%rsp)
+ movl %r12d,%ecx
+ xorl 12(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%edx
+ leal 1859775393(%r14,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ xorl 8(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,4(%rsp)
+ movl %r11d,%ecx
+ xorl 16(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%ebp
+ leal 1859775393(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ xorl 12(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,8(%rsp)
+ movl %edi,%ecx
+ xorl 20(%rsp),%r14d
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%r14d
+ leal 1859775393(%rbp,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%r14d
+ xorl 16(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,12(%rsp)
+ movl %esi,%ecx
+ xorl 24(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 48(%rsp),%edx
+ leal 1859775393(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ xorl 20(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,16(%rsp)
+ movl %r13d,%ecx
+ xorl 28(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%ebp
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ xorl 24(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,20(%rsp)
+ movl %r12d,%ecx
+ xorl 32(%rsp),%r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%r14d
+ leal 1859775393(%rbp,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%r14d
+ xorl 28(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,24(%rsp)
+ movl %r11d,%ecx
+ xorl 36(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%edx
+ leal 1859775393(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ xorl 32(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,28(%rsp)
+ movl %edi,%ecx
+ xorl 40(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 0(%rsp),%ebp
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ xorl 36(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,32(%rsp)
+ movl %r12d,%ebx
+ xorl 44(%rsp),%r14d
+ andl %r11d,%eax
+ movl %esi,%ecx
+ xorl 4(%rsp),%r14d
+ leal -1894007588(%rbp,%r13,1),%r13d
+ xorl %r11d,%ebx
+ roll $5,%ecx
+ addl %eax,%r13d
+ roll $1,%r14d
+ andl %edi,%ebx
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 40(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,36(%rsp)
+ movl %r11d,%ebx
+ xorl 48(%rsp),%edx
+ andl %edi,%eax
+ movl %r13d,%ecx
+ xorl 8(%rsp),%edx
+ leal -1894007588(%r14,%r12,1),%r12d
+ xorl %edi,%ebx
+ roll $5,%ecx
+ addl %eax,%r12d
+ roll $1,%edx
+ andl %esi,%ebx
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 44(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,40(%rsp)
+ movl %edi,%ebx
+ xorl 52(%rsp),%ebp
+ andl %esi,%eax
+ movl %r12d,%ecx
+ xorl 12(%rsp),%ebp
+ leal -1894007588(%rdx,%r11,1),%r11d
+ xorl %esi,%ebx
+ roll $5,%ecx
+ addl %eax,%r11d
+ roll $1,%ebp
+ andl %r13d,%ebx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 48(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,44(%rsp)
+ movl %esi,%ebx
+ xorl 56(%rsp),%r14d
+ andl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 16(%rsp),%r14d
+ leal -1894007588(%rbp,%rdi,1),%edi
+ xorl %r13d,%ebx
+ roll $5,%ecx
+ addl %eax,%edi
+ roll $1,%r14d
+ andl %r12d,%ebx
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 52(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,48(%rsp)
+ movl %r13d,%ebx
+ xorl 60(%rsp),%edx
+ andl %r12d,%eax
+ movl %edi,%ecx
+ xorl 20(%rsp),%edx
+ leal -1894007588(%r14,%rsi,1),%esi
+ xorl %r12d,%ebx
+ roll $5,%ecx
+ addl %eax,%esi
+ roll $1,%edx
+ andl %r11d,%ebx
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 56(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,52(%rsp)
+ movl %r12d,%ebx
+ xorl 0(%rsp),%ebp
+ andl %r11d,%eax
+ movl %esi,%ecx
+ xorl 24(%rsp),%ebp
+ leal -1894007588(%rdx,%r13,1),%r13d
+ xorl %r11d,%ebx
+ roll $5,%ecx
+ addl %eax,%r13d
+ roll $1,%ebp
+ andl %edi,%ebx
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 60(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,56(%rsp)
+ movl %r11d,%ebx
+ xorl 4(%rsp),%r14d
+ andl %edi,%eax
+ movl %r13d,%ecx
+ xorl 28(%rsp),%r14d
+ leal -1894007588(%rbp,%r12,1),%r12d
+ xorl %edi,%ebx
+ roll $5,%ecx
+ addl %eax,%r12d
+ roll $1,%r14d
+ andl %esi,%ebx
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 0(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,60(%rsp)
+ movl %edi,%ebx
+ xorl 8(%rsp),%edx
+ andl %esi,%eax
+ movl %r12d,%ecx
+ xorl 32(%rsp),%edx
+ leal -1894007588(%r14,%r11,1),%r11d
+ xorl %esi,%ebx
+ roll $5,%ecx
+ addl %eax,%r11d
+ roll $1,%edx
+ andl %r13d,%ebx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 4(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,0(%rsp)
+ movl %esi,%ebx
+ xorl 12(%rsp),%ebp
+ andl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 36(%rsp),%ebp
+ leal -1894007588(%rdx,%rdi,1),%edi
+ xorl %r13d,%ebx
+ roll $5,%ecx
+ addl %eax,%edi
+ roll $1,%ebp
+ andl %r12d,%ebx
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 8(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,4(%rsp)
+ movl %r13d,%ebx
+ xorl 16(%rsp),%r14d
+ andl %r12d,%eax
+ movl %edi,%ecx
+ xorl 40(%rsp),%r14d
+ leal -1894007588(%rbp,%rsi,1),%esi
+ xorl %r12d,%ebx
+ roll $5,%ecx
+ addl %eax,%esi
+ roll $1,%r14d
+ andl %r11d,%ebx
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 12(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,8(%rsp)
+ movl %r12d,%ebx
+ xorl 20(%rsp),%edx
+ andl %r11d,%eax
+ movl %esi,%ecx
+ xorl 44(%rsp),%edx
+ leal -1894007588(%r14,%r13,1),%r13d
+ xorl %r11d,%ebx
+ roll $5,%ecx
+ addl %eax,%r13d
+ roll $1,%edx
+ andl %edi,%ebx
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 16(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,12(%rsp)
+ movl %r11d,%ebx
+ xorl 24(%rsp),%ebp
+ andl %edi,%eax
+ movl %r13d,%ecx
+ xorl 48(%rsp),%ebp
+ leal -1894007588(%rdx,%r12,1),%r12d
+ xorl %edi,%ebx
+ roll $5,%ecx
+ addl %eax,%r12d
+ roll $1,%ebp
+ andl %esi,%ebx
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 20(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,16(%rsp)
+ movl %edi,%ebx
+ xorl 28(%rsp),%r14d
+ andl %esi,%eax
+ movl %r12d,%ecx
+ xorl 52(%rsp),%r14d
+ leal -1894007588(%rbp,%r11,1),%r11d
+ xorl %esi,%ebx
+ roll $5,%ecx
+ addl %eax,%r11d
+ roll $1,%r14d
+ andl %r13d,%ebx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 24(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,20(%rsp)
+ movl %esi,%ebx
+ xorl 32(%rsp),%edx
+ andl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 56(%rsp),%edx
+ leal -1894007588(%r14,%rdi,1),%edi
+ xorl %r13d,%ebx
+ roll $5,%ecx
+ addl %eax,%edi
+ roll $1,%edx
+ andl %r12d,%ebx
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 28(%rsp),%ebp
+ movl %r13d,%eax
+ movl %edx,24(%rsp)
+ movl %r13d,%ebx
+ xorl 36(%rsp),%ebp
+ andl %r12d,%eax
+ movl %edi,%ecx
+ xorl 60(%rsp),%ebp
+ leal -1894007588(%rdx,%rsi,1),%esi
+ xorl %r12d,%ebx
+ roll $5,%ecx
+ addl %eax,%esi
+ roll $1,%ebp
+ andl %r11d,%ebx
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 32(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,28(%rsp)
+ movl %r12d,%ebx
+ xorl 40(%rsp),%r14d
+ andl %r11d,%eax
+ movl %esi,%ecx
+ xorl 0(%rsp),%r14d
+ leal -1894007588(%rbp,%r13,1),%r13d
+ xorl %r11d,%ebx
+ roll $5,%ecx
+ addl %eax,%r13d
+ roll $1,%r14d
+ andl %edi,%ebx
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 36(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,32(%rsp)
+ movl %r11d,%ebx
+ xorl 44(%rsp),%edx
+ andl %edi,%eax
+ movl %r13d,%ecx
+ xorl 4(%rsp),%edx
+ leal -1894007588(%r14,%r12,1),%r12d
+ xorl %edi,%ebx
+ roll $5,%ecx
+ addl %eax,%r12d
+ roll $1,%edx
+ andl %esi,%ebx
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 40(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,36(%rsp)
+ movl %edi,%ebx
+ xorl 48(%rsp),%ebp
+ andl %esi,%eax
+ movl %r12d,%ecx
+ xorl 8(%rsp),%ebp
+ leal -1894007588(%rdx,%r11,1),%r11d
+ xorl %esi,%ebx
+ roll $5,%ecx
+ addl %eax,%r11d
+ roll $1,%ebp
+ andl %r13d,%ebx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 44(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,40(%rsp)
+ movl %esi,%ebx
+ xorl 52(%rsp),%r14d
+ andl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 12(%rsp),%r14d
+ leal -1894007588(%rbp,%rdi,1),%edi
+ xorl %r13d,%ebx
+ roll $5,%ecx
+ addl %eax,%edi
+ roll $1,%r14d
+ andl %r12d,%ebx
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 48(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,44(%rsp)
+ movl %r13d,%ebx
+ xorl 56(%rsp),%edx
+ andl %r12d,%eax
+ movl %edi,%ecx
+ xorl 16(%rsp),%edx
+ leal -1894007588(%r14,%rsi,1),%esi
+ xorl %r12d,%ebx
+ roll $5,%ecx
+ addl %eax,%esi
+ roll $1,%edx
+ andl %r11d,%ebx
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 52(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,48(%rsp)
+ movl %esi,%ecx
+ xorl 60(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%ebp
+ leal -899497514(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ xorl 56(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,52(%rsp)
+ movl %r13d,%ecx
+ xorl 0(%rsp),%r14d
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 24(%rsp),%r14d
+ leal -899497514(%rbp,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%r14d
+ xorl 60(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,56(%rsp)
+ movl %r12d,%ecx
+ xorl 4(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 28(%rsp),%edx
+ leal -899497514(%r14,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ xorl 0(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,60(%rsp)
+ movl %r11d,%ecx
+ xorl 8(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 32(%rsp),%ebp
+ leal -899497514(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ xorl 4(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,0(%rsp)
+ movl %edi,%ecx
+ xorl 12(%rsp),%r14d
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%r14d
+ leal -899497514(%rbp,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%r14d
+ xorl 8(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,4(%rsp)
+ movl %esi,%ecx
+ xorl 16(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%edx
+ leal -899497514(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ xorl 12(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,8(%rsp)
+ movl %r13d,%ecx
+ xorl 20(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%ebp
+ leal -899497514(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ xorl 16(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,12(%rsp)
+ movl %r12d,%ecx
+ xorl 24(%rsp),%r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 48(%rsp),%r14d
+ leal -899497514(%rbp,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%r14d
+ xorl 20(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,16(%rsp)
+ movl %r11d,%ecx
+ xorl 28(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%edx
+ leal -899497514(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ xorl 24(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,20(%rsp)
+ movl %edi,%ecx
+ xorl 32(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%ebp
+ leal -899497514(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ xorl 28(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,24(%rsp)
+ movl %esi,%ecx
+ xorl 36(%rsp),%r14d
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%r14d
+ leal -899497514(%rbp,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%r14d
+ xorl 32(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,28(%rsp)
+ movl %r13d,%ecx
+ xorl 40(%rsp),%edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 0(%rsp),%edx
+ leal -899497514(%r14,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%edx
+ xorl 36(%rsp),%ebp
+ movl %r13d,%eax
+
+ movl %r12d,%ecx
+ xorl 44(%rsp),%ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 4(%rsp),%ebp
+ leal -899497514(%rdx,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%ebp
+ xorl 40(%rsp),%r14d
+ movl %r12d,%eax
+
+ movl %r11d,%ecx
+ xorl 48(%rsp),%r14d
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 8(%rsp),%r14d
+ leal -899497514(%rbp,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%r14d
+ xorl 44(%rsp),%edx
+ movl %r11d,%eax
+
+ movl %edi,%ecx
+ xorl 52(%rsp),%edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 12(%rsp),%edx
+ leal -899497514(%r14,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%edx
+ xorl 48(%rsp),%ebp
+ movl %edi,%eax
+
+ movl %esi,%ecx
+ xorl 56(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 16(%rsp),%ebp
+ leal -899497514(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ xorl 52(%rsp),%r14d
+ movl %esi,%eax
+
+ movl %r13d,%ecx
+ xorl 60(%rsp),%r14d
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%r14d
+ leal -899497514(%rbp,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%r14d
+ xorl 56(%rsp),%edx
+ movl %r13d,%eax
+
+ movl %r12d,%ecx
+ xorl 0(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 24(%rsp),%edx
+ leal -899497514(%r14,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ xorl 60(%rsp),%ebp
+ movl %r12d,%eax
+
+ movl %r11d,%ecx
+ xorl 4(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 28(%rsp),%ebp
+ leal -899497514(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ movl %r11d,%eax
+ movl %edi,%ecx
+ xorl %r13d,%eax
+ leal -899497514(%rbp,%rsi,1),%esi
+ roll $5,%ecx
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ addl 0(%r8),%esi
+ addl 4(%r8),%edi
+ addl 8(%r8),%r11d
+ addl 12(%r8),%r12d
+ addl 16(%r8),%r13d
+ movl %esi,0(%r8)
+ movl %edi,4(%r8)
+ movl %r11d,8(%r8)
+ movl %r12d,12(%r8)
+ movl %r13d,16(%r8)
+
+ subq $1,%r10
+ leaq 64(%r9),%r9
+ jnz .Lloop
+
+ movq 64(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue:
+ ret
+.cfi_endproc
+.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
+.globl sha1_block_data_order_hw
+.hidden sha1_block_data_order_hw
+.type sha1_block_data_order_hw,@function
+.align 32
+sha1_block_data_order_hw:
+.cfi_startproc
+_CET_ENDBR
+ movdqu (%rdi),%xmm0
+ movd 16(%rdi),%xmm1
+ movdqa K_XX_XX+160(%rip),%xmm3
+
+ movdqu (%rsi),%xmm4
+ pshufd $27,%xmm0,%xmm0
+ movdqu 16(%rsi),%xmm5
+ pshufd $27,%xmm1,%xmm1
+ movdqu 32(%rsi),%xmm6
+.byte 102,15,56,0,227
+ movdqu 48(%rsi),%xmm7
+.byte 102,15,56,0,235
+.byte 102,15,56,0,243
+ movdqa %xmm1,%xmm9
+.byte 102,15,56,0,251
+ jmp .Loop_shaext
+
+.align 16
+.Loop_shaext:
+ decq %rdx
+ leaq 64(%rsi),%r8
+ paddd %xmm4,%xmm1
+ cmovneq %r8,%rsi
+ prefetcht0 512(%rsi)
+ movdqa %xmm0,%xmm8
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+ movdqu (%rsi),%xmm4
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,213
+ movdqu 16(%rsi),%xmm5
+.byte 102,15,56,0,227
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,206
+ movdqu 32(%rsi),%xmm6
+.byte 102,15,56,0,235
+
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,215
+ movdqu 48(%rsi),%xmm7
+.byte 102,15,56,0,243
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 65,15,56,200,201
+.byte 102,15,56,0,251
+
+ paddd %xmm8,%xmm0
+ movdqa %xmm1,%xmm9
+
+ jnz .Loop_shaext
+
+ pshufd $27,%xmm0,%xmm0
+ pshufd $27,%xmm1,%xmm1
+ movdqu %xmm0,(%rdi)
+ movd %xmm1,16(%rdi)
+ ret
+.cfi_endproc
+.size sha1_block_data_order_hw,.-sha1_block_data_order_hw
+.globl sha1_block_data_order_ssse3
+.hidden sha1_block_data_order_ssse3
+.type sha1_block_data_order_ssse3,@function
+.align 16
+sha1_block_data_order_ssse3:
+.cfi_startproc
+_CET_ENDBR
+ movq %rsp,%r11
+.cfi_def_cfa_register %r11
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ leaq -64(%rsp),%rsp
+ andq $-64,%rsp
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ shlq $6,%r10
+ addq %r9,%r10
+ leaq K_XX_XX+64(%rip),%r14
+
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl %ebx,%esi
+ movl 16(%r8),%ebp
+ movl %ecx,%edi
+ xorl %edx,%edi
+ andl %edi,%esi
+
+ movdqa 64(%r14),%xmm6
+ movdqa -64(%r14),%xmm9
+ movdqu 0(%r9),%xmm0
+ movdqu 16(%r9),%xmm1
+ movdqu 32(%r9),%xmm2
+ movdqu 48(%r9),%xmm3
+.byte 102,15,56,0,198
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+ addq $64,%r9
+ paddd %xmm9,%xmm0
+.byte 102,15,56,0,222
+ paddd %xmm9,%xmm1
+ paddd %xmm9,%xmm2
+ movdqa %xmm0,0(%rsp)
+ psubd %xmm9,%xmm0
+ movdqa %xmm1,16(%rsp)
+ psubd %xmm9,%xmm1
+ movdqa %xmm2,32(%rsp)
+ psubd %xmm9,%xmm2
+ jmp .Loop_ssse3
+.align 16
+.Loop_ssse3:
+ rorl $2,%ebx
+ pshufd $238,%xmm0,%xmm4
+ xorl %edx,%esi
+ movdqa %xmm3,%xmm8
+ paddd %xmm3,%xmm9
+ movl %eax,%edi
+ addl 0(%rsp),%ebp
+ punpcklqdq %xmm1,%xmm4
+ xorl %ecx,%ebx
+ roll $5,%eax
+ addl %esi,%ebp
+ psrldq $4,%xmm8
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ pxor %xmm0,%xmm4
+ addl %eax,%ebp
+ rorl $7,%eax
+ pxor %xmm2,%xmm8
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 4(%rsp),%edx
+ pxor %xmm8,%xmm4
+ xorl %ebx,%eax
+ roll $5,%ebp
+ movdqa %xmm9,48(%rsp)
+ addl %edi,%edx
+ andl %eax,%esi
+ movdqa %xmm4,%xmm10
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ rorl $7,%ebp
+ movdqa %xmm4,%xmm8
+ xorl %ebx,%esi
+ pslldq $12,%xmm10
+ paddd %xmm4,%xmm4
+ movl %edx,%edi
+ addl 8(%rsp),%ecx
+ psrld $31,%xmm8
+ xorl %eax,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ movdqa %xmm10,%xmm9
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ psrld $30,%xmm10
+ addl %edx,%ecx
+ rorl $7,%edx
+ por %xmm8,%xmm4
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 12(%rsp),%ebx
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm4
+ xorl %ebp,%edx
+ movdqa -64(%r14),%xmm10
+ roll $5,%ecx
+ addl %edi,%ebx
+ andl %edx,%esi
+ pxor %xmm9,%xmm4
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ pshufd $238,%xmm1,%xmm5
+ xorl %ebp,%esi
+ movdqa %xmm4,%xmm9
+ paddd %xmm4,%xmm10
+ movl %ebx,%edi
+ addl 16(%rsp),%eax
+ punpcklqdq %xmm2,%xmm5
+ xorl %edx,%ecx
+ roll $5,%ebx
+ addl %esi,%eax
+ psrldq $4,%xmm9
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ pxor %xmm1,%xmm5
+ addl %ebx,%eax
+ rorl $7,%ebx
+ pxor %xmm3,%xmm9
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 20(%rsp),%ebp
+ pxor %xmm9,%xmm5
+ xorl %ecx,%ebx
+ roll $5,%eax
+ movdqa %xmm10,0(%rsp)
+ addl %edi,%ebp
+ andl %ebx,%esi
+ movdqa %xmm5,%xmm8
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ rorl $7,%eax
+ movdqa %xmm5,%xmm9
+ xorl %ecx,%esi
+ pslldq $12,%xmm8
+ paddd %xmm5,%xmm5
+ movl %ebp,%edi
+ addl 24(%rsp),%edx
+ psrld $31,%xmm9
+ xorl %ebx,%eax
+ roll $5,%ebp
+ addl %esi,%edx
+ movdqa %xmm8,%xmm10
+ andl %eax,%edi
+ xorl %ebx,%eax
+ psrld $30,%xmm8
+ addl %ebp,%edx
+ rorl $7,%ebp
+ por %xmm9,%xmm5
+ xorl %ebx,%edi
+ movl %edx,%esi
+ addl 28(%rsp),%ecx
+ pslld $2,%xmm10
+ pxor %xmm8,%xmm5
+ xorl %eax,%ebp
+ movdqa -32(%r14),%xmm8
+ roll $5,%edx
+ addl %edi,%ecx
+ andl %ebp,%esi
+ pxor %xmm10,%xmm5
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ rorl $7,%edx
+ pshufd $238,%xmm2,%xmm6
+ xorl %eax,%esi
+ movdqa %xmm5,%xmm10
+ paddd %xmm5,%xmm8
+ movl %ecx,%edi
+ addl 32(%rsp),%ebx
+ punpcklqdq %xmm3,%xmm6
+ xorl %ebp,%edx
+ roll $5,%ecx
+ addl %esi,%ebx
+ psrldq $4,%xmm10
+ andl %edx,%edi
+ xorl %ebp,%edx
+ pxor %xmm2,%xmm6
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ pxor %xmm4,%xmm10
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ addl 36(%rsp),%eax
+ pxor %xmm10,%xmm6
+ xorl %edx,%ecx
+ roll $5,%ebx
+ movdqa %xmm8,16(%rsp)
+ addl %edi,%eax
+ andl %ecx,%esi
+ movdqa %xmm6,%xmm9
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ rorl $7,%ebx
+ movdqa %xmm6,%xmm10
+ xorl %edx,%esi
+ pslldq $12,%xmm9
+ paddd %xmm6,%xmm6
+ movl %eax,%edi
+ addl 40(%rsp),%ebp
+ psrld $31,%xmm10
+ xorl %ecx,%ebx
+ roll $5,%eax
+ addl %esi,%ebp
+ movdqa %xmm9,%xmm8
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ psrld $30,%xmm9
+ addl %eax,%ebp
+ rorl $7,%eax
+ por %xmm10,%xmm6
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 44(%rsp),%edx
+ pslld $2,%xmm8
+ pxor %xmm9,%xmm6
+ xorl %ebx,%eax
+ movdqa -32(%r14),%xmm9
+ roll $5,%ebp
+ addl %edi,%edx
+ andl %eax,%esi
+ pxor %xmm8,%xmm6
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ rorl $7,%ebp
+ pshufd $238,%xmm3,%xmm7
+ xorl %ebx,%esi
+ movdqa %xmm6,%xmm8
+ paddd %xmm6,%xmm9
+ movl %edx,%edi
+ addl 48(%rsp),%ecx
+ punpcklqdq %xmm4,%xmm7
+ xorl %eax,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ psrldq $4,%xmm8
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ pxor %xmm3,%xmm7
+ addl %edx,%ecx
+ rorl $7,%edx
+ pxor %xmm5,%xmm8
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 52(%rsp),%ebx
+ pxor %xmm8,%xmm7
+ xorl %ebp,%edx
+ roll $5,%ecx
+ movdqa %xmm9,32(%rsp)
+ addl %edi,%ebx
+ andl %edx,%esi
+ movdqa %xmm7,%xmm10
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ movdqa %xmm7,%xmm8
+ xorl %ebp,%esi
+ pslldq $12,%xmm10
+ paddd %xmm7,%xmm7
+ movl %ebx,%edi
+ addl 56(%rsp),%eax
+ psrld $31,%xmm8
+ xorl %edx,%ecx
+ roll $5,%ebx
+ addl %esi,%eax
+ movdqa %xmm10,%xmm9
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ psrld $30,%xmm10
+ addl %ebx,%eax
+ rorl $7,%ebx
+ por %xmm8,%xmm7
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 60(%rsp),%ebp
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm7
+ xorl %ecx,%ebx
+ movdqa -32(%r14),%xmm10
+ roll $5,%eax
+ addl %edi,%ebp
+ andl %ebx,%esi
+ pxor %xmm9,%xmm7
+ pshufd $238,%xmm6,%xmm9
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ rorl $7,%eax
+ pxor %xmm4,%xmm0
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ addl 0(%rsp),%edx
+ punpcklqdq %xmm7,%xmm9
+ xorl %ebx,%eax
+ roll $5,%ebp
+ pxor %xmm1,%xmm0
+ addl %esi,%edx
+ andl %eax,%edi
+ movdqa %xmm10,%xmm8
+ xorl %ebx,%eax
+ paddd %xmm7,%xmm10
+ addl %ebp,%edx
+ pxor %xmm9,%xmm0
+ rorl $7,%ebp
+ xorl %ebx,%edi
+ movl %edx,%esi
+ addl 4(%rsp),%ecx
+ movdqa %xmm0,%xmm9
+ xorl %eax,%ebp
+ roll $5,%edx
+ movdqa %xmm10,48(%rsp)
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ pslld $2,%xmm0
+ addl %edx,%ecx
+ rorl $7,%edx
+ psrld $30,%xmm9
+ xorl %eax,%esi
+ movl %ecx,%edi
+ addl 8(%rsp),%ebx
+ por %xmm9,%xmm0
+ xorl %ebp,%edx
+ roll $5,%ecx
+ pshufd $238,%xmm7,%xmm10
+ addl %esi,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ pxor %xmm5,%xmm1
+ addl 16(%rsp),%ebp
+ xorl %ecx,%esi
+ punpcklqdq %xmm0,%xmm10
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm2,%xmm1
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ movdqa %xmm8,%xmm9
+ rorl $7,%ebx
+ paddd %xmm0,%xmm8
+ addl %eax,%ebp
+ pxor %xmm10,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm1,%xmm10
+ addl %edi,%edx
+ xorl %ebx,%esi
+ movdqa %xmm8,0(%rsp)
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 24(%rsp),%ecx
+ pslld $2,%xmm1
+ xorl %eax,%esi
+ movl %edx,%edi
+ psrld $30,%xmm10
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
+ por %xmm10,%xmm1
+ addl %edx,%ecx
+ addl 28(%rsp),%ebx
+ pshufd $238,%xmm0,%xmm8
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ pxor %xmm6,%xmm2
+ addl 32(%rsp),%eax
+ xorl %edx,%esi
+ punpcklqdq %xmm1,%xmm8
+ movl %ebx,%edi
+ roll $5,%ebx
+ pxor %xmm3,%xmm2
+ addl %esi,%eax
+ xorl %edx,%edi
+ movdqa 0(%r14),%xmm10
+ rorl $7,%ecx
+ paddd %xmm1,%xmm9
+ addl %ebx,%eax
+ pxor %xmm8,%xmm2
+ addl 36(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ movdqa %xmm2,%xmm8
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ movdqa %xmm9,16(%rsp)
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 40(%rsp),%edx
+ pslld $2,%xmm2
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ psrld $30,%xmm8
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
+ por %xmm8,%xmm2
+ addl %ebp,%edx
+ addl 44(%rsp),%ecx
+ pshufd $238,%xmm1,%xmm9
+ xorl %eax,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ pxor %xmm7,%xmm3
+ addl 48(%rsp),%ebx
+ xorl %ebp,%esi
+ punpcklqdq %xmm2,%xmm9
+ movl %ecx,%edi
+ roll $5,%ecx
+ pxor %xmm4,%xmm3
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ movdqa %xmm10,%xmm8
+ rorl $7,%edx
+ paddd %xmm2,%xmm10
+ addl %ecx,%ebx
+ pxor %xmm9,%xmm3
+ addl 52(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ movdqa %xmm3,%xmm9
+ addl %edi,%eax
+ xorl %edx,%esi
+ movdqa %xmm10,32(%rsp)
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 56(%rsp),%ebp
+ pslld $2,%xmm3
+ xorl %ecx,%esi
+ movl %eax,%edi
+ psrld $30,%xmm9
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ rorl $7,%ebx
+ por %xmm9,%xmm3
+ addl %eax,%ebp
+ addl 60(%rsp),%edx
+ pshufd $238,%xmm2,%xmm10
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ pxor %xmm0,%xmm4
+ addl 0(%rsp),%ecx
+ xorl %eax,%esi
+ punpcklqdq %xmm3,%xmm10
+ movl %edx,%edi
+ roll $5,%edx
+ pxor %xmm5,%xmm4
+ addl %esi,%ecx
+ xorl %eax,%edi
+ movdqa %xmm8,%xmm9
+ rorl $7,%ebp
+ paddd %xmm3,%xmm8
+ addl %edx,%ecx
+ pxor %xmm10,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ movdqa %xmm4,%xmm10
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ movdqa %xmm8,48(%rsp)
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 8(%rsp),%eax
+ pslld $2,%xmm4
+ xorl %edx,%esi
+ movl %ebx,%edi
+ psrld $30,%xmm10
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ rorl $7,%ecx
+ por %xmm10,%xmm4
+ addl %ebx,%eax
+ addl 12(%rsp),%ebp
+ pshufd $238,%xmm3,%xmm8
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ pxor %xmm1,%xmm5
+ addl 16(%rsp),%edx
+ xorl %ebx,%esi
+ punpcklqdq %xmm4,%xmm8
+ movl %ebp,%edi
+ roll $5,%ebp
+ pxor %xmm6,%xmm5
+ addl %esi,%edx
+ xorl %ebx,%edi
+ movdqa %xmm9,%xmm10
+ rorl $7,%eax
+ paddd %xmm4,%xmm9
+ addl %ebp,%edx
+ pxor %xmm8,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ movdqa %xmm5,%xmm8
+ addl %edi,%ecx
+ xorl %eax,%esi
+ movdqa %xmm9,0(%rsp)
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 24(%rsp),%ebx
+ pslld $2,%xmm5
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ psrld $30,%xmm8
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ por %xmm8,%xmm5
+ addl %ecx,%ebx
+ addl 28(%rsp),%eax
+ pshufd $238,%xmm4,%xmm9
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ pxor %xmm2,%xmm6
+ addl 32(%rsp),%ebp
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ punpcklqdq %xmm5,%xmm9
+ movl %eax,%edi
+ xorl %ecx,%esi
+ pxor %xmm7,%xmm6
+ roll $5,%eax
+ addl %esi,%ebp
+ movdqa %xmm10,%xmm8
+ xorl %ebx,%edi
+ paddd %xmm5,%xmm10
+ xorl %ecx,%ebx
+ pxor %xmm9,%xmm6
+ addl %eax,%ebp
+ addl 36(%rsp),%edx
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movdqa %xmm6,%xmm9
+ movl %ebp,%esi
+ xorl %ebx,%edi
+ movdqa %xmm10,16(%rsp)
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ pslld $2,%xmm6
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ psrld $30,%xmm9
+ addl 40(%rsp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ por %xmm9,%xmm6
+ rorl $7,%ebp
+ movl %edx,%edi
+ xorl %eax,%esi
+ roll $5,%edx
+ pshufd $238,%xmm5,%xmm10
+ addl %esi,%ecx
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 44(%rsp),%ebx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ rorl $7,%edx
+ movl %ecx,%esi
+ xorl %ebp,%edi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ pxor %xmm3,%xmm7
+ addl 48(%rsp),%eax
+ andl %edx,%esi
+ xorl %ebp,%edx
+ rorl $7,%ecx
+ punpcklqdq %xmm6,%xmm10
+ movl %ebx,%edi
+ xorl %edx,%esi
+ pxor %xmm0,%xmm7
+ roll $5,%ebx
+ addl %esi,%eax
+ movdqa 32(%r14),%xmm9
+ xorl %ecx,%edi
+ paddd %xmm6,%xmm8
+ xorl %edx,%ecx
+ pxor %xmm10,%xmm7
+ addl %ebx,%eax
+ addl 52(%rsp),%ebp
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movdqa %xmm7,%xmm10
+ movl %eax,%esi
+ xorl %ecx,%edi
+ movdqa %xmm8,32(%rsp)
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ pslld $2,%xmm7
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ psrld $30,%xmm10
+ addl 56(%rsp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ por %xmm10,%xmm7
+ rorl $7,%eax
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ roll $5,%ebp
+ pshufd $238,%xmm6,%xmm8
+ addl %esi,%edx
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 60(%rsp),%ecx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ rorl $7,%ebp
+ movl %edx,%esi
+ xorl %eax,%edi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ pxor %xmm4,%xmm0
+ addl 0(%rsp),%ebx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ rorl $7,%edx
+ punpcklqdq %xmm7,%xmm8
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ pxor %xmm1,%xmm0
+ roll $5,%ecx
+ addl %esi,%ebx
+ movdqa %xmm9,%xmm10
+ xorl %edx,%edi
+ paddd %xmm7,%xmm9
+ xorl %ebp,%edx
+ pxor %xmm8,%xmm0
+ addl %ecx,%ebx
+ addl 4(%rsp),%eax
+ andl %edx,%edi
+ xorl %ebp,%edx
+ rorl $7,%ecx
+ movdqa %xmm0,%xmm8
+ movl %ebx,%esi
+ xorl %edx,%edi
+ movdqa %xmm9,48(%rsp)
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ pslld $2,%xmm0
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ psrld $30,%xmm8
+ addl 8(%rsp),%ebp
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ por %xmm8,%xmm0
+ rorl $7,%ebx
+ movl %eax,%edi
+ xorl %ecx,%esi
+ roll $5,%eax
+ pshufd $238,%xmm7,%xmm9
+ addl %esi,%ebp
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 12(%rsp),%edx
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movl %ebp,%esi
+ xorl %ebx,%edi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ pxor %xmm5,%xmm1
+ addl 16(%rsp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ rorl $7,%ebp
+ punpcklqdq %xmm0,%xmm9
+ movl %edx,%edi
+ xorl %eax,%esi
+ pxor %xmm2,%xmm1
+ roll $5,%edx
+ addl %esi,%ecx
+ movdqa %xmm10,%xmm8
+ xorl %ebp,%edi
+ paddd %xmm0,%xmm10
+ xorl %eax,%ebp
+ pxor %xmm9,%xmm1
+ addl %edx,%ecx
+ addl 20(%rsp),%ebx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ rorl $7,%edx
+ movdqa %xmm1,%xmm9
+ movl %ecx,%esi
+ xorl %ebp,%edi
+ movdqa %xmm10,0(%rsp)
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ pslld $2,%xmm1
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ psrld $30,%xmm9
+ addl 24(%rsp),%eax
+ andl %edx,%esi
+ xorl %ebp,%edx
+ por %xmm9,%xmm1
+ rorl $7,%ecx
+ movl %ebx,%edi
+ xorl %edx,%esi
+ roll $5,%ebx
+ pshufd $238,%xmm0,%xmm10
+ addl %esi,%eax
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%rsp),%ebp
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movl %eax,%esi
+ xorl %ecx,%edi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ pxor %xmm6,%xmm2
+ addl 32(%rsp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ punpcklqdq %xmm1,%xmm10
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ pxor %xmm3,%xmm2
+ roll $5,%ebp
+ addl %esi,%edx
+ movdqa %xmm8,%xmm9
+ xorl %eax,%edi
+ paddd %xmm1,%xmm8
+ xorl %ebx,%eax
+ pxor %xmm10,%xmm2
+ addl %ebp,%edx
+ addl 36(%rsp),%ecx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ rorl $7,%ebp
+ movdqa %xmm2,%xmm10
+ movl %edx,%esi
+ xorl %eax,%edi
+ movdqa %xmm8,16(%rsp)
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ pslld $2,%xmm2
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ psrld $30,%xmm10
+ addl 40(%rsp),%ebx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ por %xmm10,%xmm2
+ rorl $7,%edx
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ roll $5,%ecx
+ pshufd $238,%xmm1,%xmm8
+ addl %esi,%ebx
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 44(%rsp),%eax
+ andl %edx,%edi
+ xorl %ebp,%edx
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ pxor %xmm7,%xmm3
+ addl 48(%rsp),%ebp
+ xorl %ecx,%esi
+ punpcklqdq %xmm2,%xmm8
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm4,%xmm3
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ movdqa %xmm9,%xmm10
+ rorl $7,%ebx
+ paddd %xmm2,%xmm9
+ addl %eax,%ebp
+ pxor %xmm8,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm3,%xmm8
+ addl %edi,%edx
+ xorl %ebx,%esi
+ movdqa %xmm9,32(%rsp)
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 56(%rsp),%ecx
+ pslld $2,%xmm3
+ xorl %eax,%esi
+ movl %edx,%edi
+ psrld $30,%xmm8
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
+ por %xmm8,%xmm3
+ addl %edx,%ecx
+ addl 60(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 0(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ paddd %xmm3,%xmm10
+ addl %esi,%eax
+ xorl %edx,%edi
+ movdqa %xmm10,48(%rsp)
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 4(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 8(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 12(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ cmpq %r10,%r9
+ je .Ldone_ssse3
+ movdqa 64(%r14),%xmm6
+ movdqa -64(%r14),%xmm9
+ movdqu 0(%r9),%xmm0
+ movdqu 16(%r9),%xmm1
+ movdqu 32(%r9),%xmm2
+ movdqu 48(%r9),%xmm3
+.byte 102,15,56,0,198
+ addq $64,%r9
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+.byte 102,15,56,0,206
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ paddd %xmm9,%xmm0
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ movdqa %xmm0,0(%rsp)
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ psubd %xmm9,%xmm0
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+.byte 102,15,56,0,214
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
+ paddd %xmm9,%xmm1
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ movdqa %xmm1,16(%rsp)
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ rorl $7,%edx
+ psubd %xmm9,%xmm1
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+.byte 102,15,56,0,222
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
+ paddd %xmm9,%xmm2
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ movdqa %xmm2,32(%rsp)
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ psubd %xmm9,%xmm2
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ addl 12(%r8),%edx
+ movl %eax,0(%r8)
+ addl 16(%r8),%ebp
+ movl %esi,4(%r8)
+ movl %esi,%ebx
+ movl %ecx,8(%r8)
+ movl %ecx,%edi
+ movl %edx,12(%r8)
+ xorl %edx,%edi
+ movl %ebp,16(%r8)
+ andl %edi,%esi
+ jmp .Loop_ssse3
+
+.align 16
+.Ldone_ssse3:
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ movl %eax,0(%r8)
+ addl 12(%r8),%edx
+ movl %esi,4(%r8)
+ addl 16(%r8),%ebp
+ movl %ecx,8(%r8)
+ movl %edx,12(%r8)
+ movl %ebp,16(%r8)
+ movq -40(%r11),%r14
+.cfi_restore %r14
+ movq -32(%r11),%r13
+.cfi_restore %r13
+ movq -24(%r11),%r12
+.cfi_restore %r12
+ movq -16(%r11),%rbp
+.cfi_restore %rbp
+ movq -8(%r11),%rbx
+.cfi_restore %rbx
+ leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_ssse3:
+ ret
+.cfi_endproc
+.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
+.globl sha1_block_data_order_avx
+.hidden sha1_block_data_order_avx
+.type sha1_block_data_order_avx,@function
+.align 16
+sha1_block_data_order_avx:
+.cfi_startproc
+_CET_ENDBR
+ movq %rsp,%r11
+.cfi_def_cfa_register %r11
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ leaq -64(%rsp),%rsp
+ vzeroupper
+ andq $-64,%rsp
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ shlq $6,%r10
+ addq %r9,%r10
+ leaq K_XX_XX+64(%rip),%r14
+
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl %ebx,%esi
+ movl 16(%r8),%ebp
+ movl %ecx,%edi
+ xorl %edx,%edi
+ andl %edi,%esi
+
+ vmovdqa 64(%r14),%xmm6
+ vmovdqa -64(%r14),%xmm11
+ vmovdqu 0(%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r9
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm11,%xmm0,%xmm4
+ vpaddd %xmm11,%xmm1,%xmm5
+ vpaddd %xmm11,%xmm2,%xmm6
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ jmp .Loop_avx
+.align 16
+.Loop_avx:
+ shrdl $2,%ebx,%ebx
+ xorl %edx,%esi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ movl %eax,%edi
+ addl 0(%rsp),%ebp
+ vpaddd %xmm3,%xmm11,%xmm9
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrldq $4,%xmm3,%xmm8
+ addl %esi,%ebp
+ andl %ebx,%edi
+ vpxor %xmm0,%xmm4,%xmm4
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm2,%xmm8,%xmm8
+ shrdl $7,%eax,%eax
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 4(%rsp),%edx
+ vpxor %xmm8,%xmm4,%xmm4
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vmovdqa %xmm9,48(%rsp)
+ addl %edi,%edx
+ andl %eax,%esi
+ vpsrld $31,%xmm4,%xmm8
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%esi
+ vpslldq $12,%xmm4,%xmm10
+ vpaddd %xmm4,%xmm4,%xmm4
+ movl %edx,%edi
+ addl 8(%rsp),%ecx
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm4,%xmm4
+ addl %esi,%ecx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm4,%xmm4
+ shrdl $7,%edx,%edx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 12(%rsp),%ebx
+ vpxor %xmm10,%xmm4,%xmm4
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ andl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%esi
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ movl %ebx,%edi
+ addl 16(%rsp),%eax
+ vpaddd %xmm4,%xmm11,%xmm9
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrldq $4,%xmm4,%xmm8
+ addl %esi,%eax
+ andl %ecx,%edi
+ vpxor %xmm1,%xmm5,%xmm5
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm3,%xmm8,%xmm8
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 20(%rsp),%ebp
+ vpxor %xmm8,%xmm5,%xmm5
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%ebp
+ andl %ebx,%esi
+ vpsrld $31,%xmm5,%xmm8
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ vpslldq $12,%xmm5,%xmm10
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %ebp,%edi
+ addl 24(%rsp),%edx
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm5,%xmm5
+ addl %esi,%edx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm5,%xmm5
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%edi
+ movl %edx,%esi
+ addl 28(%rsp),%ecx
+ vpxor %xmm10,%xmm5,%xmm5
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vmovdqa -32(%r14),%xmm11
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ movl %ecx,%edi
+ addl 32(%rsp),%ebx
+ vpaddd %xmm5,%xmm11,%xmm9
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ vpsrldq $4,%xmm5,%xmm8
+ addl %esi,%ebx
+ andl %edx,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ vpxor %xmm4,%xmm8,%xmm8
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ addl 36(%rsp),%eax
+ vpxor %xmm8,%xmm6,%xmm6
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vmovdqa %xmm9,16(%rsp)
+ addl %edi,%eax
+ andl %ecx,%esi
+ vpsrld $31,%xmm6,%xmm8
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%esi
+ vpslldq $12,%xmm6,%xmm10
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%edi
+ addl 40(%rsp),%ebp
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm6,%xmm6
+ addl %esi,%ebp
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm6,%xmm6
+ shrdl $7,%eax,%eax
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 44(%rsp),%edx
+ vpxor %xmm10,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%esi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ movl %edx,%edi
+ addl 48(%rsp),%ecx
+ vpaddd %xmm6,%xmm11,%xmm9
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpsrldq $4,%xmm6,%xmm8
+ addl %esi,%ecx
+ andl %ebp,%edi
+ vpxor %xmm3,%xmm7,%xmm7
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpxor %xmm5,%xmm8,%xmm8
+ shrdl $7,%edx,%edx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 52(%rsp),%ebx
+ vpxor %xmm8,%xmm7,%xmm7
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%ebx
+ andl %edx,%esi
+ vpsrld $31,%xmm7,%xmm8
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%esi
+ vpslldq $12,%xmm7,%xmm10
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%edi
+ addl 56(%rsp),%eax
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm7,%xmm7
+ addl %esi,%eax
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm7,%xmm7
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 60(%rsp),%ebp
+ vpxor %xmm10,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ addl 0(%rsp),%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm7,%xmm11,%xmm9
+ addl %esi,%edx
+ andl %eax,%edi
+ vpxor %xmm8,%xmm0,%xmm0
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%edi
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ movl %edx,%esi
+ addl 4(%rsp),%ecx
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpslld $2,%xmm0,%xmm0
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ addl 8(%rsp),%ebx
+ vpor %xmm8,%xmm0,%xmm0
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ vpaddd %xmm0,%xmm11,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm8,%xmm1,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm1,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpor %xmm8,%xmm1,%xmm1
+ addl 28(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ addl %esi,%eax
+ xorl %edx,%edi
+ vpaddd %xmm1,%xmm11,%xmm9
+ vmovdqa 0(%r14),%xmm11
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm8,%xmm2,%xmm2
+ addl 36(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpor %xmm8,%xmm2,%xmm2
+ addl 44(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ vpaddd %xmm2,%xmm11,%xmm9
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ vpxor %xmm0,%xmm4,%xmm4
+ addl 0(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %esi,%ecx
+ xorl %eax,%edi
+ vpaddd %xmm3,%xmm11,%xmm9
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpxor %xmm8,%xmm4,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpor %xmm8,%xmm4,%xmm4
+ addl 12(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpxor %xmm6,%xmm5,%xmm5
+ addl %esi,%edx
+ xorl %ebx,%edi
+ vpaddd %xmm4,%xmm11,%xmm9
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpxor %xmm8,%xmm5,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpor %xmm8,%xmm5,%xmm5
+ addl 28(%rsp),%eax
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ vpxor %xmm2,%xmm6,%xmm6
+ addl 32(%rsp),%ebp
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %eax,%edi
+ xorl %ecx,%esi
+ vpaddd %xmm5,%xmm11,%xmm9
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ vpxor %xmm8,%xmm6,%xmm6
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 36(%rsp),%edx
+ vpsrld $30,%xmm6,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%esi
+ vpslld $2,%xmm6,%xmm6
+ xorl %ebx,%edi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 40(%rsp),%ecx
+ andl %eax,%esi
+ vpor %xmm8,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%edi
+ xorl %eax,%esi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 44(%rsp),%ebx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ xorl %ebp,%edi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ vpxor %xmm3,%xmm7,%xmm7
+ addl 48(%rsp),%eax
+ andl %edx,%esi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ vpxor %xmm0,%xmm7,%xmm7
+ movl %ebx,%edi
+ xorl %edx,%esi
+ vpaddd %xmm6,%xmm11,%xmm9
+ vmovdqa 32(%r14),%xmm11
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vpxor %xmm8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%rsp),%ebp
+ vpsrld $30,%xmm7,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ vpslld $2,%xmm7,%xmm7
+ xorl %ecx,%edi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 56(%rsp),%edx
+ andl %ebx,%esi
+ vpor %xmm8,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 60(%rsp),%ecx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%esi
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ addl 0(%rsp),%ebx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ vpaddd %xmm7,%xmm11,%xmm9
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ vpxor %xmm8,%xmm0,%xmm0
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 4(%rsp),%eax
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ andl %edx,%edi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ vpslld $2,%xmm0,%xmm0
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%rsp),%ebp
+ andl %ecx,%esi
+ vpor %xmm8,%xmm0,%xmm0
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%edi
+ xorl %ecx,%esi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 12(%rsp),%edx
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%esi
+ xorl %ebx,%edi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ vpxor %xmm2,%xmm1,%xmm1
+ movl %edx,%edi
+ xorl %eax,%esi
+ vpaddd %xmm0,%xmm11,%xmm9
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vpxor %xmm8,%xmm1,%xmm1
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 20(%rsp),%ebx
+ vpsrld $30,%xmm1,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ vpslld $2,%xmm1,%xmm1
+ xorl %ebp,%edi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 24(%rsp),%eax
+ andl %edx,%esi
+ vpor %xmm8,%xmm1,%xmm1
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%edi
+ xorl %edx,%esi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%rsp),%ebp
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ xorl %ecx,%edi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vpxor %xmm3,%xmm2,%xmm2
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ vpaddd %xmm1,%xmm11,%xmm9
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ vpxor %xmm8,%xmm2,%xmm2
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 36(%rsp),%ecx
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ andl %eax,%edi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%esi
+ vpslld $2,%xmm2,%xmm2
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 40(%rsp),%ebx
+ andl %ebp,%esi
+ vpor %xmm8,%xmm2,%xmm2
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 44(%rsp),%eax
+ andl %edx,%edi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ vpaddd %xmm2,%xmm11,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 0(%rsp),%eax
+ vpaddd %xmm3,%xmm11,%xmm9
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vmovdqa %xmm9,48(%rsp)
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 4(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 8(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 12(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ cmpq %r10,%r9
+ je .Ldone_avx
+ vmovdqa 64(%r14),%xmm6
+ vmovdqa -64(%r14),%xmm11
+ vmovdqu 0(%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r9
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ vpshufb %xmm6,%xmm1,%xmm1
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm11,%xmm0,%xmm4
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vmovdqa %xmm4,0(%rsp)
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ vpshufb %xmm6,%xmm2,%xmm2
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpaddd %xmm11,%xmm1,%xmm5
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vmovdqa %xmm5,16(%rsp)
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ vpshufb %xmm6,%xmm3,%xmm3
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm11,%xmm2,%xmm6
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vmovdqa %xmm6,32(%rsp)
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ addl 12(%r8),%edx
+ movl %eax,0(%r8)
+ addl 16(%r8),%ebp
+ movl %esi,4(%r8)
+ movl %esi,%ebx
+ movl %ecx,8(%r8)
+ movl %ecx,%edi
+ movl %edx,12(%r8)
+ xorl %edx,%edi
+ movl %ebp,16(%r8)
+ andl %edi,%esi
+ jmp .Loop_avx
+
+.align 16
+.Ldone_avx:
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vzeroupper
+
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ movl %eax,0(%r8)
+ addl 12(%r8),%edx
+ movl %esi,4(%r8)
+ addl 16(%r8),%ebp
+ movl %ecx,8(%r8)
+ movl %edx,12(%r8)
+ movl %ebp,16(%r8)
+ movq -40(%r11),%r14
+.cfi_restore %r14
+ movq -32(%r11),%r13
+.cfi_restore %r13
+ movq -24(%r11),%r12
+.cfi_restore %r12
+ movq -16(%r11),%rbp
+.cfi_restore %rbp
+ movq -8(%r11),%rbx
+.cfi_restore %rbx
+ leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ ret
+.cfi_endproc
+.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
+.globl sha1_block_data_order_avx2
+.hidden sha1_block_data_order_avx2
+.type sha1_block_data_order_avx2,@function
+.align 16
+sha1_block_data_order_avx2:
+.cfi_startproc
+_CET_ENDBR
+ movq %rsp,%r11
+.cfi_def_cfa_register %r11
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ vzeroupper
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ leaq -640(%rsp),%rsp
+ shlq $6,%r10
+ leaq 64(%r9),%r13
+ andq $-128,%rsp
+ addq %r9,%r10
+ leaq K_XX_XX+64(%rip),%r14
+
+ movl 0(%r8),%eax
+ cmpq %r10,%r13
+ cmovaeq %r9,%r13
+ movl 4(%r8),%ebp
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl 16(%r8),%esi
+ vmovdqu 64(%r14),%ymm6
+
+ vmovdqu (%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ leaq 64(%r9),%r9
+ vinserti128 $1,(%r13),%ymm0,%ymm0
+ vinserti128 $1,16(%r13),%ymm1,%ymm1
+ vpshufb %ymm6,%ymm0,%ymm0
+ vinserti128 $1,32(%r13),%ymm2,%ymm2
+ vpshufb %ymm6,%ymm1,%ymm1
+ vinserti128 $1,48(%r13),%ymm3,%ymm3
+ vpshufb %ymm6,%ymm2,%ymm2
+ vmovdqu -64(%r14),%ymm11
+ vpshufb %ymm6,%ymm3,%ymm3
+
+ vpaddd %ymm11,%ymm0,%ymm4
+ vpaddd %ymm11,%ymm1,%ymm5
+ vmovdqu %ymm4,0(%rsp)
+ vpaddd %ymm11,%ymm2,%ymm6
+ vmovdqu %ymm5,32(%rsp)
+ vpaddd %ymm11,%ymm3,%ymm7
+ vmovdqu %ymm6,64(%rsp)
+ vmovdqu %ymm7,96(%rsp)
+ vpalignr $8,%ymm0,%ymm1,%ymm4
+ vpsrldq $4,%ymm3,%ymm8
+ vpxor %ymm0,%ymm4,%ymm4
+ vpxor %ymm2,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $31,%ymm4,%ymm8
+ vpslldq $12,%ymm4,%ymm10
+ vpaddd %ymm4,%ymm4,%ymm4
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm4,%ymm4
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm4,%ymm4
+ vpxor %ymm10,%ymm4,%ymm4
+ vpaddd %ymm11,%ymm4,%ymm9
+ vmovdqu %ymm9,128(%rsp)
+ vpalignr $8,%ymm1,%ymm2,%ymm5
+ vpsrldq $4,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm3,%ymm8,%ymm8
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $31,%ymm5,%ymm8
+ vmovdqu -32(%r14),%ymm11
+ vpslldq $12,%ymm5,%ymm10
+ vpaddd %ymm5,%ymm5,%ymm5
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm5,%ymm5
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm10,%ymm5,%ymm5
+ vpaddd %ymm11,%ymm5,%ymm9
+ vmovdqu %ymm9,160(%rsp)
+ vpalignr $8,%ymm2,%ymm3,%ymm6
+ vpsrldq $4,%ymm5,%ymm8
+ vpxor %ymm2,%ymm6,%ymm6
+ vpxor %ymm4,%ymm8,%ymm8
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $31,%ymm6,%ymm8
+ vpslldq $12,%ymm6,%ymm10
+ vpaddd %ymm6,%ymm6,%ymm6
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm6,%ymm6
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm6,%ymm6
+ vpxor %ymm10,%ymm6,%ymm6
+ vpaddd %ymm11,%ymm6,%ymm9
+ vmovdqu %ymm9,192(%rsp)
+ vpalignr $8,%ymm3,%ymm4,%ymm7
+ vpsrldq $4,%ymm6,%ymm8
+ vpxor %ymm3,%ymm7,%ymm7
+ vpxor %ymm5,%ymm8,%ymm8
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm7,%ymm8
+ vpslldq $12,%ymm7,%ymm10
+ vpaddd %ymm7,%ymm7,%ymm7
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm7,%ymm7
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm7,%ymm7
+ vpxor %ymm10,%ymm7,%ymm7
+ vpaddd %ymm11,%ymm7,%ymm9
+ vmovdqu %ymm9,224(%rsp)
+ leaq 128(%rsp),%r13
+ jmp .Loop_avx2
+.align 32
+.Loop_avx2:
+ rorxl $2,%ebp,%ebx
+ andnl %edx,%ebp,%edi
+ andl %ecx,%ebp
+ xorl %edi,%ebp
+ jmp .Lalign32_1
+.align 32
+.Lalign32_1:
+ vpalignr $8,%ymm6,%ymm7,%ymm8
+ vpxor %ymm4,%ymm0,%ymm0
+ addl -128(%r13),%esi
+ andnl %ecx,%eax,%edi
+ vpxor %ymm1,%ymm0,%ymm0
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpxor %ymm8,%ymm0,%ymm0
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ vpsrld $30,%ymm0,%ymm8
+ vpslld $2,%ymm0,%ymm0
+ addl -124(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ vpor %ymm8,%ymm0,%ymm0
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -120(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ vpaddd %ymm11,%ymm0,%ymm9
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ vmovdqu %ymm9,256(%rsp)
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -116(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -96(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ vpalignr $8,%ymm7,%ymm0,%ymm8
+ vpxor %ymm5,%ymm1,%ymm1
+ addl -92(%r13),%eax
+ andnl %edx,%ebp,%edi
+ vpxor %ymm2,%ymm1,%ymm1
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ vpxor %ymm8,%ymm1,%ymm1
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ vpsrld $30,%ymm1,%ymm8
+ vpslld $2,%ymm1,%ymm1
+ addl -88(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ vpor %ymm8,%ymm1,%ymm1
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -84(%r13),%edx
+ andnl %ebx,%esi,%edi
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ vmovdqu %ymm9,288(%rsp)
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -64(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -60(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ vpalignr $8,%ymm0,%ymm1,%ymm8
+ vpxor %ymm6,%ymm2,%ymm2
+ addl -56(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ vpxor %ymm3,%ymm2,%ymm2
+ vmovdqu 0(%r14),%ymm11
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpxor %ymm8,%ymm2,%ymm2
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ vpsrld $30,%ymm2,%ymm8
+ vpslld $2,%ymm2,%ymm2
+ addl -52(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ vpor %ymm8,%ymm2,%ymm2
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -32(%r13),%esi
+ andnl %ecx,%eax,%edi
+ vpaddd %ymm11,%ymm2,%ymm9
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ vmovdqu %ymm9,320(%rsp)
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -28(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -24(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ vpalignr $8,%ymm1,%ymm2,%ymm8
+ vpxor %ymm7,%ymm3,%ymm3
+ addl -20(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ vpxor %ymm4,%ymm3,%ymm3
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpxor %ymm8,%ymm3,%ymm3
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ vpsrld $30,%ymm3,%ymm8
+ vpslld $2,%ymm3,%ymm3
+ addl 0(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ vpor %ymm8,%ymm3,%ymm3
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl 4(%r13),%eax
+ andnl %edx,%ebp,%edi
+ vpaddd %ymm11,%ymm3,%ymm9
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ vmovdqu %ymm9,352(%rsp)
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl 8(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl 12(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vpalignr $8,%ymm2,%ymm3,%ymm8
+ vpxor %ymm0,%ymm4,%ymm4
+ addl 32(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpxor %ymm8,%ymm4,%ymm4
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 36(%r13),%ebx
+ vpsrld $30,%ymm4,%ymm8
+ vpslld $2,%ymm4,%ymm4
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vpor %ymm8,%ymm4,%ymm4
+ addl 40(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpaddd %ymm11,%ymm4,%ymm9
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 44(%r13),%eax
+ vmovdqu %ymm9,384(%rsp)
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpalignr $8,%ymm3,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ addl 68(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm6,%ymm5,%ymm5
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ vpxor %ymm8,%ymm5,%ymm5
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 72(%r13),%ecx
+ vpsrld $30,%ymm5,%ymm8
+ vpslld $2,%ymm5,%ymm5
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ vpor %ymm8,%ymm5,%ymm5
+ addl 76(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpaddd %ymm11,%ymm5,%ymm9
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 96(%r13),%ebp
+ vmovdqu %ymm9,416(%rsp)
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 100(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpalignr $8,%ymm4,%ymm5,%ymm8
+ vpxor %ymm2,%ymm6,%ymm6
+ addl 104(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpxor %ymm7,%ymm6,%ymm6
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ vpxor %ymm8,%ymm6,%ymm6
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 108(%r13),%edx
+ leaq 256(%r13),%r13
+ vpsrld $30,%ymm6,%ymm8
+ vpslld $2,%ymm6,%ymm6
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vpor %ymm8,%ymm6,%ymm6
+ addl -128(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpaddd %ymm11,%ymm6,%ymm9
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -124(%r13),%ebx
+ vmovdqu %ymm9,448(%rsp)
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -120(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpalignr $8,%ymm5,%ymm6,%ymm8
+ vpxor %ymm3,%ymm7,%ymm7
+ addl -116(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ vpxor %ymm0,%ymm7,%ymm7
+ vmovdqu 32(%r14),%ymm11
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ vpxor %ymm8,%ymm7,%ymm7
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -96(%r13),%esi
+ vpsrld $30,%ymm7,%ymm8
+ vpslld $2,%ymm7,%ymm7
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpor %ymm8,%ymm7,%ymm7
+ addl -92(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpaddd %ymm11,%ymm7,%ymm9
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -88(%r13),%ecx
+ vmovdqu %ymm9,480(%rsp)
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -84(%r13),%ebx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ jmp .Lalign32_2
+.align 32
+.Lalign32_2:
+ vpalignr $8,%ymm6,%ymm7,%ymm8
+ vpxor %ymm4,%ymm0,%ymm0
+ addl -64(%r13),%ebp
+ xorl %esi,%ecx
+ vpxor %ymm1,%ymm0,%ymm0
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ vpxor %ymm8,%ymm0,%ymm0
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ vpsrld $30,%ymm0,%ymm8
+ vpslld $2,%ymm0,%ymm0
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -60(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ vpor %ymm8,%ymm0,%ymm0
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ vpaddd %ymm11,%ymm0,%ymm9
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl -56(%r13),%esi
+ xorl %ecx,%ebp
+ vmovdqu %ymm9,512(%rsp)
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl -52(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl -32(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ vpalignr $8,%ymm7,%ymm0,%ymm8
+ vpxor %ymm5,%ymm1,%ymm1
+ addl -28(%r13),%ebx
+ xorl %eax,%edx
+ vpxor %ymm2,%ymm1,%ymm1
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ vpxor %ymm8,%ymm1,%ymm1
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vpsrld $30,%ymm1,%ymm8
+ vpslld $2,%ymm1,%ymm1
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl -24(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ vpor %ymm8,%ymm1,%ymm1
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -20(%r13),%eax
+ xorl %edx,%ebx
+ vmovdqu %ymm9,544(%rsp)
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 0(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl 4(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ vpalignr $8,%ymm0,%ymm1,%ymm8
+ vpxor %ymm6,%ymm2,%ymm2
+ addl 8(%r13),%ecx
+ xorl %ebp,%esi
+ vpxor %ymm3,%ymm2,%ymm2
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ vpxor %ymm8,%ymm2,%ymm2
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpsrld $30,%ymm2,%ymm8
+ vpslld $2,%ymm2,%ymm2
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 12(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ vpor %ymm8,%ymm2,%ymm2
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vpaddd %ymm11,%ymm2,%ymm9
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 32(%r13),%ebp
+ xorl %esi,%ecx
+ vmovdqu %ymm9,576(%rsp)
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 36(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 40(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ vpalignr $8,%ymm1,%ymm2,%ymm8
+ vpxor %ymm7,%ymm3,%ymm3
+ addl 44(%r13),%edx
+ xorl %ebx,%eax
+ vpxor %ymm4,%ymm3,%ymm3
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm8,%ymm3,%ymm3
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ vpsrld $30,%ymm3,%ymm8
+ vpslld $2,%ymm3,%ymm3
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl 64(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ vpor %ymm8,%ymm3,%ymm3
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpaddd %ymm11,%ymm3,%ymm9
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 68(%r13),%ebx
+ xorl %eax,%edx
+ vmovdqu %ymm9,608(%rsp)
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 72(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 76(%r13),%eax
+ xorl %edx,%ebx
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 100(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 104(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 108(%r13),%ebx
+ leaq 256(%r13),%r13
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -128(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -124(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -120(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -116(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -96(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -92(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -88(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -84(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -60(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -56(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -52(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -32(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -28(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -24(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -20(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ addl %r12d,%edx
+ leaq 128(%r9),%r13
+ leaq 128(%r9),%rdi
+ cmpq %r10,%r13
+ cmovaeq %r9,%r13
+
+
+ addl 0(%r8),%edx
+ addl 4(%r8),%esi
+ addl 8(%r8),%ebp
+ movl %edx,0(%r8)
+ addl 12(%r8),%ebx
+ movl %esi,4(%r8)
+ movl %edx,%eax
+ addl 16(%r8),%ecx
+ movl %ebp,%r12d
+ movl %ebp,8(%r8)
+ movl %ebx,%edx
+
+ movl %ebx,12(%r8)
+ movl %esi,%ebp
+ movl %ecx,16(%r8)
+
+ movl %ecx,%esi
+ movl %r12d,%ecx
+
+
+ cmpq %r10,%r9
+ je .Ldone_avx2
+ vmovdqu 64(%r14),%ymm6
+ cmpq %r10,%rdi
+ ja .Last_avx2
+
+ vmovdqu -64(%rdi),%xmm0
+ vmovdqu -48(%rdi),%xmm1
+ vmovdqu -32(%rdi),%xmm2
+ vmovdqu -16(%rdi),%xmm3
+ vinserti128 $1,0(%r13),%ymm0,%ymm0
+ vinserti128 $1,16(%r13),%ymm1,%ymm1
+ vinserti128 $1,32(%r13),%ymm2,%ymm2
+ vinserti128 $1,48(%r13),%ymm3,%ymm3
+ jmp .Last_avx2
+
+.align 32
+.Last_avx2:
+ leaq 128+16(%rsp),%r13
+ rorxl $2,%ebp,%ebx
+ andnl %edx,%ebp,%edi
+ andl %ecx,%ebp
+ xorl %edi,%ebp
+ subq $-128,%r9
+ addl -128(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -124(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -120(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -116(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -96(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl -92(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -88(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -84(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -64(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -60(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -56(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl -52(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -32(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -28(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -24(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -20(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl 0(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl 4(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl 8(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl 12(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 32(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 36(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 40(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 44(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vmovdqu -64(%r14),%ymm11
+ vpshufb %ymm6,%ymm0,%ymm0
+ addl 68(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 72(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 76(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 96(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 100(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpshufb %ymm6,%ymm1,%ymm1
+ vpaddd %ymm11,%ymm0,%ymm8
+ addl 104(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 108(%r13),%edx
+ leaq 256(%r13),%r13
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -128(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -124(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -120(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vmovdqu %ymm8,0(%rsp)
+ vpshufb %ymm6,%ymm2,%ymm2
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl -116(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -92(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -88(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -84(%r13),%ebx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ vmovdqu %ymm9,32(%rsp)
+ vpshufb %ymm6,%ymm3,%ymm3
+ vpaddd %ymm11,%ymm2,%ymm6
+ addl -64(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -60(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl -56(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl -52(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl -32(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ jmp .Lalign32_3
+.align 32
+.Lalign32_3:
+ vmovdqu %ymm6,64(%rsp)
+ vpaddd %ymm11,%ymm3,%ymm7
+ addl -28(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl -24(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -20(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 0(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl 4(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ vmovdqu %ymm7,96(%rsp)
+ addl 8(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 12(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 32(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 36(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 40(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ vpalignr $8,%ymm0,%ymm1,%ymm4
+ addl 44(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ vpsrldq $4,%ymm3,%ymm8
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpxor %ymm0,%ymm4,%ymm4
+ vpxor %ymm2,%ymm8,%ymm8
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpxor %ymm8,%ymm4,%ymm4
+ andl %edi,%esi
+ addl 64(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ vpsrld $31,%ymm4,%ymm8
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ vpslldq $12,%ymm4,%ymm10
+ vpaddd %ymm4,%ymm4,%ymm4
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm4,%ymm4
+ addl %r12d,%ecx
+ andl %edi,%edx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm4,%ymm4
+ addl 68(%r13),%ebx
+ xorl %eax,%edx
+ vpxor %ymm10,%ymm4,%ymm4
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ vpaddd %ymm11,%ymm4,%ymm9
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vmovdqu %ymm9,128(%rsp)
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 72(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 76(%r13),%eax
+ xorl %edx,%ebx
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpalignr $8,%ymm1,%ymm2,%ymm5
+ addl 96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpsrldq $4,%ymm4,%ymm8
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm3,%ymm8,%ymm8
+ addl 100(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm8,%ymm5,%ymm5
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpsrld $31,%ymm5,%ymm8
+ vmovdqu -32(%r14),%ymm11
+ xorl %ebx,%esi
+ addl 104(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ vpslldq $12,%ymm5,%ymm10
+ vpaddd %ymm5,%ymm5,%ymm5
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm5,%ymm5
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm5,%ymm5
+ xorl %ebp,%edx
+ addl 108(%r13),%ebx
+ leaq 256(%r13),%r13
+ vpxor %ymm10,%ymm5,%ymm5
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpaddd %ymm11,%ymm5,%ymm9
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vmovdqu %ymm9,160(%rsp)
+ addl -128(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpalignr $8,%ymm2,%ymm3,%ymm6
+ addl -124(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ vpsrldq $4,%ymm5,%ymm8
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpxor %ymm2,%ymm6,%ymm6
+ vpxor %ymm4,%ymm8,%ymm8
+ addl -120(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpxor %ymm8,%ymm6,%ymm6
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ vpsrld $31,%ymm6,%ymm8
+ xorl %ecx,%eax
+ addl -116(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpslldq $12,%ymm6,%ymm10
+ vpaddd %ymm6,%ymm6,%ymm6
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm6,%ymm6
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm6,%ymm6
+ xorl %ebx,%esi
+ addl -96(%r13),%ecx
+ vpxor %ymm10,%ymm6,%ymm6
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpaddd %ymm11,%ymm6,%ymm9
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ vmovdqu %ymm9,192(%rsp)
+ addl -92(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vpalignr $8,%ymm3,%ymm4,%ymm7
+ addl -88(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpsrldq $4,%ymm6,%ymm8
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpxor %ymm3,%ymm7,%ymm7
+ vpxor %ymm5,%ymm8,%ymm8
+ addl -84(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ vpxor %ymm8,%ymm7,%ymm7
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ vpsrld $31,%ymm7,%ymm8
+ xorl %edx,%ebp
+ addl -64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpslldq $12,%ymm7,%ymm10
+ vpaddd %ymm7,%ymm7,%ymm7
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm7,%ymm7
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm7,%ymm7
+ xorl %ecx,%eax
+ addl -60(%r13),%edx
+ vpxor %ymm10,%ymm7,%ymm7
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpaddd %ymm11,%ymm7,%ymm9
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vmovdqu %ymm9,224(%rsp)
+ addl -56(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -52(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -32(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -28(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -24(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -20(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ addl %r12d,%edx
+ leaq 128(%rsp),%r13
+
+
+ addl 0(%r8),%edx
+ addl 4(%r8),%esi
+ addl 8(%r8),%ebp
+ movl %edx,0(%r8)
+ addl 12(%r8),%ebx
+ movl %esi,4(%r8)
+ movl %edx,%eax
+ addl 16(%r8),%ecx
+ movl %ebp,%r12d
+ movl %ebp,8(%r8)
+ movl %ebx,%edx
+
+ movl %ebx,12(%r8)
+ movl %esi,%ebp
+ movl %ecx,16(%r8)
+
+ movl %ecx,%esi
+ movl %r12d,%ecx
+
+
+ cmpq %r10,%r9
+ jbe .Loop_avx2
+
+.Ldone_avx2:
+ vzeroupper
+ movq -40(%r11),%r14
+.cfi_restore %r14
+ movq -32(%r11),%r13
+.cfi_restore %r13
+ movq -24(%r11),%r12
+.cfi_restore %r12
+ movq -16(%r11),%rbp
+.cfi_restore %rbp
+ movq -8(%r11),%rbx
+.cfi_restore %rbx
+ leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ ret
+.cfi_endproc
+.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
+.section .rodata
+.align 64
+K_XX_XX:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
+.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+.text
+#endif
diff --git a/gen/bcm/sha1-x86_64-win.asm b/gen/bcm/sha1-x86_64-win.asm
new file mode 100644
index 0000000..92e9b9c
--- /dev/null
+++ b/gen/bcm/sha1-x86_64-win.asm
@@ -0,0 +1,5768 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .text code align=64
+
+
+global sha1_block_data_order_nohw
+
+ALIGN 16
+sha1_block_data_order_nohw:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha1_block_data_order_nohw:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ mov rax,rsp
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ mov r8,rdi
+ sub rsp,72
+ mov r9,rsi
+ and rsp,-64
+ mov r10,rdx
+ mov QWORD[64+rsp],rax
+
+$L$prologue:
+
+ mov esi,DWORD[r8]
+ mov edi,DWORD[4+r8]
+ mov r11d,DWORD[8+r8]
+ mov r12d,DWORD[12+r8]
+ mov r13d,DWORD[16+r8]
+ jmp NEAR $L$loop
+
+ALIGN 16
+$L$loop:
+ mov edx,DWORD[r9]
+ bswap edx
+ mov ebp,DWORD[4+r9]
+ mov eax,r12d
+ mov DWORD[rsp],edx
+ mov ecx,esi
+ bswap ebp
+ xor eax,r11d
+ rol ecx,5
+ and eax,edi
+ lea r13d,[1518500249+r13*1+rdx]
+ add r13d,ecx
+ xor eax,r12d
+ rol edi,30
+ add r13d,eax
+ mov r14d,DWORD[8+r9]
+ mov eax,r11d
+ mov DWORD[4+rsp],ebp
+ mov ecx,r13d
+ bswap r14d
+ xor eax,edi
+ rol ecx,5
+ and eax,esi
+ lea r12d,[1518500249+r12*1+rbp]
+ add r12d,ecx
+ xor eax,r11d
+ rol esi,30
+ add r12d,eax
+ mov edx,DWORD[12+r9]
+ mov eax,edi
+ mov DWORD[8+rsp],r14d
+ mov ecx,r12d
+ bswap edx
+ xor eax,esi
+ rol ecx,5
+ and eax,r13d
+ lea r11d,[1518500249+r11*1+r14]
+ add r11d,ecx
+ xor eax,edi
+ rol r13d,30
+ add r11d,eax
+ mov ebp,DWORD[16+r9]
+ mov eax,esi
+ mov DWORD[12+rsp],edx
+ mov ecx,r11d
+ bswap ebp
+ xor eax,r13d
+ rol ecx,5
+ and eax,r12d
+ lea edi,[1518500249+rdi*1+rdx]
+ add edi,ecx
+ xor eax,esi
+ rol r12d,30
+ add edi,eax
+ mov r14d,DWORD[20+r9]
+ mov eax,r13d
+ mov DWORD[16+rsp],ebp
+ mov ecx,edi
+ bswap r14d
+ xor eax,r12d
+ rol ecx,5
+ and eax,r11d
+ lea esi,[1518500249+rsi*1+rbp]
+ add esi,ecx
+ xor eax,r13d
+ rol r11d,30
+ add esi,eax
+ mov edx,DWORD[24+r9]
+ mov eax,r12d
+ mov DWORD[20+rsp],r14d
+ mov ecx,esi
+ bswap edx
+ xor eax,r11d
+ rol ecx,5
+ and eax,edi
+ lea r13d,[1518500249+r13*1+r14]
+ add r13d,ecx
+ xor eax,r12d
+ rol edi,30
+ add r13d,eax
+ mov ebp,DWORD[28+r9]
+ mov eax,r11d
+ mov DWORD[24+rsp],edx
+ mov ecx,r13d
+ bswap ebp
+ xor eax,edi
+ rol ecx,5
+ and eax,esi
+ lea r12d,[1518500249+r12*1+rdx]
+ add r12d,ecx
+ xor eax,r11d
+ rol esi,30
+ add r12d,eax
+ mov r14d,DWORD[32+r9]
+ mov eax,edi
+ mov DWORD[28+rsp],ebp
+ mov ecx,r12d
+ bswap r14d
+ xor eax,esi
+ rol ecx,5
+ and eax,r13d
+ lea r11d,[1518500249+r11*1+rbp]
+ add r11d,ecx
+ xor eax,edi
+ rol r13d,30
+ add r11d,eax
+ mov edx,DWORD[36+r9]
+ mov eax,esi
+ mov DWORD[32+rsp],r14d
+ mov ecx,r11d
+ bswap edx
+ xor eax,r13d
+ rol ecx,5
+ and eax,r12d
+ lea edi,[1518500249+rdi*1+r14]
+ add edi,ecx
+ xor eax,esi
+ rol r12d,30
+ add edi,eax
+ mov ebp,DWORD[40+r9]
+ mov eax,r13d
+ mov DWORD[36+rsp],edx
+ mov ecx,edi
+ bswap ebp
+ xor eax,r12d
+ rol ecx,5
+ and eax,r11d
+ lea esi,[1518500249+rsi*1+rdx]
+ add esi,ecx
+ xor eax,r13d
+ rol r11d,30
+ add esi,eax
+ mov r14d,DWORD[44+r9]
+ mov eax,r12d
+ mov DWORD[40+rsp],ebp
+ mov ecx,esi
+ bswap r14d
+ xor eax,r11d
+ rol ecx,5
+ and eax,edi
+ lea r13d,[1518500249+r13*1+rbp]
+ add r13d,ecx
+ xor eax,r12d
+ rol edi,30
+ add r13d,eax
+ mov edx,DWORD[48+r9]
+ mov eax,r11d
+ mov DWORD[44+rsp],r14d
+ mov ecx,r13d
+ bswap edx
+ xor eax,edi
+ rol ecx,5
+ and eax,esi
+ lea r12d,[1518500249+r12*1+r14]
+ add r12d,ecx
+ xor eax,r11d
+ rol esi,30
+ add r12d,eax
+ mov ebp,DWORD[52+r9]
+ mov eax,edi
+ mov DWORD[48+rsp],edx
+ mov ecx,r12d
+ bswap ebp
+ xor eax,esi
+ rol ecx,5
+ and eax,r13d
+ lea r11d,[1518500249+r11*1+rdx]
+ add r11d,ecx
+ xor eax,edi
+ rol r13d,30
+ add r11d,eax
+ mov r14d,DWORD[56+r9]
+ mov eax,esi
+ mov DWORD[52+rsp],ebp
+ mov ecx,r11d
+ bswap r14d
+ xor eax,r13d
+ rol ecx,5
+ and eax,r12d
+ lea edi,[1518500249+rdi*1+rbp]
+ add edi,ecx
+ xor eax,esi
+ rol r12d,30
+ add edi,eax
+ mov edx,DWORD[60+r9]
+ mov eax,r13d
+ mov DWORD[56+rsp],r14d
+ mov ecx,edi
+ bswap edx
+ xor eax,r12d
+ rol ecx,5
+ and eax,r11d
+ lea esi,[1518500249+rsi*1+r14]
+ add esi,ecx
+ xor eax,r13d
+ rol r11d,30
+ add esi,eax
+ xor ebp,DWORD[rsp]
+ mov eax,r12d
+ mov DWORD[60+rsp],edx
+ mov ecx,esi
+ xor ebp,DWORD[8+rsp]
+ xor eax,r11d
+ rol ecx,5
+ xor ebp,DWORD[32+rsp]
+ and eax,edi
+ lea r13d,[1518500249+r13*1+rdx]
+ rol edi,30
+ xor eax,r12d
+ add r13d,ecx
+ rol ebp,1
+ add r13d,eax
+ xor r14d,DWORD[4+rsp]
+ mov eax,r11d
+ mov DWORD[rsp],ebp
+ mov ecx,r13d
+ xor r14d,DWORD[12+rsp]
+ xor eax,edi
+ rol ecx,5
+ xor r14d,DWORD[36+rsp]
+ and eax,esi
+ lea r12d,[1518500249+r12*1+rbp]
+ rol esi,30
+ xor eax,r11d
+ add r12d,ecx
+ rol r14d,1
+ add r12d,eax
+ xor edx,DWORD[8+rsp]
+ mov eax,edi
+ mov DWORD[4+rsp],r14d
+ mov ecx,r12d
+ xor edx,DWORD[16+rsp]
+ xor eax,esi
+ rol ecx,5
+ xor edx,DWORD[40+rsp]
+ and eax,r13d
+ lea r11d,[1518500249+r11*1+r14]
+ rol r13d,30
+ xor eax,edi
+ add r11d,ecx
+ rol edx,1
+ add r11d,eax
+ xor ebp,DWORD[12+rsp]
+ mov eax,esi
+ mov DWORD[8+rsp],edx
+ mov ecx,r11d
+ xor ebp,DWORD[20+rsp]
+ xor eax,r13d
+ rol ecx,5
+ xor ebp,DWORD[44+rsp]
+ and eax,r12d
+ lea edi,[1518500249+rdi*1+rdx]
+ rol r12d,30
+ xor eax,esi
+ add edi,ecx
+ rol ebp,1
+ add edi,eax
+ xor r14d,DWORD[16+rsp]
+ mov eax,r13d
+ mov DWORD[12+rsp],ebp
+ mov ecx,edi
+ xor r14d,DWORD[24+rsp]
+ xor eax,r12d
+ rol ecx,5
+ xor r14d,DWORD[48+rsp]
+ and eax,r11d
+ lea esi,[1518500249+rsi*1+rbp]
+ rol r11d,30
+ xor eax,r13d
+ add esi,ecx
+ rol r14d,1
+ add esi,eax
+ xor edx,DWORD[20+rsp]
+ mov eax,edi
+ mov DWORD[16+rsp],r14d
+ mov ecx,esi
+ xor edx,DWORD[28+rsp]
+ xor eax,r12d
+ rol ecx,5
+ xor edx,DWORD[52+rsp]
+ lea r13d,[1859775393+r13*1+r14]
+ xor eax,r11d
+ add r13d,ecx
+ rol edi,30
+ add r13d,eax
+ rol edx,1
+ xor ebp,DWORD[24+rsp]
+ mov eax,esi
+ mov DWORD[20+rsp],edx
+ mov ecx,r13d
+ xor ebp,DWORD[32+rsp]
+ xor eax,r11d
+ rol ecx,5
+ xor ebp,DWORD[56+rsp]
+ lea r12d,[1859775393+r12*1+rdx]
+ xor eax,edi
+ add r12d,ecx
+ rol esi,30
+ add r12d,eax
+ rol ebp,1
+ xor r14d,DWORD[28+rsp]
+ mov eax,r13d
+ mov DWORD[24+rsp],ebp
+ mov ecx,r12d
+ xor r14d,DWORD[36+rsp]
+ xor eax,edi
+ rol ecx,5
+ xor r14d,DWORD[60+rsp]
+ lea r11d,[1859775393+r11*1+rbp]
+ xor eax,esi
+ add r11d,ecx
+ rol r13d,30
+ add r11d,eax
+ rol r14d,1
+ xor edx,DWORD[32+rsp]
+ mov eax,r12d
+ mov DWORD[28+rsp],r14d
+ mov ecx,r11d
+ xor edx,DWORD[40+rsp]
+ xor eax,esi
+ rol ecx,5
+ xor edx,DWORD[rsp]
+ lea edi,[1859775393+rdi*1+r14]
+ xor eax,r13d
+ add edi,ecx
+ rol r12d,30
+ add edi,eax
+ rol edx,1
+ xor ebp,DWORD[36+rsp]
+ mov eax,r11d
+ mov DWORD[32+rsp],edx
+ mov ecx,edi
+ xor ebp,DWORD[44+rsp]
+ xor eax,r13d
+ rol ecx,5
+ xor ebp,DWORD[4+rsp]
+ lea esi,[1859775393+rsi*1+rdx]
+ xor eax,r12d
+ add esi,ecx
+ rol r11d,30
+ add esi,eax
+ rol ebp,1
+ xor r14d,DWORD[40+rsp]
+ mov eax,edi
+ mov DWORD[36+rsp],ebp
+ mov ecx,esi
+ xor r14d,DWORD[48+rsp]
+ xor eax,r12d
+ rol ecx,5
+ xor r14d,DWORD[8+rsp]
+ lea r13d,[1859775393+r13*1+rbp]
+ xor eax,r11d
+ add r13d,ecx
+ rol edi,30
+ add r13d,eax
+ rol r14d,1
+ xor edx,DWORD[44+rsp]
+ mov eax,esi
+ mov DWORD[40+rsp],r14d
+ mov ecx,r13d
+ xor edx,DWORD[52+rsp]
+ xor eax,r11d
+ rol ecx,5
+ xor edx,DWORD[12+rsp]
+ lea r12d,[1859775393+r12*1+r14]
+ xor eax,edi
+ add r12d,ecx
+ rol esi,30
+ add r12d,eax
+ rol edx,1
+ xor ebp,DWORD[48+rsp]
+ mov eax,r13d
+ mov DWORD[44+rsp],edx
+ mov ecx,r12d
+ xor ebp,DWORD[56+rsp]
+ xor eax,edi
+ rol ecx,5
+ xor ebp,DWORD[16+rsp]
+ lea r11d,[1859775393+r11*1+rdx]
+ xor eax,esi
+ add r11d,ecx
+ rol r13d,30
+ add r11d,eax
+ rol ebp,1
+ xor r14d,DWORD[52+rsp]
+ mov eax,r12d
+ mov DWORD[48+rsp],ebp
+ mov ecx,r11d
+ xor r14d,DWORD[60+rsp]
+ xor eax,esi
+ rol ecx,5
+ xor r14d,DWORD[20+rsp]
+ lea edi,[1859775393+rdi*1+rbp]
+ xor eax,r13d
+ add edi,ecx
+ rol r12d,30
+ add edi,eax
+ rol r14d,1
+ xor edx,DWORD[56+rsp]
+ mov eax,r11d
+ mov DWORD[52+rsp],r14d
+ mov ecx,edi
+ xor edx,DWORD[rsp]
+ xor eax,r13d
+ rol ecx,5
+ xor edx,DWORD[24+rsp]
+ lea esi,[1859775393+rsi*1+r14]
+ xor eax,r12d
+ add esi,ecx
+ rol r11d,30
+ add esi,eax
+ rol edx,1
+ xor ebp,DWORD[60+rsp]
+ mov eax,edi
+ mov DWORD[56+rsp],edx
+ mov ecx,esi
+ xor ebp,DWORD[4+rsp]
+ xor eax,r12d
+ rol ecx,5
+ xor ebp,DWORD[28+rsp]
+ lea r13d,[1859775393+r13*1+rdx]
+ xor eax,r11d
+ add r13d,ecx
+ rol edi,30
+ add r13d,eax
+ rol ebp,1
+ xor r14d,DWORD[rsp]
+ mov eax,esi
+ mov DWORD[60+rsp],ebp
+ mov ecx,r13d
+ xor r14d,DWORD[8+rsp]
+ xor eax,r11d
+ rol ecx,5
+ xor r14d,DWORD[32+rsp]
+ lea r12d,[1859775393+r12*1+rbp]
+ xor eax,edi
+ add r12d,ecx
+ rol esi,30
+ add r12d,eax
+ rol r14d,1
+ xor edx,DWORD[4+rsp]
+ mov eax,r13d
+ mov DWORD[rsp],r14d
+ mov ecx,r12d
+ xor edx,DWORD[12+rsp]
+ xor eax,edi
+ rol ecx,5
+ xor edx,DWORD[36+rsp]
+ lea r11d,[1859775393+r11*1+r14]
+ xor eax,esi
+ add r11d,ecx
+ rol r13d,30
+ add r11d,eax
+ rol edx,1
+ xor ebp,DWORD[8+rsp]
+ mov eax,r12d
+ mov DWORD[4+rsp],edx
+ mov ecx,r11d
+ xor ebp,DWORD[16+rsp]
+ xor eax,esi
+ rol ecx,5
+ xor ebp,DWORD[40+rsp]
+ lea edi,[1859775393+rdi*1+rdx]
+ xor eax,r13d
+ add edi,ecx
+ rol r12d,30
+ add edi,eax
+ rol ebp,1
+ xor r14d,DWORD[12+rsp]
+ mov eax,r11d
+ mov DWORD[8+rsp],ebp
+ mov ecx,edi
+ xor r14d,DWORD[20+rsp]
+ xor eax,r13d
+ rol ecx,5
+ xor r14d,DWORD[44+rsp]
+ lea esi,[1859775393+rsi*1+rbp]
+ xor eax,r12d
+ add esi,ecx
+ rol r11d,30
+ add esi,eax
+ rol r14d,1
+ xor edx,DWORD[16+rsp]
+ mov eax,edi
+ mov DWORD[12+rsp],r14d
+ mov ecx,esi
+ xor edx,DWORD[24+rsp]
+ xor eax,r12d
+ rol ecx,5
+ xor edx,DWORD[48+rsp]
+ lea r13d,[1859775393+r13*1+r14]
+ xor eax,r11d
+ add r13d,ecx
+ rol edi,30
+ add r13d,eax
+ rol edx,1
+ xor ebp,DWORD[20+rsp]
+ mov eax,esi
+ mov DWORD[16+rsp],edx
+ mov ecx,r13d
+ xor ebp,DWORD[28+rsp]
+ xor eax,r11d
+ rol ecx,5
+ xor ebp,DWORD[52+rsp]
+ lea r12d,[1859775393+r12*1+rdx]
+ xor eax,edi
+ add r12d,ecx
+ rol esi,30
+ add r12d,eax
+ rol ebp,1
+ xor r14d,DWORD[24+rsp]
+ mov eax,r13d
+ mov DWORD[20+rsp],ebp
+ mov ecx,r12d
+ xor r14d,DWORD[32+rsp]
+ xor eax,edi
+ rol ecx,5
+ xor r14d,DWORD[56+rsp]
+ lea r11d,[1859775393+r11*1+rbp]
+ xor eax,esi
+ add r11d,ecx
+ rol r13d,30
+ add r11d,eax
+ rol r14d,1
+ xor edx,DWORD[28+rsp]
+ mov eax,r12d
+ mov DWORD[24+rsp],r14d
+ mov ecx,r11d
+ xor edx,DWORD[36+rsp]
+ xor eax,esi
+ rol ecx,5
+ xor edx,DWORD[60+rsp]
+ lea edi,[1859775393+rdi*1+r14]
+ xor eax,r13d
+ add edi,ecx
+ rol r12d,30
+ add edi,eax
+ rol edx,1
+ xor ebp,DWORD[32+rsp]
+ mov eax,r11d
+ mov DWORD[28+rsp],edx
+ mov ecx,edi
+ xor ebp,DWORD[40+rsp]
+ xor eax,r13d
+ rol ecx,5
+ xor ebp,DWORD[rsp]
+ lea esi,[1859775393+rsi*1+rdx]
+ xor eax,r12d
+ add esi,ecx
+ rol r11d,30
+ add esi,eax
+ rol ebp,1
+ xor r14d,DWORD[36+rsp]
+ mov eax,r12d
+ mov DWORD[32+rsp],ebp
+ mov ebx,r12d
+ xor r14d,DWORD[44+rsp]
+ and eax,r11d
+ mov ecx,esi
+ xor r14d,DWORD[4+rsp]
+ lea r13d,[((-1894007588))+r13*1+rbp]
+ xor ebx,r11d
+ rol ecx,5
+ add r13d,eax
+ rol r14d,1
+ and ebx,edi
+ add r13d,ecx
+ rol edi,30
+ add r13d,ebx
+ xor edx,DWORD[40+rsp]
+ mov eax,r11d
+ mov DWORD[36+rsp],r14d
+ mov ebx,r11d
+ xor edx,DWORD[48+rsp]
+ and eax,edi
+ mov ecx,r13d
+ xor edx,DWORD[8+rsp]
+ lea r12d,[((-1894007588))+r12*1+r14]
+ xor ebx,edi
+ rol ecx,5
+ add r12d,eax
+ rol edx,1
+ and ebx,esi
+ add r12d,ecx
+ rol esi,30
+ add r12d,ebx
+ xor ebp,DWORD[44+rsp]
+ mov eax,edi
+ mov DWORD[40+rsp],edx
+ mov ebx,edi
+ xor ebp,DWORD[52+rsp]
+ and eax,esi
+ mov ecx,r12d
+ xor ebp,DWORD[12+rsp]
+ lea r11d,[((-1894007588))+r11*1+rdx]
+ xor ebx,esi
+ rol ecx,5
+ add r11d,eax
+ rol ebp,1
+ and ebx,r13d
+ add r11d,ecx
+ rol r13d,30
+ add r11d,ebx
+ xor r14d,DWORD[48+rsp]
+ mov eax,esi
+ mov DWORD[44+rsp],ebp
+ mov ebx,esi
+ xor r14d,DWORD[56+rsp]
+ and eax,r13d
+ mov ecx,r11d
+ xor r14d,DWORD[16+rsp]
+ lea edi,[((-1894007588))+rdi*1+rbp]
+ xor ebx,r13d
+ rol ecx,5
+ add edi,eax
+ rol r14d,1
+ and ebx,r12d
+ add edi,ecx
+ rol r12d,30
+ add edi,ebx
+ xor edx,DWORD[52+rsp]
+ mov eax,r13d
+ mov DWORD[48+rsp],r14d
+ mov ebx,r13d
+ xor edx,DWORD[60+rsp]
+ and eax,r12d
+ mov ecx,edi
+ xor edx,DWORD[20+rsp]
+ lea esi,[((-1894007588))+rsi*1+r14]
+ xor ebx,r12d
+ rol ecx,5
+ add esi,eax
+ rol edx,1
+ and ebx,r11d
+ add esi,ecx
+ rol r11d,30
+ add esi,ebx
+ xor ebp,DWORD[56+rsp]
+ mov eax,r12d
+ mov DWORD[52+rsp],edx
+ mov ebx,r12d
+ xor ebp,DWORD[rsp]
+ and eax,r11d
+ mov ecx,esi
+ xor ebp,DWORD[24+rsp]
+ lea r13d,[((-1894007588))+r13*1+rdx]
+ xor ebx,r11d
+ rol ecx,5
+ add r13d,eax
+ rol ebp,1
+ and ebx,edi
+ add r13d,ecx
+ rol edi,30
+ add r13d,ebx
+ xor r14d,DWORD[60+rsp]
+ mov eax,r11d
+ mov DWORD[56+rsp],ebp
+ mov ebx,r11d
+ xor r14d,DWORD[4+rsp]
+ and eax,edi
+ mov ecx,r13d
+ xor r14d,DWORD[28+rsp]
+ lea r12d,[((-1894007588))+r12*1+rbp]
+ xor ebx,edi
+ rol ecx,5
+ add r12d,eax
+ rol r14d,1
+ and ebx,esi
+ add r12d,ecx
+ rol esi,30
+ add r12d,ebx
+ xor edx,DWORD[rsp]
+ mov eax,edi
+ mov DWORD[60+rsp],r14d
+ mov ebx,edi
+ xor edx,DWORD[8+rsp]
+ and eax,esi
+ mov ecx,r12d
+ xor edx,DWORD[32+rsp]
+ lea r11d,[((-1894007588))+r11*1+r14]
+ xor ebx,esi
+ rol ecx,5
+ add r11d,eax
+ rol edx,1
+ and ebx,r13d
+ add r11d,ecx
+ rol r13d,30
+ add r11d,ebx
+ xor ebp,DWORD[4+rsp]
+ mov eax,esi
+ mov DWORD[rsp],edx
+ mov ebx,esi
+ xor ebp,DWORD[12+rsp]
+ and eax,r13d
+ mov ecx,r11d
+ xor ebp,DWORD[36+rsp]
+ lea edi,[((-1894007588))+rdi*1+rdx]
+ xor ebx,r13d
+ rol ecx,5
+ add edi,eax
+ rol ebp,1
+ and ebx,r12d
+ add edi,ecx
+ rol r12d,30
+ add edi,ebx
+ xor r14d,DWORD[8+rsp]
+ mov eax,r13d
+ mov DWORD[4+rsp],ebp
+ mov ebx,r13d
+ xor r14d,DWORD[16+rsp]
+ and eax,r12d
+ mov ecx,edi
+ xor r14d,DWORD[40+rsp]
+ lea esi,[((-1894007588))+rsi*1+rbp]
+ xor ebx,r12d
+ rol ecx,5
+ add esi,eax
+ rol r14d,1
+ and ebx,r11d
+ add esi,ecx
+ rol r11d,30
+ add esi,ebx
+ xor edx,DWORD[12+rsp]
+ mov eax,r12d
+ mov DWORD[8+rsp],r14d
+ mov ebx,r12d
+ xor edx,DWORD[20+rsp]
+ and eax,r11d
+ mov ecx,esi
+ xor edx,DWORD[44+rsp]
+ lea r13d,[((-1894007588))+r13*1+r14]
+ xor ebx,r11d
+ rol ecx,5
+ add r13d,eax
+ rol edx,1
+ and ebx,edi
+ add r13d,ecx
+ rol edi,30
+ add r13d,ebx
+ xor ebp,DWORD[16+rsp]
+ mov eax,r11d
+ mov DWORD[12+rsp],edx
+ mov ebx,r11d
+ xor ebp,DWORD[24+rsp]
+ and eax,edi
+ mov ecx,r13d
+ xor ebp,DWORD[48+rsp]
+ lea r12d,[((-1894007588))+r12*1+rdx]
+ xor ebx,edi
+ rol ecx,5
+ add r12d,eax
+ rol ebp,1
+ and ebx,esi
+ add r12d,ecx
+ rol esi,30
+ add r12d,ebx
+ xor r14d,DWORD[20+rsp]
+ mov eax,edi
+ mov DWORD[16+rsp],ebp
+ mov ebx,edi
+ xor r14d,DWORD[28+rsp]
+ and eax,esi
+ mov ecx,r12d
+ xor r14d,DWORD[52+rsp]
+ lea r11d,[((-1894007588))+r11*1+rbp]
+ xor ebx,esi
+ rol ecx,5
+ add r11d,eax
+ rol r14d,1
+ and ebx,r13d
+ add r11d,ecx
+ rol r13d,30
+ add r11d,ebx
+ xor edx,DWORD[24+rsp]
+ mov eax,esi
+ mov DWORD[20+rsp],r14d
+ mov ebx,esi
+ xor edx,DWORD[32+rsp]
+ and eax,r13d
+ mov ecx,r11d
+ xor edx,DWORD[56+rsp]
+ lea edi,[((-1894007588))+rdi*1+r14]
+ xor ebx,r13d
+ rol ecx,5
+ add edi,eax
+ rol edx,1
+ and ebx,r12d
+ add edi,ecx
+ rol r12d,30
+ add edi,ebx
+ xor ebp,DWORD[28+rsp]
+ mov eax,r13d
+ mov DWORD[24+rsp],edx
+ mov ebx,r13d
+ xor ebp,DWORD[36+rsp]
+ and eax,r12d
+ mov ecx,edi
+ xor ebp,DWORD[60+rsp]
+ lea esi,[((-1894007588))+rsi*1+rdx]
+ xor ebx,r12d
+ rol ecx,5
+ add esi,eax
+ rol ebp,1
+ and ebx,r11d
+ add esi,ecx
+ rol r11d,30
+ add esi,ebx
+ xor r14d,DWORD[32+rsp]
+ mov eax,r12d
+ mov DWORD[28+rsp],ebp
+ mov ebx,r12d
+ xor r14d,DWORD[40+rsp]
+ and eax,r11d
+ mov ecx,esi
+ xor r14d,DWORD[rsp]
+ lea r13d,[((-1894007588))+r13*1+rbp]
+ xor ebx,r11d
+ rol ecx,5
+ add r13d,eax
+ rol r14d,1
+ and ebx,edi
+ add r13d,ecx
+ rol edi,30
+ add r13d,ebx
+ xor edx,DWORD[36+rsp]
+ mov eax,r11d
+ mov DWORD[32+rsp],r14d
+ mov ebx,r11d
+ xor edx,DWORD[44+rsp]
+ and eax,edi
+ mov ecx,r13d
+ xor edx,DWORD[4+rsp]
+ lea r12d,[((-1894007588))+r12*1+r14]
+ xor ebx,edi
+ rol ecx,5
+ add r12d,eax
+ rol edx,1
+ and ebx,esi
+ add r12d,ecx
+ rol esi,30
+ add r12d,ebx
+ xor ebp,DWORD[40+rsp]
+ mov eax,edi
+ mov DWORD[36+rsp],edx
+ mov ebx,edi
+ xor ebp,DWORD[48+rsp]
+ and eax,esi
+ mov ecx,r12d
+ xor ebp,DWORD[8+rsp]
+ lea r11d,[((-1894007588))+r11*1+rdx]
+ xor ebx,esi
+ rol ecx,5
+ add r11d,eax
+ rol ebp,1
+ and ebx,r13d
+ add r11d,ecx
+ rol r13d,30
+ add r11d,ebx
+ xor r14d,DWORD[44+rsp]
+ mov eax,esi
+ mov DWORD[40+rsp],ebp
+ mov ebx,esi
+ xor r14d,DWORD[52+rsp]
+ and eax,r13d
+ mov ecx,r11d
+ xor r14d,DWORD[12+rsp]
+ lea edi,[((-1894007588))+rdi*1+rbp]
+ xor ebx,r13d
+ rol ecx,5
+ add edi,eax
+ rol r14d,1
+ and ebx,r12d
+ add edi,ecx
+ rol r12d,30
+ add edi,ebx
+ xor edx,DWORD[48+rsp]
+ mov eax,r13d
+ mov DWORD[44+rsp],r14d
+ mov ebx,r13d
+ xor edx,DWORD[56+rsp]
+ and eax,r12d
+ mov ecx,edi
+ xor edx,DWORD[16+rsp]
+ lea esi,[((-1894007588))+rsi*1+r14]
+ xor ebx,r12d
+ rol ecx,5
+ add esi,eax
+ rol edx,1
+ and ebx,r11d
+ add esi,ecx
+ rol r11d,30
+ add esi,ebx
+ xor ebp,DWORD[52+rsp]
+ mov eax,edi
+ mov DWORD[48+rsp],edx
+ mov ecx,esi
+ xor ebp,DWORD[60+rsp]
+ xor eax,r12d
+ rol ecx,5
+ xor ebp,DWORD[20+rsp]
+ lea r13d,[((-899497514))+r13*1+rdx]
+ xor eax,r11d
+ add r13d,ecx
+ rol edi,30
+ add r13d,eax
+ rol ebp,1
+ xor r14d,DWORD[56+rsp]
+ mov eax,esi
+ mov DWORD[52+rsp],ebp
+ mov ecx,r13d
+ xor r14d,DWORD[rsp]
+ xor eax,r11d
+ rol ecx,5
+ xor r14d,DWORD[24+rsp]
+ lea r12d,[((-899497514))+r12*1+rbp]
+ xor eax,edi
+ add r12d,ecx
+ rol esi,30
+ add r12d,eax
+ rol r14d,1
+ xor edx,DWORD[60+rsp]
+ mov eax,r13d
+ mov DWORD[56+rsp],r14d
+ mov ecx,r12d
+ xor edx,DWORD[4+rsp]
+ xor eax,edi
+ rol ecx,5
+ xor edx,DWORD[28+rsp]
+ lea r11d,[((-899497514))+r11*1+r14]
+ xor eax,esi
+ add r11d,ecx
+ rol r13d,30
+ add r11d,eax
+ rol edx,1
+ xor ebp,DWORD[rsp]
+ mov eax,r12d
+ mov DWORD[60+rsp],edx
+ mov ecx,r11d
+ xor ebp,DWORD[8+rsp]
+ xor eax,esi
+ rol ecx,5
+ xor ebp,DWORD[32+rsp]
+ lea edi,[((-899497514))+rdi*1+rdx]
+ xor eax,r13d
+ add edi,ecx
+ rol r12d,30
+ add edi,eax
+ rol ebp,1
+ xor r14d,DWORD[4+rsp]
+ mov eax,r11d
+ mov DWORD[rsp],ebp
+ mov ecx,edi
+ xor r14d,DWORD[12+rsp]
+ xor eax,r13d
+ rol ecx,5
+ xor r14d,DWORD[36+rsp]
+ lea esi,[((-899497514))+rsi*1+rbp]
+ xor eax,r12d
+ add esi,ecx
+ rol r11d,30
+ add esi,eax
+ rol r14d,1
+ xor edx,DWORD[8+rsp]
+ mov eax,edi
+ mov DWORD[4+rsp],r14d
+ mov ecx,esi
+ xor edx,DWORD[16+rsp]
+ xor eax,r12d
+ rol ecx,5
+ xor edx,DWORD[40+rsp]
+ lea r13d,[((-899497514))+r13*1+r14]
+ xor eax,r11d
+ add r13d,ecx
+ rol edi,30
+ add r13d,eax
+ rol edx,1
+ xor ebp,DWORD[12+rsp]
+ mov eax,esi
+ mov DWORD[8+rsp],edx
+ mov ecx,r13d
+ xor ebp,DWORD[20+rsp]
+ xor eax,r11d
+ rol ecx,5
+ xor ebp,DWORD[44+rsp]
+ lea r12d,[((-899497514))+r12*1+rdx]
+ xor eax,edi
+ add r12d,ecx
+ rol esi,30
+ add r12d,eax
+ rol ebp,1
+ xor r14d,DWORD[16+rsp]
+ mov eax,r13d
+ mov DWORD[12+rsp],ebp
+ mov ecx,r12d
+ xor r14d,DWORD[24+rsp]
+ xor eax,edi
+ rol ecx,5
+ xor r14d,DWORD[48+rsp]
+ lea r11d,[((-899497514))+r11*1+rbp]
+ xor eax,esi
+ add r11d,ecx
+ rol r13d,30
+ add r11d,eax
+ rol r14d,1
+ xor edx,DWORD[20+rsp]
+ mov eax,r12d
+ mov DWORD[16+rsp],r14d
+ mov ecx,r11d
+ xor edx,DWORD[28+rsp]
+ xor eax,esi
+ rol ecx,5
+ xor edx,DWORD[52+rsp]
+ lea edi,[((-899497514))+rdi*1+r14]
+ xor eax,r13d
+ add edi,ecx
+ rol r12d,30
+ add edi,eax
+ rol edx,1
+ xor ebp,DWORD[24+rsp]
+ mov eax,r11d
+ mov DWORD[20+rsp],edx
+ mov ecx,edi
+ xor ebp,DWORD[32+rsp]
+ xor eax,r13d
+ rol ecx,5
+ xor ebp,DWORD[56+rsp]
+ lea esi,[((-899497514))+rsi*1+rdx]
+ xor eax,r12d
+ add esi,ecx
+ rol r11d,30
+ add esi,eax
+ rol ebp,1
+ xor r14d,DWORD[28+rsp]
+ mov eax,edi
+ mov DWORD[24+rsp],ebp
+ mov ecx,esi
+ xor r14d,DWORD[36+rsp]
+ xor eax,r12d
+ rol ecx,5
+ xor r14d,DWORD[60+rsp]
+ lea r13d,[((-899497514))+r13*1+rbp]
+ xor eax,r11d
+ add r13d,ecx
+ rol edi,30
+ add r13d,eax
+ rol r14d,1
+ xor edx,DWORD[32+rsp]
+ mov eax,esi
+ mov DWORD[28+rsp],r14d
+ mov ecx,r13d
+ xor edx,DWORD[40+rsp]
+ xor eax,r11d
+ rol ecx,5
+ xor edx,DWORD[rsp]
+ lea r12d,[((-899497514))+r12*1+r14]
+ xor eax,edi
+ add r12d,ecx
+ rol esi,30
+ add r12d,eax
+ rol edx,1
+ xor ebp,DWORD[36+rsp]
+ mov eax,r13d
+
+ mov ecx,r12d
+ xor ebp,DWORD[44+rsp]
+ xor eax,edi
+ rol ecx,5
+ xor ebp,DWORD[4+rsp]
+ lea r11d,[((-899497514))+r11*1+rdx]
+ xor eax,esi
+ add r11d,ecx
+ rol r13d,30
+ add r11d,eax
+ rol ebp,1
+ xor r14d,DWORD[40+rsp]
+ mov eax,r12d
+
+ mov ecx,r11d
+ xor r14d,DWORD[48+rsp]
+ xor eax,esi
+ rol ecx,5
+ xor r14d,DWORD[8+rsp]
+ lea edi,[((-899497514))+rdi*1+rbp]
+ xor eax,r13d
+ add edi,ecx
+ rol r12d,30
+ add edi,eax
+ rol r14d,1
+ xor edx,DWORD[44+rsp]
+ mov eax,r11d
+
+ mov ecx,edi
+ xor edx,DWORD[52+rsp]
+ xor eax,r13d
+ rol ecx,5
+ xor edx,DWORD[12+rsp]
+ lea esi,[((-899497514))+rsi*1+r14]
+ xor eax,r12d
+ add esi,ecx
+ rol r11d,30
+ add esi,eax
+ rol edx,1
+ xor ebp,DWORD[48+rsp]
+ mov eax,edi
+
+ mov ecx,esi
+ xor ebp,DWORD[56+rsp]
+ xor eax,r12d
+ rol ecx,5
+ xor ebp,DWORD[16+rsp]
+ lea r13d,[((-899497514))+r13*1+rdx]
+ xor eax,r11d
+ add r13d,ecx
+ rol edi,30
+ add r13d,eax
+ rol ebp,1
+ xor r14d,DWORD[52+rsp]
+ mov eax,esi
+
+ mov ecx,r13d
+ xor r14d,DWORD[60+rsp]
+ xor eax,r11d
+ rol ecx,5
+ xor r14d,DWORD[20+rsp]
+ lea r12d,[((-899497514))+r12*1+rbp]
+ xor eax,edi
+ add r12d,ecx
+ rol esi,30
+ add r12d,eax
+ rol r14d,1
+ xor edx,DWORD[56+rsp]
+ mov eax,r13d
+
+ mov ecx,r12d
+ xor edx,DWORD[rsp]
+ xor eax,edi
+ rol ecx,5
+ xor edx,DWORD[24+rsp]
+ lea r11d,[((-899497514))+r11*1+r14]
+ xor eax,esi
+ add r11d,ecx
+ rol r13d,30
+ add r11d,eax
+ rol edx,1
+ xor ebp,DWORD[60+rsp]
+ mov eax,r12d
+
+ mov ecx,r11d
+ xor ebp,DWORD[4+rsp]
+ xor eax,esi
+ rol ecx,5
+ xor ebp,DWORD[28+rsp]
+ lea edi,[((-899497514))+rdi*1+rdx]
+ xor eax,r13d
+ add edi,ecx
+ rol r12d,30
+ add edi,eax
+ rol ebp,1
+ mov eax,r11d
+ mov ecx,edi
+ xor eax,r13d
+ lea esi,[((-899497514))+rsi*1+rbp]
+ rol ecx,5
+ xor eax,r12d
+ add esi,ecx
+ rol r11d,30
+ add esi,eax
+ add esi,DWORD[r8]
+ add edi,DWORD[4+r8]
+ add r11d,DWORD[8+r8]
+ add r12d,DWORD[12+r8]
+ add r13d,DWORD[16+r8]
+ mov DWORD[r8],esi
+ mov DWORD[4+r8],edi
+ mov DWORD[8+r8],r11d
+ mov DWORD[12+r8],r12d
+ mov DWORD[16+r8],r13d
+
+ sub r10,1
+ lea r9,[64+r9]
+ jnz NEAR $L$loop
+
+ mov rsi,QWORD[64+rsp]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_sha1_block_data_order_nohw:
+global sha1_block_data_order_hw
+
+ALIGN 32
+sha1_block_data_order_hw:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha1_block_data_order_hw:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ lea rsp,[((-72))+rsp]
+ movaps XMMWORD[(-8-64)+rax],xmm6
+ movaps XMMWORD[(-8-48)+rax],xmm7
+ movaps XMMWORD[(-8-32)+rax],xmm8
+ movaps XMMWORD[(-8-16)+rax],xmm9
+$L$prologue_shaext:
+ movdqu xmm0,XMMWORD[rdi]
+ movd xmm1,DWORD[16+rdi]
+ movdqa xmm3,XMMWORD[((K_XX_XX+160))]
+
+ movdqu xmm4,XMMWORD[rsi]
+ pshufd xmm0,xmm0,27
+ movdqu xmm5,XMMWORD[16+rsi]
+ pshufd xmm1,xmm1,27
+ movdqu xmm6,XMMWORD[32+rsi]
+DB 102,15,56,0,227
+ movdqu xmm7,XMMWORD[48+rsi]
+DB 102,15,56,0,235
+DB 102,15,56,0,243
+ movdqa xmm9,xmm1
+DB 102,15,56,0,251
+ jmp NEAR $L$oop_shaext
+
+ALIGN 16
+$L$oop_shaext:
+ dec rdx
+ lea r8,[64+rsi]
+ paddd xmm1,xmm4
+ cmovne rsi,r8
+ prefetcht0 [512+rsi]
+ movdqa xmm8,xmm0
+ DB 15,56,201,229
+ movdqa xmm2,xmm0
+ DB 15,58,204,193,0
+ DB 15,56,200,213
+ pxor xmm4,xmm6
+ DB 15,56,201,238
+ DB 15,56,202,231
+
+ movdqa xmm1,xmm0
+ DB 15,58,204,194,0
+ DB 15,56,200,206
+ pxor xmm5,xmm7
+ DB 15,56,202,236
+ DB 15,56,201,247
+ movdqa xmm2,xmm0
+ DB 15,58,204,193,0
+ DB 15,56,200,215
+ pxor xmm6,xmm4
+ DB 15,56,201,252
+ DB 15,56,202,245
+
+ movdqa xmm1,xmm0
+ DB 15,58,204,194,0
+ DB 15,56,200,204
+ pxor xmm7,xmm5
+ DB 15,56,202,254
+ DB 15,56,201,229
+ movdqa xmm2,xmm0
+ DB 15,58,204,193,0
+ DB 15,56,200,213
+ pxor xmm4,xmm6
+ DB 15,56,201,238
+ DB 15,56,202,231
+
+ movdqa xmm1,xmm0
+ DB 15,58,204,194,1
+ DB 15,56,200,206
+ pxor xmm5,xmm7
+ DB 15,56,202,236
+ DB 15,56,201,247
+ movdqa xmm2,xmm0
+ DB 15,58,204,193,1
+ DB 15,56,200,215
+ pxor xmm6,xmm4
+ DB 15,56,201,252
+ DB 15,56,202,245
+
+ movdqa xmm1,xmm0
+ DB 15,58,204,194,1
+ DB 15,56,200,204
+ pxor xmm7,xmm5
+ DB 15,56,202,254
+ DB 15,56,201,229
+ movdqa xmm2,xmm0
+ DB 15,58,204,193,1
+ DB 15,56,200,213
+ pxor xmm4,xmm6
+ DB 15,56,201,238
+ DB 15,56,202,231
+
+ movdqa xmm1,xmm0
+ DB 15,58,204,194,1
+ DB 15,56,200,206
+ pxor xmm5,xmm7
+ DB 15,56,202,236
+ DB 15,56,201,247
+ movdqa xmm2,xmm0
+ DB 15,58,204,193,2
+ DB 15,56,200,215
+ pxor xmm6,xmm4
+ DB 15,56,201,252
+ DB 15,56,202,245
+
+ movdqa xmm1,xmm0
+ DB 15,58,204,194,2
+ DB 15,56,200,204
+ pxor xmm7,xmm5
+ DB 15,56,202,254
+ DB 15,56,201,229
+ movdqa xmm2,xmm0
+ DB 15,58,204,193,2
+ DB 15,56,200,213
+ pxor xmm4,xmm6
+ DB 15,56,201,238
+ DB 15,56,202,231
+
+ movdqa xmm1,xmm0
+ DB 15,58,204,194,2
+ DB 15,56,200,206
+ pxor xmm5,xmm7
+ DB 15,56,202,236
+ DB 15,56,201,247
+ movdqa xmm2,xmm0
+ DB 15,58,204,193,2
+ DB 15,56,200,215
+ pxor xmm6,xmm4
+ DB 15,56,201,252
+ DB 15,56,202,245
+
+ movdqa xmm1,xmm0
+ DB 15,58,204,194,3
+ DB 15,56,200,204
+ pxor xmm7,xmm5
+ DB 15,56,202,254
+ movdqu xmm4,XMMWORD[rsi]
+ movdqa xmm2,xmm0
+ DB 15,58,204,193,3
+ DB 15,56,200,213
+ movdqu xmm5,XMMWORD[16+rsi]
+DB 102,15,56,0,227
+
+ movdqa xmm1,xmm0
+ DB 15,58,204,194,3
+ DB 15,56,200,206
+ movdqu xmm6,XMMWORD[32+rsi]
+DB 102,15,56,0,235
+
+ movdqa xmm2,xmm0
+ DB 15,58,204,193,3
+ DB 15,56,200,215
+ movdqu xmm7,XMMWORD[48+rsi]
+DB 102,15,56,0,243
+
+ movdqa xmm1,xmm0
+ DB 15,58,204,194,3
+ DB 65,15,56,200,201
+DB 102,15,56,0,251
+
+ paddd xmm0,xmm8
+ movdqa xmm9,xmm1
+
+ jnz NEAR $L$oop_shaext
+
+ pshufd xmm0,xmm0,27
+ pshufd xmm1,xmm1,27
+ movdqu XMMWORD[rdi],xmm0
+ movd DWORD[16+rdi],xmm1
+ movaps xmm6,XMMWORD[((-8-64))+rax]
+ movaps xmm7,XMMWORD[((-8-48))+rax]
+ movaps xmm8,XMMWORD[((-8-32))+rax]
+ movaps xmm9,XMMWORD[((-8-16))+rax]
+ mov rsp,rax
+$L$epilogue_shaext:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_sha1_block_data_order_hw:
+global sha1_block_data_order_ssse3
+
+ALIGN 16
+sha1_block_data_order_ssse3:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha1_block_data_order_ssse3:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ mov r11,rsp
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ lea rsp,[((-160))+rsp]
+ movaps XMMWORD[(-40-96)+r11],xmm6
+ movaps XMMWORD[(-40-80)+r11],xmm7
+ movaps XMMWORD[(-40-64)+r11],xmm8
+ movaps XMMWORD[(-40-48)+r11],xmm9
+ movaps XMMWORD[(-40-32)+r11],xmm10
+ movaps XMMWORD[(-40-16)+r11],xmm11
+$L$prologue_ssse3:
+ and rsp,-64
+ mov r8,rdi
+ mov r9,rsi
+ mov r10,rdx
+
+ shl r10,6
+ add r10,r9
+ lea r14,[((K_XX_XX+64))]
+
+ mov eax,DWORD[r8]
+ mov ebx,DWORD[4+r8]
+ mov ecx,DWORD[8+r8]
+ mov edx,DWORD[12+r8]
+ mov esi,ebx
+ mov ebp,DWORD[16+r8]
+ mov edi,ecx
+ xor edi,edx
+ and esi,edi
+
+ movdqa xmm6,XMMWORD[64+r14]
+ movdqa xmm9,XMMWORD[((-64))+r14]
+ movdqu xmm0,XMMWORD[r9]
+ movdqu xmm1,XMMWORD[16+r9]
+ movdqu xmm2,XMMWORD[32+r9]
+ movdqu xmm3,XMMWORD[48+r9]
+DB 102,15,56,0,198
+DB 102,15,56,0,206
+DB 102,15,56,0,214
+ add r9,64
+ paddd xmm0,xmm9
+DB 102,15,56,0,222
+ paddd xmm1,xmm9
+ paddd xmm2,xmm9
+ movdqa XMMWORD[rsp],xmm0
+ psubd xmm0,xmm9
+ movdqa XMMWORD[16+rsp],xmm1
+ psubd xmm1,xmm9
+ movdqa XMMWORD[32+rsp],xmm2
+ psubd xmm2,xmm9
+ jmp NEAR $L$oop_ssse3
+ALIGN 16
+$L$oop_ssse3:
+ ror ebx,2
+ pshufd xmm4,xmm0,238
+ xor esi,edx
+ movdqa xmm8,xmm3
+ paddd xmm9,xmm3
+ mov edi,eax
+ add ebp,DWORD[rsp]
+ punpcklqdq xmm4,xmm1
+ xor ebx,ecx
+ rol eax,5
+ add ebp,esi
+ psrldq xmm8,4
+ and edi,ebx
+ xor ebx,ecx
+ pxor xmm4,xmm0
+ add ebp,eax
+ ror eax,7
+ pxor xmm8,xmm2
+ xor edi,ecx
+ mov esi,ebp
+ add edx,DWORD[4+rsp]
+ pxor xmm4,xmm8
+ xor eax,ebx
+ rol ebp,5
+ movdqa XMMWORD[48+rsp],xmm9
+ add edx,edi
+ and esi,eax
+ movdqa xmm10,xmm4
+ xor eax,ebx
+ add edx,ebp
+ ror ebp,7
+ movdqa xmm8,xmm4
+ xor esi,ebx
+ pslldq xmm10,12
+ paddd xmm4,xmm4
+ mov edi,edx
+ add ecx,DWORD[8+rsp]
+ psrld xmm8,31
+ xor ebp,eax
+ rol edx,5
+ add ecx,esi
+ movdqa xmm9,xmm10
+ and edi,ebp
+ xor ebp,eax
+ psrld xmm10,30
+ add ecx,edx
+ ror edx,7
+ por xmm4,xmm8
+ xor edi,eax
+ mov esi,ecx
+ add ebx,DWORD[12+rsp]
+ pslld xmm9,2
+ pxor xmm4,xmm10
+ xor edx,ebp
+ movdqa xmm10,XMMWORD[((-64))+r14]
+ rol ecx,5
+ add ebx,edi
+ and esi,edx
+ pxor xmm4,xmm9
+ xor edx,ebp
+ add ebx,ecx
+ ror ecx,7
+ pshufd xmm5,xmm1,238
+ xor esi,ebp
+ movdqa xmm9,xmm4
+ paddd xmm10,xmm4
+ mov edi,ebx
+ add eax,DWORD[16+rsp]
+ punpcklqdq xmm5,xmm2
+ xor ecx,edx
+ rol ebx,5
+ add eax,esi
+ psrldq xmm9,4
+ and edi,ecx
+ xor ecx,edx
+ pxor xmm5,xmm1
+ add eax,ebx
+ ror ebx,7
+ pxor xmm9,xmm3
+ xor edi,edx
+ mov esi,eax
+ add ebp,DWORD[20+rsp]
+ pxor xmm5,xmm9
+ xor ebx,ecx
+ rol eax,5
+ movdqa XMMWORD[rsp],xmm10
+ add ebp,edi
+ and esi,ebx
+ movdqa xmm8,xmm5
+ xor ebx,ecx
+ add ebp,eax
+ ror eax,7
+ movdqa xmm9,xmm5
+ xor esi,ecx
+ pslldq xmm8,12
+ paddd xmm5,xmm5
+ mov edi,ebp
+ add edx,DWORD[24+rsp]
+ psrld xmm9,31
+ xor eax,ebx
+ rol ebp,5
+ add edx,esi
+ movdqa xmm10,xmm8
+ and edi,eax
+ xor eax,ebx
+ psrld xmm8,30
+ add edx,ebp
+ ror ebp,7
+ por xmm5,xmm9
+ xor edi,ebx
+ mov esi,edx
+ add ecx,DWORD[28+rsp]
+ pslld xmm10,2
+ pxor xmm5,xmm8
+ xor ebp,eax
+ movdqa xmm8,XMMWORD[((-32))+r14]
+ rol edx,5
+ add ecx,edi
+ and esi,ebp
+ pxor xmm5,xmm10
+ xor ebp,eax
+ add ecx,edx
+ ror edx,7
+ pshufd xmm6,xmm2,238
+ xor esi,eax
+ movdqa xmm10,xmm5
+ paddd xmm8,xmm5
+ mov edi,ecx
+ add ebx,DWORD[32+rsp]
+ punpcklqdq xmm6,xmm3
+ xor edx,ebp
+ rol ecx,5
+ add ebx,esi
+ psrldq xmm10,4
+ and edi,edx
+ xor edx,ebp
+ pxor xmm6,xmm2
+ add ebx,ecx
+ ror ecx,7
+ pxor xmm10,xmm4
+ xor edi,ebp
+ mov esi,ebx
+ add eax,DWORD[36+rsp]
+ pxor xmm6,xmm10
+ xor ecx,edx
+ rol ebx,5
+ movdqa XMMWORD[16+rsp],xmm8
+ add eax,edi
+ and esi,ecx
+ movdqa xmm9,xmm6
+ xor ecx,edx
+ add eax,ebx
+ ror ebx,7
+ movdqa xmm10,xmm6
+ xor esi,edx
+ pslldq xmm9,12
+ paddd xmm6,xmm6
+ mov edi,eax
+ add ebp,DWORD[40+rsp]
+ psrld xmm10,31
+ xor ebx,ecx
+ rol eax,5
+ add ebp,esi
+ movdqa xmm8,xmm9
+ and edi,ebx
+ xor ebx,ecx
+ psrld xmm9,30
+ add ebp,eax
+ ror eax,7
+ por xmm6,xmm10
+ xor edi,ecx
+ mov esi,ebp
+ add edx,DWORD[44+rsp]
+ pslld xmm8,2
+ pxor xmm6,xmm9
+ xor eax,ebx
+ movdqa xmm9,XMMWORD[((-32))+r14]
+ rol ebp,5
+ add edx,edi
+ and esi,eax
+ pxor xmm6,xmm8
+ xor eax,ebx
+ add edx,ebp
+ ror ebp,7
+ pshufd xmm7,xmm3,238
+ xor esi,ebx
+ movdqa xmm8,xmm6
+ paddd xmm9,xmm6
+ mov edi,edx
+ add ecx,DWORD[48+rsp]
+ punpcklqdq xmm7,xmm4
+ xor ebp,eax
+ rol edx,5
+ add ecx,esi
+ psrldq xmm8,4
+ and edi,ebp
+ xor ebp,eax
+ pxor xmm7,xmm3
+ add ecx,edx
+ ror edx,7
+ pxor xmm8,xmm5
+ xor edi,eax
+ mov esi,ecx
+ add ebx,DWORD[52+rsp]
+ pxor xmm7,xmm8
+ xor edx,ebp
+ rol ecx,5
+ movdqa XMMWORD[32+rsp],xmm9
+ add ebx,edi
+ and esi,edx
+ movdqa xmm10,xmm7
+ xor edx,ebp
+ add ebx,ecx
+ ror ecx,7
+ movdqa xmm8,xmm7
+ xor esi,ebp
+ pslldq xmm10,12
+ paddd xmm7,xmm7
+ mov edi,ebx
+ add eax,DWORD[56+rsp]
+ psrld xmm8,31
+ xor ecx,edx
+ rol ebx,5
+ add eax,esi
+ movdqa xmm9,xmm10
+ and edi,ecx
+ xor ecx,edx
+ psrld xmm10,30
+ add eax,ebx
+ ror ebx,7
+ por xmm7,xmm8
+ xor edi,edx
+ mov esi,eax
+ add ebp,DWORD[60+rsp]
+ pslld xmm9,2
+ pxor xmm7,xmm10
+ xor ebx,ecx
+ movdqa xmm10,XMMWORD[((-32))+r14]
+ rol eax,5
+ add ebp,edi
+ and esi,ebx
+ pxor xmm7,xmm9
+ pshufd xmm9,xmm6,238
+ xor ebx,ecx
+ add ebp,eax
+ ror eax,7
+ pxor xmm0,xmm4
+ xor esi,ecx
+ mov edi,ebp
+ add edx,DWORD[rsp]
+ punpcklqdq xmm9,xmm7
+ xor eax,ebx
+ rol ebp,5
+ pxor xmm0,xmm1
+ add edx,esi
+ and edi,eax
+ movdqa xmm8,xmm10
+ xor eax,ebx
+ paddd xmm10,xmm7
+ add edx,ebp
+ pxor xmm0,xmm9
+ ror ebp,7
+ xor edi,ebx
+ mov esi,edx
+ add ecx,DWORD[4+rsp]
+ movdqa xmm9,xmm0
+ xor ebp,eax
+ rol edx,5
+ movdqa XMMWORD[48+rsp],xmm10
+ add ecx,edi
+ and esi,ebp
+ xor ebp,eax
+ pslld xmm0,2
+ add ecx,edx
+ ror edx,7
+ psrld xmm9,30
+ xor esi,eax
+ mov edi,ecx
+ add ebx,DWORD[8+rsp]
+ por xmm0,xmm9
+ xor edx,ebp
+ rol ecx,5
+ pshufd xmm10,xmm7,238
+ add ebx,esi
+ and edi,edx
+ xor edx,ebp
+ add ebx,ecx
+ add eax,DWORD[12+rsp]
+ xor edi,ebp
+ mov esi,ebx
+ rol ebx,5
+ add eax,edi
+ xor esi,edx
+ ror ecx,7
+ add eax,ebx
+ pxor xmm1,xmm5
+ add ebp,DWORD[16+rsp]
+ xor esi,ecx
+ punpcklqdq xmm10,xmm0
+ mov edi,eax
+ rol eax,5
+ pxor xmm1,xmm2
+ add ebp,esi
+ xor edi,ecx
+ movdqa xmm9,xmm8
+ ror ebx,7
+ paddd xmm8,xmm0
+ add ebp,eax
+ pxor xmm1,xmm10
+ add edx,DWORD[20+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ rol ebp,5
+ movdqa xmm10,xmm1
+ add edx,edi
+ xor esi,ebx
+ movdqa XMMWORD[rsp],xmm8
+ ror eax,7
+ add edx,ebp
+ add ecx,DWORD[24+rsp]
+ pslld xmm1,2
+ xor esi,eax
+ mov edi,edx
+ psrld xmm10,30
+ rol edx,5
+ add ecx,esi
+ xor edi,eax
+ ror ebp,7
+ por xmm1,xmm10
+ add ecx,edx
+ add ebx,DWORD[28+rsp]
+ pshufd xmm8,xmm0,238
+ xor edi,ebp
+ mov esi,ecx
+ rol ecx,5
+ add ebx,edi
+ xor esi,ebp
+ ror edx,7
+ add ebx,ecx
+ pxor xmm2,xmm6
+ add eax,DWORD[32+rsp]
+ xor esi,edx
+ punpcklqdq xmm8,xmm1
+ mov edi,ebx
+ rol ebx,5
+ pxor xmm2,xmm3
+ add eax,esi
+ xor edi,edx
+ movdqa xmm10,XMMWORD[r14]
+ ror ecx,7
+ paddd xmm9,xmm1
+ add eax,ebx
+ pxor xmm2,xmm8
+ add ebp,DWORD[36+rsp]
+ xor edi,ecx
+ mov esi,eax
+ rol eax,5
+ movdqa xmm8,xmm2
+ add ebp,edi
+ xor esi,ecx
+ movdqa XMMWORD[16+rsp],xmm9
+ ror ebx,7
+ add ebp,eax
+ add edx,DWORD[40+rsp]
+ pslld xmm2,2
+ xor esi,ebx
+ mov edi,ebp
+ psrld xmm8,30
+ rol ebp,5
+ add edx,esi
+ xor edi,ebx
+ ror eax,7
+ por xmm2,xmm8
+ add edx,ebp
+ add ecx,DWORD[44+rsp]
+ pshufd xmm9,xmm1,238
+ xor edi,eax
+ mov esi,edx
+ rol edx,5
+ add ecx,edi
+ xor esi,eax
+ ror ebp,7
+ add ecx,edx
+ pxor xmm3,xmm7
+ add ebx,DWORD[48+rsp]
+ xor esi,ebp
+ punpcklqdq xmm9,xmm2
+ mov edi,ecx
+ rol ecx,5
+ pxor xmm3,xmm4
+ add ebx,esi
+ xor edi,ebp
+ movdqa xmm8,xmm10
+ ror edx,7
+ paddd xmm10,xmm2
+ add ebx,ecx
+ pxor xmm3,xmm9
+ add eax,DWORD[52+rsp]
+ xor edi,edx
+ mov esi,ebx
+ rol ebx,5
+ movdqa xmm9,xmm3
+ add eax,edi
+ xor esi,edx
+ movdqa XMMWORD[32+rsp],xmm10
+ ror ecx,7
+ add eax,ebx
+ add ebp,DWORD[56+rsp]
+ pslld xmm3,2
+ xor esi,ecx
+ mov edi,eax
+ psrld xmm9,30
+ rol eax,5
+ add ebp,esi
+ xor edi,ecx
+ ror ebx,7
+ por xmm3,xmm9
+ add ebp,eax
+ add edx,DWORD[60+rsp]
+ pshufd xmm10,xmm2,238
+ xor edi,ebx
+ mov esi,ebp
+ rol ebp,5
+ add edx,edi
+ xor esi,ebx
+ ror eax,7
+ add edx,ebp
+ pxor xmm4,xmm0
+ add ecx,DWORD[rsp]
+ xor esi,eax
+ punpcklqdq xmm10,xmm3
+ mov edi,edx
+ rol edx,5
+ pxor xmm4,xmm5
+ add ecx,esi
+ xor edi,eax
+ movdqa xmm9,xmm8
+ ror ebp,7
+ paddd xmm8,xmm3
+ add ecx,edx
+ pxor xmm4,xmm10
+ add ebx,DWORD[4+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ rol ecx,5
+ movdqa xmm10,xmm4
+ add ebx,edi
+ xor esi,ebp
+ movdqa XMMWORD[48+rsp],xmm8
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD[8+rsp]
+ pslld xmm4,2
+ xor esi,edx
+ mov edi,ebx
+ psrld xmm10,30
+ rol ebx,5
+ add eax,esi
+ xor edi,edx
+ ror ecx,7
+ por xmm4,xmm10
+ add eax,ebx
+ add ebp,DWORD[12+rsp]
+ pshufd xmm8,xmm3,238
+ xor edi,ecx
+ mov esi,eax
+ rol eax,5
+ add ebp,edi
+ xor esi,ecx
+ ror ebx,7
+ add ebp,eax
+ pxor xmm5,xmm1
+ add edx,DWORD[16+rsp]
+ xor esi,ebx
+ punpcklqdq xmm8,xmm4
+ mov edi,ebp
+ rol ebp,5
+ pxor xmm5,xmm6
+ add edx,esi
+ xor edi,ebx
+ movdqa xmm10,xmm9
+ ror eax,7
+ paddd xmm9,xmm4
+ add edx,ebp
+ pxor xmm5,xmm8
+ add ecx,DWORD[20+rsp]
+ xor edi,eax
+ mov esi,edx
+ rol edx,5
+ movdqa xmm8,xmm5
+ add ecx,edi
+ xor esi,eax
+ movdqa XMMWORD[rsp],xmm9
+ ror ebp,7
+ add ecx,edx
+ add ebx,DWORD[24+rsp]
+ pslld xmm5,2
+ xor esi,ebp
+ mov edi,ecx
+ psrld xmm8,30
+ rol ecx,5
+ add ebx,esi
+ xor edi,ebp
+ ror edx,7
+ por xmm5,xmm8
+ add ebx,ecx
+ add eax,DWORD[28+rsp]
+ pshufd xmm9,xmm4,238
+ ror ecx,7
+ mov esi,ebx
+ xor edi,edx
+ rol ebx,5
+ add eax,edi
+ xor esi,ecx
+ xor ecx,edx
+ add eax,ebx
+ pxor xmm6,xmm2
+ add ebp,DWORD[32+rsp]
+ and esi,ecx
+ xor ecx,edx
+ ror ebx,7
+ punpcklqdq xmm9,xmm5
+ mov edi,eax
+ xor esi,ecx
+ pxor xmm6,xmm7
+ rol eax,5
+ add ebp,esi
+ movdqa xmm8,xmm10
+ xor edi,ebx
+ paddd xmm10,xmm5
+ xor ebx,ecx
+ pxor xmm6,xmm9
+ add ebp,eax
+ add edx,DWORD[36+rsp]
+ and edi,ebx
+ xor ebx,ecx
+ ror eax,7
+ movdqa xmm9,xmm6
+ mov esi,ebp
+ xor edi,ebx
+ movdqa XMMWORD[16+rsp],xmm10
+ rol ebp,5
+ add edx,edi
+ xor esi,eax
+ pslld xmm6,2
+ xor eax,ebx
+ add edx,ebp
+ psrld xmm9,30
+ add ecx,DWORD[40+rsp]
+ and esi,eax
+ xor eax,ebx
+ por xmm6,xmm9
+ ror ebp,7
+ mov edi,edx
+ xor esi,eax
+ rol edx,5
+ pshufd xmm10,xmm5,238
+ add ecx,esi
+ xor edi,ebp
+ xor ebp,eax
+ add ecx,edx
+ add ebx,DWORD[44+rsp]
+ and edi,ebp
+ xor ebp,eax
+ ror edx,7
+ mov esi,ecx
+ xor edi,ebp
+ rol ecx,5
+ add ebx,edi
+ xor esi,edx
+ xor edx,ebp
+ add ebx,ecx
+ pxor xmm7,xmm3
+ add eax,DWORD[48+rsp]
+ and esi,edx
+ xor edx,ebp
+ ror ecx,7
+ punpcklqdq xmm10,xmm6
+ mov edi,ebx
+ xor esi,edx
+ pxor xmm7,xmm0
+ rol ebx,5
+ add eax,esi
+ movdqa xmm9,XMMWORD[32+r14]
+ xor edi,ecx
+ paddd xmm8,xmm6
+ xor ecx,edx
+ pxor xmm7,xmm10
+ add eax,ebx
+ add ebp,DWORD[52+rsp]
+ and edi,ecx
+ xor ecx,edx
+ ror ebx,7
+ movdqa xmm10,xmm7
+ mov esi,eax
+ xor edi,ecx
+ movdqa XMMWORD[32+rsp],xmm8
+ rol eax,5
+ add ebp,edi
+ xor esi,ebx
+ pslld xmm7,2
+ xor ebx,ecx
+ add ebp,eax
+ psrld xmm10,30
+ add edx,DWORD[56+rsp]
+ and esi,ebx
+ xor ebx,ecx
+ por xmm7,xmm10
+ ror eax,7
+ mov edi,ebp
+ xor esi,ebx
+ rol ebp,5
+ pshufd xmm8,xmm6,238
+ add edx,esi
+ xor edi,eax
+ xor eax,ebx
+ add edx,ebp
+ add ecx,DWORD[60+rsp]
+ and edi,eax
+ xor eax,ebx
+ ror ebp,7
+ mov esi,edx
+ xor edi,eax
+ rol edx,5
+ add ecx,edi
+ xor esi,ebp
+ xor ebp,eax
+ add ecx,edx
+ pxor xmm0,xmm4
+ add ebx,DWORD[rsp]
+ and esi,ebp
+ xor ebp,eax
+ ror edx,7
+ punpcklqdq xmm8,xmm7
+ mov edi,ecx
+ xor esi,ebp
+ pxor xmm0,xmm1
+ rol ecx,5
+ add ebx,esi
+ movdqa xmm10,xmm9
+ xor edi,edx
+ paddd xmm9,xmm7
+ xor edx,ebp
+ pxor xmm0,xmm8
+ add ebx,ecx
+ add eax,DWORD[4+rsp]
+ and edi,edx
+ xor edx,ebp
+ ror ecx,7
+ movdqa xmm8,xmm0
+ mov esi,ebx
+ xor edi,edx
+ movdqa XMMWORD[48+rsp],xmm9
+ rol ebx,5
+ add eax,edi
+ xor esi,ecx
+ pslld xmm0,2
+ xor ecx,edx
+ add eax,ebx
+ psrld xmm8,30
+ add ebp,DWORD[8+rsp]
+ and esi,ecx
+ xor ecx,edx
+ por xmm0,xmm8
+ ror ebx,7
+ mov edi,eax
+ xor esi,ecx
+ rol eax,5
+ pshufd xmm9,xmm7,238
+ add ebp,esi
+ xor edi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ add edx,DWORD[12+rsp]
+ and edi,ebx
+ xor ebx,ecx
+ ror eax,7
+ mov esi,ebp
+ xor edi,ebx
+ rol ebp,5
+ add edx,edi
+ xor esi,eax
+ xor eax,ebx
+ add edx,ebp
+ pxor xmm1,xmm5
+ add ecx,DWORD[16+rsp]
+ and esi,eax
+ xor eax,ebx
+ ror ebp,7
+ punpcklqdq xmm9,xmm0
+ mov edi,edx
+ xor esi,eax
+ pxor xmm1,xmm2
+ rol edx,5
+ add ecx,esi
+ movdqa xmm8,xmm10
+ xor edi,ebp
+ paddd xmm10,xmm0
+ xor ebp,eax
+ pxor xmm1,xmm9
+ add ecx,edx
+ add ebx,DWORD[20+rsp]
+ and edi,ebp
+ xor ebp,eax
+ ror edx,7
+ movdqa xmm9,xmm1
+ mov esi,ecx
+ xor edi,ebp
+ movdqa XMMWORD[rsp],xmm10
+ rol ecx,5
+ add ebx,edi
+ xor esi,edx
+ pslld xmm1,2
+ xor edx,ebp
+ add ebx,ecx
+ psrld xmm9,30
+ add eax,DWORD[24+rsp]
+ and esi,edx
+ xor edx,ebp
+ por xmm1,xmm9
+ ror ecx,7
+ mov edi,ebx
+ xor esi,edx
+ rol ebx,5
+ pshufd xmm10,xmm0,238
+ add eax,esi
+ xor edi,ecx
+ xor ecx,edx
+ add eax,ebx
+ add ebp,DWORD[28+rsp]
+ and edi,ecx
+ xor ecx,edx
+ ror ebx,7
+ mov esi,eax
+ xor edi,ecx
+ rol eax,5
+ add ebp,edi
+ xor esi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ pxor xmm2,xmm6
+ add edx,DWORD[32+rsp]
+ and esi,ebx
+ xor ebx,ecx
+ ror eax,7
+ punpcklqdq xmm10,xmm1
+ mov edi,ebp
+ xor esi,ebx
+ pxor xmm2,xmm3
+ rol ebp,5
+ add edx,esi
+ movdqa xmm9,xmm8
+ xor edi,eax
+ paddd xmm8,xmm1
+ xor eax,ebx
+ pxor xmm2,xmm10
+ add edx,ebp
+ add ecx,DWORD[36+rsp]
+ and edi,eax
+ xor eax,ebx
+ ror ebp,7
+ movdqa xmm10,xmm2
+ mov esi,edx
+ xor edi,eax
+ movdqa XMMWORD[16+rsp],xmm8
+ rol edx,5
+ add ecx,edi
+ xor esi,ebp
+ pslld xmm2,2
+ xor ebp,eax
+ add ecx,edx
+ psrld xmm10,30
+ add ebx,DWORD[40+rsp]
+ and esi,ebp
+ xor ebp,eax
+ por xmm2,xmm10
+ ror edx,7
+ mov edi,ecx
+ xor esi,ebp
+ rol ecx,5
+ pshufd xmm8,xmm1,238
+ add ebx,esi
+ xor edi,edx
+ xor edx,ebp
+ add ebx,ecx
+ add eax,DWORD[44+rsp]
+ and edi,edx
+ xor edx,ebp
+ ror ecx,7
+ mov esi,ebx
+ xor edi,edx
+ rol ebx,5
+ add eax,edi
+ xor esi,edx
+ add eax,ebx
+ pxor xmm3,xmm7
+ add ebp,DWORD[48+rsp]
+ xor esi,ecx
+ punpcklqdq xmm8,xmm2
+ mov edi,eax
+ rol eax,5
+ pxor xmm3,xmm4
+ add ebp,esi
+ xor edi,ecx
+ movdqa xmm10,xmm9
+ ror ebx,7
+ paddd xmm9,xmm2
+ add ebp,eax
+ pxor xmm3,xmm8
+ add edx,DWORD[52+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ rol ebp,5
+ movdqa xmm8,xmm3
+ add edx,edi
+ xor esi,ebx
+ movdqa XMMWORD[32+rsp],xmm9
+ ror eax,7
+ add edx,ebp
+ add ecx,DWORD[56+rsp]
+ pslld xmm3,2
+ xor esi,eax
+ mov edi,edx
+ psrld xmm8,30
+ rol edx,5
+ add ecx,esi
+ xor edi,eax
+ ror ebp,7
+ por xmm3,xmm8
+ add ecx,edx
+ add ebx,DWORD[60+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ rol ecx,5
+ add ebx,edi
+ xor esi,ebp
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD[rsp]
+ xor esi,edx
+ mov edi,ebx
+ rol ebx,5
+ paddd xmm10,xmm3
+ add eax,esi
+ xor edi,edx
+ movdqa XMMWORD[48+rsp],xmm10
+ ror ecx,7
+ add eax,ebx
+ add ebp,DWORD[4+rsp]
+ xor edi,ecx
+ mov esi,eax
+ rol eax,5
+ add ebp,edi
+ xor esi,ecx
+ ror ebx,7
+ add ebp,eax
+ add edx,DWORD[8+rsp]
+ xor esi,ebx
+ mov edi,ebp
+ rol ebp,5
+ add edx,esi
+ xor edi,ebx
+ ror eax,7
+ add edx,ebp
+ add ecx,DWORD[12+rsp]
+ xor edi,eax
+ mov esi,edx
+ rol edx,5
+ add ecx,edi
+ xor esi,eax
+ ror ebp,7
+ add ecx,edx
+ cmp r9,r10
+ je NEAR $L$done_ssse3
+ movdqa xmm6,XMMWORD[64+r14]
+ movdqa xmm9,XMMWORD[((-64))+r14]
+ movdqu xmm0,XMMWORD[r9]
+ movdqu xmm1,XMMWORD[16+r9]
+ movdqu xmm2,XMMWORD[32+r9]
+ movdqu xmm3,XMMWORD[48+r9]
+DB 102,15,56,0,198
+ add r9,64
+ add ebx,DWORD[16+rsp]
+ xor esi,ebp
+ mov edi,ecx
+DB 102,15,56,0,206
+ rol ecx,5
+ add ebx,esi
+ xor edi,ebp
+ ror edx,7
+ paddd xmm0,xmm9
+ add ebx,ecx
+ add eax,DWORD[20+rsp]
+ xor edi,edx
+ mov esi,ebx
+ movdqa XMMWORD[rsp],xmm0
+ rol ebx,5
+ add eax,edi
+ xor esi,edx
+ ror ecx,7
+ psubd xmm0,xmm9
+ add eax,ebx
+ add ebp,DWORD[24+rsp]
+ xor esi,ecx
+ mov edi,eax
+ rol eax,5
+ add ebp,esi
+ xor edi,ecx
+ ror ebx,7
+ add ebp,eax
+ add edx,DWORD[28+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ rol ebp,5
+ add edx,edi
+ xor esi,ebx
+ ror eax,7
+ add edx,ebp
+ add ecx,DWORD[32+rsp]
+ xor esi,eax
+ mov edi,edx
+DB 102,15,56,0,214
+ rol edx,5
+ add ecx,esi
+ xor edi,eax
+ ror ebp,7
+ paddd xmm1,xmm9
+ add ecx,edx
+ add ebx,DWORD[36+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ movdqa XMMWORD[16+rsp],xmm1
+ rol ecx,5
+ add ebx,edi
+ xor esi,ebp
+ ror edx,7
+ psubd xmm1,xmm9
+ add ebx,ecx
+ add eax,DWORD[40+rsp]
+ xor esi,edx
+ mov edi,ebx
+ rol ebx,5
+ add eax,esi
+ xor edi,edx
+ ror ecx,7
+ add eax,ebx
+ add ebp,DWORD[44+rsp]
+ xor edi,ecx
+ mov esi,eax
+ rol eax,5
+ add ebp,edi
+ xor esi,ecx
+ ror ebx,7
+ add ebp,eax
+ add edx,DWORD[48+rsp]
+ xor esi,ebx
+ mov edi,ebp
+DB 102,15,56,0,222
+ rol ebp,5
+ add edx,esi
+ xor edi,ebx
+ ror eax,7
+ paddd xmm2,xmm9
+ add edx,ebp
+ add ecx,DWORD[52+rsp]
+ xor edi,eax
+ mov esi,edx
+ movdqa XMMWORD[32+rsp],xmm2
+ rol edx,5
+ add ecx,edi
+ xor esi,eax
+ ror ebp,7
+ psubd xmm2,xmm9
+ add ecx,edx
+ add ebx,DWORD[56+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ rol ecx,5
+ add ebx,esi
+ xor edi,ebp
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD[60+rsp]
+ xor edi,edx
+ mov esi,ebx
+ rol ebx,5
+ add eax,edi
+ ror ecx,7
+ add eax,ebx
+ add eax,DWORD[r8]
+ add esi,DWORD[4+r8]
+ add ecx,DWORD[8+r8]
+ add edx,DWORD[12+r8]
+ mov DWORD[r8],eax
+ add ebp,DWORD[16+r8]
+ mov DWORD[4+r8],esi
+ mov ebx,esi
+ mov DWORD[8+r8],ecx
+ mov edi,ecx
+ mov DWORD[12+r8],edx
+ xor edi,edx
+ mov DWORD[16+r8],ebp
+ and esi,edi
+ jmp NEAR $L$oop_ssse3
+
+ALIGN 16
+$L$done_ssse3:
+ add ebx,DWORD[16+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ rol ecx,5
+ add ebx,esi
+ xor edi,ebp
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD[20+rsp]
+ xor edi,edx
+ mov esi,ebx
+ rol ebx,5
+ add eax,edi
+ xor esi,edx
+ ror ecx,7
+ add eax,ebx
+ add ebp,DWORD[24+rsp]
+ xor esi,ecx
+ mov edi,eax
+ rol eax,5
+ add ebp,esi
+ xor edi,ecx
+ ror ebx,7
+ add ebp,eax
+ add edx,DWORD[28+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ rol ebp,5
+ add edx,edi
+ xor esi,ebx
+ ror eax,7
+ add edx,ebp
+ add ecx,DWORD[32+rsp]
+ xor esi,eax
+ mov edi,edx
+ rol edx,5
+ add ecx,esi
+ xor edi,eax
+ ror ebp,7
+ add ecx,edx
+ add ebx,DWORD[36+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ rol ecx,5
+ add ebx,edi
+ xor esi,ebp
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD[40+rsp]
+ xor esi,edx
+ mov edi,ebx
+ rol ebx,5
+ add eax,esi
+ xor edi,edx
+ ror ecx,7
+ add eax,ebx
+ add ebp,DWORD[44+rsp]
+ xor edi,ecx
+ mov esi,eax
+ rol eax,5
+ add ebp,edi
+ xor esi,ecx
+ ror ebx,7
+ add ebp,eax
+ add edx,DWORD[48+rsp]
+ xor esi,ebx
+ mov edi,ebp
+ rol ebp,5
+ add edx,esi
+ xor edi,ebx
+ ror eax,7
+ add edx,ebp
+ add ecx,DWORD[52+rsp]
+ xor edi,eax
+ mov esi,edx
+ rol edx,5
+ add ecx,edi
+ xor esi,eax
+ ror ebp,7
+ add ecx,edx
+ add ebx,DWORD[56+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ rol ecx,5
+ add ebx,esi
+ xor edi,ebp
+ ror edx,7
+ add ebx,ecx
+ add eax,DWORD[60+rsp]
+ xor edi,edx
+ mov esi,ebx
+ rol ebx,5
+ add eax,edi
+ ror ecx,7
+ add eax,ebx
+ add eax,DWORD[r8]
+ add esi,DWORD[4+r8]
+ add ecx,DWORD[8+r8]
+ mov DWORD[r8],eax
+ add edx,DWORD[12+r8]
+ mov DWORD[4+r8],esi
+ add ebp,DWORD[16+r8]
+ mov DWORD[8+r8],ecx
+ mov DWORD[12+r8],edx
+ mov DWORD[16+r8],ebp
+ movaps xmm6,XMMWORD[((-40-96))+r11]
+ movaps xmm7,XMMWORD[((-40-80))+r11]
+ movaps xmm8,XMMWORD[((-40-64))+r11]
+ movaps xmm9,XMMWORD[((-40-48))+r11]
+ movaps xmm10,XMMWORD[((-40-32))+r11]
+ movaps xmm11,XMMWORD[((-40-16))+r11]
+ mov r14,QWORD[((-40))+r11]
+
+ mov r13,QWORD[((-32))+r11]
+
+ mov r12,QWORD[((-24))+r11]
+
+ mov rbp,QWORD[((-16))+r11]
+
+ mov rbx,QWORD[((-8))+r11]
+
+ lea rsp,[r11]
+
+$L$epilogue_ssse3:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_sha1_block_data_order_ssse3:
+global sha1_block_data_order_avx
+
+ALIGN 16
+sha1_block_data_order_avx:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha1_block_data_order_avx:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ mov r11,rsp
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ lea rsp,[((-160))+rsp]
+ vzeroupper
+ vmovaps XMMWORD[(-40-96)+r11],xmm6
+ vmovaps XMMWORD[(-40-80)+r11],xmm7
+ vmovaps XMMWORD[(-40-64)+r11],xmm8
+ vmovaps XMMWORD[(-40-48)+r11],xmm9
+ vmovaps XMMWORD[(-40-32)+r11],xmm10
+ vmovaps XMMWORD[(-40-16)+r11],xmm11
+$L$prologue_avx:
+ and rsp,-64
+ mov r8,rdi
+ mov r9,rsi
+ mov r10,rdx
+
+ shl r10,6
+ add r10,r9
+ lea r14,[((K_XX_XX+64))]
+
+ mov eax,DWORD[r8]
+ mov ebx,DWORD[4+r8]
+ mov ecx,DWORD[8+r8]
+ mov edx,DWORD[12+r8]
+ mov esi,ebx
+ mov ebp,DWORD[16+r8]
+ mov edi,ecx
+ xor edi,edx
+ and esi,edi
+
+ vmovdqa xmm6,XMMWORD[64+r14]
+ vmovdqa xmm11,XMMWORD[((-64))+r14]
+ vmovdqu xmm0,XMMWORD[r9]
+ vmovdqu xmm1,XMMWORD[16+r9]
+ vmovdqu xmm2,XMMWORD[32+r9]
+ vmovdqu xmm3,XMMWORD[48+r9]
+ vpshufb xmm0,xmm0,xmm6
+ add r9,64
+ vpshufb xmm1,xmm1,xmm6
+ vpshufb xmm2,xmm2,xmm6
+ vpshufb xmm3,xmm3,xmm6
+ vpaddd xmm4,xmm0,xmm11
+ vpaddd xmm5,xmm1,xmm11
+ vpaddd xmm6,xmm2,xmm11
+ vmovdqa XMMWORD[rsp],xmm4
+ vmovdqa XMMWORD[16+rsp],xmm5
+ vmovdqa XMMWORD[32+rsp],xmm6
+ jmp NEAR $L$oop_avx
+ALIGN 16
+$L$oop_avx:
+ shrd ebx,ebx,2
+ xor esi,edx
+ vpalignr xmm4,xmm1,xmm0,8
+ mov edi,eax
+ add ebp,DWORD[rsp]
+ vpaddd xmm9,xmm11,xmm3
+ xor ebx,ecx
+ shld eax,eax,5
+ vpsrldq xmm8,xmm3,4
+ add ebp,esi
+ and edi,ebx
+ vpxor xmm4,xmm4,xmm0
+ xor ebx,ecx
+ add ebp,eax
+ vpxor xmm8,xmm8,xmm2
+ shrd eax,eax,7
+ xor edi,ecx
+ mov esi,ebp
+ add edx,DWORD[4+rsp]
+ vpxor xmm4,xmm4,xmm8
+ xor eax,ebx
+ shld ebp,ebp,5
+ vmovdqa XMMWORD[48+rsp],xmm9
+ add edx,edi
+ and esi,eax
+ vpsrld xmm8,xmm4,31
+ xor eax,ebx
+ add edx,ebp
+ shrd ebp,ebp,7
+ xor esi,ebx
+ vpslldq xmm10,xmm4,12
+ vpaddd xmm4,xmm4,xmm4
+ mov edi,edx
+ add ecx,DWORD[8+rsp]
+ xor ebp,eax
+ shld edx,edx,5
+ vpsrld xmm9,xmm10,30
+ vpor xmm4,xmm4,xmm8
+ add ecx,esi
+ and edi,ebp
+ xor ebp,eax
+ add ecx,edx
+ vpslld xmm10,xmm10,2
+ vpxor xmm4,xmm4,xmm9
+ shrd edx,edx,7
+ xor edi,eax
+ mov esi,ecx
+ add ebx,DWORD[12+rsp]
+ vpxor xmm4,xmm4,xmm10
+ xor edx,ebp
+ shld ecx,ecx,5
+ add ebx,edi
+ and esi,edx
+ xor edx,ebp
+ add ebx,ecx
+ shrd ecx,ecx,7
+ xor esi,ebp
+ vpalignr xmm5,xmm2,xmm1,8
+ mov edi,ebx
+ add eax,DWORD[16+rsp]
+ vpaddd xmm9,xmm11,xmm4
+ xor ecx,edx
+ shld ebx,ebx,5
+ vpsrldq xmm8,xmm4,4
+ add eax,esi
+ and edi,ecx
+ vpxor xmm5,xmm5,xmm1
+ xor ecx,edx
+ add eax,ebx
+ vpxor xmm8,xmm8,xmm3
+ shrd ebx,ebx,7
+ xor edi,edx
+ mov esi,eax
+ add ebp,DWORD[20+rsp]
+ vpxor xmm5,xmm5,xmm8
+ xor ebx,ecx
+ shld eax,eax,5
+ vmovdqa XMMWORD[rsp],xmm9
+ add ebp,edi
+ and esi,ebx
+ vpsrld xmm8,xmm5,31
+ xor ebx,ecx
+ add ebp,eax
+ shrd eax,eax,7
+ xor esi,ecx
+ vpslldq xmm10,xmm5,12
+ vpaddd xmm5,xmm5,xmm5
+ mov edi,ebp
+ add edx,DWORD[24+rsp]
+ xor eax,ebx
+ shld ebp,ebp,5
+ vpsrld xmm9,xmm10,30
+ vpor xmm5,xmm5,xmm8
+ add edx,esi
+ and edi,eax
+ xor eax,ebx
+ add edx,ebp
+ vpslld xmm10,xmm10,2
+ vpxor xmm5,xmm5,xmm9
+ shrd ebp,ebp,7
+ xor edi,ebx
+ mov esi,edx
+ add ecx,DWORD[28+rsp]
+ vpxor xmm5,xmm5,xmm10
+ xor ebp,eax
+ shld edx,edx,5
+ vmovdqa xmm11,XMMWORD[((-32))+r14]
+ add ecx,edi
+ and esi,ebp
+ xor ebp,eax
+ add ecx,edx
+ shrd edx,edx,7
+ xor esi,eax
+ vpalignr xmm6,xmm3,xmm2,8
+ mov edi,ecx
+ add ebx,DWORD[32+rsp]
+ vpaddd xmm9,xmm11,xmm5
+ xor edx,ebp
+ shld ecx,ecx,5
+ vpsrldq xmm8,xmm5,4
+ add ebx,esi
+ and edi,edx
+ vpxor xmm6,xmm6,xmm2
+ xor edx,ebp
+ add ebx,ecx
+ vpxor xmm8,xmm8,xmm4
+ shrd ecx,ecx,7
+ xor edi,ebp
+ mov esi,ebx
+ add eax,DWORD[36+rsp]
+ vpxor xmm6,xmm6,xmm8
+ xor ecx,edx
+ shld ebx,ebx,5
+ vmovdqa XMMWORD[16+rsp],xmm9
+ add eax,edi
+ and esi,ecx
+ vpsrld xmm8,xmm6,31
+ xor ecx,edx
+ add eax,ebx
+ shrd ebx,ebx,7
+ xor esi,edx
+ vpslldq xmm10,xmm6,12
+ vpaddd xmm6,xmm6,xmm6
+ mov edi,eax
+ add ebp,DWORD[40+rsp]
+ xor ebx,ecx
+ shld eax,eax,5
+ vpsrld xmm9,xmm10,30
+ vpor xmm6,xmm6,xmm8
+ add ebp,esi
+ and edi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ vpslld xmm10,xmm10,2
+ vpxor xmm6,xmm6,xmm9
+ shrd eax,eax,7
+ xor edi,ecx
+ mov esi,ebp
+ add edx,DWORD[44+rsp]
+ vpxor xmm6,xmm6,xmm10
+ xor eax,ebx
+ shld ebp,ebp,5
+ add edx,edi
+ and esi,eax
+ xor eax,ebx
+ add edx,ebp
+ shrd ebp,ebp,7
+ xor esi,ebx
+ vpalignr xmm7,xmm4,xmm3,8
+ mov edi,edx
+ add ecx,DWORD[48+rsp]
+ vpaddd xmm9,xmm11,xmm6
+ xor ebp,eax
+ shld edx,edx,5
+ vpsrldq xmm8,xmm6,4
+ add ecx,esi
+ and edi,ebp
+ vpxor xmm7,xmm7,xmm3
+ xor ebp,eax
+ add ecx,edx
+ vpxor xmm8,xmm8,xmm5
+ shrd edx,edx,7
+ xor edi,eax
+ mov esi,ecx
+ add ebx,DWORD[52+rsp]
+ vpxor xmm7,xmm7,xmm8
+ xor edx,ebp
+ shld ecx,ecx,5
+ vmovdqa XMMWORD[32+rsp],xmm9
+ add ebx,edi
+ and esi,edx
+ vpsrld xmm8,xmm7,31
+ xor edx,ebp
+ add ebx,ecx
+ shrd ecx,ecx,7
+ xor esi,ebp
+ vpslldq xmm10,xmm7,12
+ vpaddd xmm7,xmm7,xmm7
+ mov edi,ebx
+ add eax,DWORD[56+rsp]
+ xor ecx,edx
+ shld ebx,ebx,5
+ vpsrld xmm9,xmm10,30
+ vpor xmm7,xmm7,xmm8
+ add eax,esi
+ and edi,ecx
+ xor ecx,edx
+ add eax,ebx
+ vpslld xmm10,xmm10,2
+ vpxor xmm7,xmm7,xmm9
+ shrd ebx,ebx,7
+ xor edi,edx
+ mov esi,eax
+ add ebp,DWORD[60+rsp]
+ vpxor xmm7,xmm7,xmm10
+ xor ebx,ecx
+ shld eax,eax,5
+ add ebp,edi
+ and esi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ vpalignr xmm8,xmm7,xmm6,8
+ vpxor xmm0,xmm0,xmm4
+ shrd eax,eax,7
+ xor esi,ecx
+ mov edi,ebp
+ add edx,DWORD[rsp]
+ vpxor xmm0,xmm0,xmm1
+ xor eax,ebx
+ shld ebp,ebp,5
+ vpaddd xmm9,xmm11,xmm7
+ add edx,esi
+ and edi,eax
+ vpxor xmm0,xmm0,xmm8
+ xor eax,ebx
+ add edx,ebp
+ shrd ebp,ebp,7
+ xor edi,ebx
+ vpsrld xmm8,xmm0,30
+ vmovdqa XMMWORD[48+rsp],xmm9
+ mov esi,edx
+ add ecx,DWORD[4+rsp]
+ xor ebp,eax
+ shld edx,edx,5
+ vpslld xmm0,xmm0,2
+ add ecx,edi
+ and esi,ebp
+ xor ebp,eax
+ add ecx,edx
+ shrd edx,edx,7
+ xor esi,eax
+ mov edi,ecx
+ add ebx,DWORD[8+rsp]
+ vpor xmm0,xmm0,xmm8
+ xor edx,ebp
+ shld ecx,ecx,5
+ add ebx,esi
+ and edi,edx
+ xor edx,ebp
+ add ebx,ecx
+ add eax,DWORD[12+rsp]
+ xor edi,ebp
+ mov esi,ebx
+ shld ebx,ebx,5
+ add eax,edi
+ xor esi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ vpalignr xmm8,xmm0,xmm7,8
+ vpxor xmm1,xmm1,xmm5
+ add ebp,DWORD[16+rsp]
+ xor esi,ecx
+ mov edi,eax
+ shld eax,eax,5
+ vpxor xmm1,xmm1,xmm2
+ add ebp,esi
+ xor edi,ecx
+ vpaddd xmm9,xmm11,xmm0
+ shrd ebx,ebx,7
+ add ebp,eax
+ vpxor xmm1,xmm1,xmm8
+ add edx,DWORD[20+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ shld ebp,ebp,5
+ vpsrld xmm8,xmm1,30
+ vmovdqa XMMWORD[rsp],xmm9
+ add edx,edi
+ xor esi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ vpslld xmm1,xmm1,2
+ add ecx,DWORD[24+rsp]
+ xor esi,eax
+ mov edi,edx
+ shld edx,edx,5
+ add ecx,esi
+ xor edi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ vpor xmm1,xmm1,xmm8
+ add ebx,DWORD[28+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ shld ecx,ecx,5
+ add ebx,edi
+ xor esi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ vpalignr xmm8,xmm1,xmm0,8
+ vpxor xmm2,xmm2,xmm6
+ add eax,DWORD[32+rsp]
+ xor esi,edx
+ mov edi,ebx
+ shld ebx,ebx,5
+ vpxor xmm2,xmm2,xmm3
+ add eax,esi
+ xor edi,edx
+ vpaddd xmm9,xmm11,xmm1
+ vmovdqa xmm11,XMMWORD[r14]
+ shrd ecx,ecx,7
+ add eax,ebx
+ vpxor xmm2,xmm2,xmm8
+ add ebp,DWORD[36+rsp]
+ xor edi,ecx
+ mov esi,eax
+ shld eax,eax,5
+ vpsrld xmm8,xmm2,30
+ vmovdqa XMMWORD[16+rsp],xmm9
+ add ebp,edi
+ xor esi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ vpslld xmm2,xmm2,2
+ add edx,DWORD[40+rsp]
+ xor esi,ebx
+ mov edi,ebp
+ shld ebp,ebp,5
+ add edx,esi
+ xor edi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ vpor xmm2,xmm2,xmm8
+ add ecx,DWORD[44+rsp]
+ xor edi,eax
+ mov esi,edx
+ shld edx,edx,5
+ add ecx,edi
+ xor esi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ vpalignr xmm8,xmm2,xmm1,8
+ vpxor xmm3,xmm3,xmm7
+ add ebx,DWORD[48+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ shld ecx,ecx,5
+ vpxor xmm3,xmm3,xmm4
+ add ebx,esi
+ xor edi,ebp
+ vpaddd xmm9,xmm11,xmm2
+ shrd edx,edx,7
+ add ebx,ecx
+ vpxor xmm3,xmm3,xmm8
+ add eax,DWORD[52+rsp]
+ xor edi,edx
+ mov esi,ebx
+ shld ebx,ebx,5
+ vpsrld xmm8,xmm3,30
+ vmovdqa XMMWORD[32+rsp],xmm9
+ add eax,edi
+ xor esi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ vpslld xmm3,xmm3,2
+ add ebp,DWORD[56+rsp]
+ xor esi,ecx
+ mov edi,eax
+ shld eax,eax,5
+ add ebp,esi
+ xor edi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ vpor xmm3,xmm3,xmm8
+ add edx,DWORD[60+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ shld ebp,ebp,5
+ add edx,edi
+ xor esi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ vpalignr xmm8,xmm3,xmm2,8
+ vpxor xmm4,xmm4,xmm0
+ add ecx,DWORD[rsp]
+ xor esi,eax
+ mov edi,edx
+ shld edx,edx,5
+ vpxor xmm4,xmm4,xmm5
+ add ecx,esi
+ xor edi,eax
+ vpaddd xmm9,xmm11,xmm3
+ shrd ebp,ebp,7
+ add ecx,edx
+ vpxor xmm4,xmm4,xmm8
+ add ebx,DWORD[4+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ shld ecx,ecx,5
+ vpsrld xmm8,xmm4,30
+ vmovdqa XMMWORD[48+rsp],xmm9
+ add ebx,edi
+ xor esi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ vpslld xmm4,xmm4,2
+ add eax,DWORD[8+rsp]
+ xor esi,edx
+ mov edi,ebx
+ shld ebx,ebx,5
+ add eax,esi
+ xor edi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ vpor xmm4,xmm4,xmm8
+ add ebp,DWORD[12+rsp]
+ xor edi,ecx
+ mov esi,eax
+ shld eax,eax,5
+ add ebp,edi
+ xor esi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ vpalignr xmm8,xmm4,xmm3,8
+ vpxor xmm5,xmm5,xmm1
+ add edx,DWORD[16+rsp]
+ xor esi,ebx
+ mov edi,ebp
+ shld ebp,ebp,5
+ vpxor xmm5,xmm5,xmm6
+ add edx,esi
+ xor edi,ebx
+ vpaddd xmm9,xmm11,xmm4
+ shrd eax,eax,7
+ add edx,ebp
+ vpxor xmm5,xmm5,xmm8
+ add ecx,DWORD[20+rsp]
+ xor edi,eax
+ mov esi,edx
+ shld edx,edx,5
+ vpsrld xmm8,xmm5,30
+ vmovdqa XMMWORD[rsp],xmm9
+ add ecx,edi
+ xor esi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ vpslld xmm5,xmm5,2
+ add ebx,DWORD[24+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ shld ecx,ecx,5
+ add ebx,esi
+ xor edi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ vpor xmm5,xmm5,xmm8
+ add eax,DWORD[28+rsp]
+ shrd ecx,ecx,7
+ mov esi,ebx
+ xor edi,edx
+ shld ebx,ebx,5
+ add eax,edi
+ xor esi,ecx
+ xor ecx,edx
+ add eax,ebx
+ vpalignr xmm8,xmm5,xmm4,8
+ vpxor xmm6,xmm6,xmm2
+ add ebp,DWORD[32+rsp]
+ and esi,ecx
+ xor ecx,edx
+ shrd ebx,ebx,7
+ vpxor xmm6,xmm6,xmm7
+ mov edi,eax
+ xor esi,ecx
+ vpaddd xmm9,xmm11,xmm5
+ shld eax,eax,5
+ add ebp,esi
+ vpxor xmm6,xmm6,xmm8
+ xor edi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ add edx,DWORD[36+rsp]
+ vpsrld xmm8,xmm6,30
+ vmovdqa XMMWORD[16+rsp],xmm9
+ and edi,ebx
+ xor ebx,ecx
+ shrd eax,eax,7
+ mov esi,ebp
+ vpslld xmm6,xmm6,2
+ xor edi,ebx
+ shld ebp,ebp,5
+ add edx,edi
+ xor esi,eax
+ xor eax,ebx
+ add edx,ebp
+ add ecx,DWORD[40+rsp]
+ and esi,eax
+ vpor xmm6,xmm6,xmm8
+ xor eax,ebx
+ shrd ebp,ebp,7
+ mov edi,edx
+ xor esi,eax
+ shld edx,edx,5
+ add ecx,esi
+ xor edi,ebp
+ xor ebp,eax
+ add ecx,edx
+ add ebx,DWORD[44+rsp]
+ and edi,ebp
+ xor ebp,eax
+ shrd edx,edx,7
+ mov esi,ecx
+ xor edi,ebp
+ shld ecx,ecx,5
+ add ebx,edi
+ xor esi,edx
+ xor edx,ebp
+ add ebx,ecx
+ vpalignr xmm8,xmm6,xmm5,8
+ vpxor xmm7,xmm7,xmm3
+ add eax,DWORD[48+rsp]
+ and esi,edx
+ xor edx,ebp
+ shrd ecx,ecx,7
+ vpxor xmm7,xmm7,xmm0
+ mov edi,ebx
+ xor esi,edx
+ vpaddd xmm9,xmm11,xmm6
+ vmovdqa xmm11,XMMWORD[32+r14]
+ shld ebx,ebx,5
+ add eax,esi
+ vpxor xmm7,xmm7,xmm8
+ xor edi,ecx
+ xor ecx,edx
+ add eax,ebx
+ add ebp,DWORD[52+rsp]
+ vpsrld xmm8,xmm7,30
+ vmovdqa XMMWORD[32+rsp],xmm9
+ and edi,ecx
+ xor ecx,edx
+ shrd ebx,ebx,7
+ mov esi,eax
+ vpslld xmm7,xmm7,2
+ xor edi,ecx
+ shld eax,eax,5
+ add ebp,edi
+ xor esi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ add edx,DWORD[56+rsp]
+ and esi,ebx
+ vpor xmm7,xmm7,xmm8
+ xor ebx,ecx
+ shrd eax,eax,7
+ mov edi,ebp
+ xor esi,ebx
+ shld ebp,ebp,5
+ add edx,esi
+ xor edi,eax
+ xor eax,ebx
+ add edx,ebp
+ add ecx,DWORD[60+rsp]
+ and edi,eax
+ xor eax,ebx
+ shrd ebp,ebp,7
+ mov esi,edx
+ xor edi,eax
+ shld edx,edx,5
+ add ecx,edi
+ xor esi,ebp
+ xor ebp,eax
+ add ecx,edx
+ vpalignr xmm8,xmm7,xmm6,8
+ vpxor xmm0,xmm0,xmm4
+ add ebx,DWORD[rsp]
+ and esi,ebp
+ xor ebp,eax
+ shrd edx,edx,7
+ vpxor xmm0,xmm0,xmm1
+ mov edi,ecx
+ xor esi,ebp
+ vpaddd xmm9,xmm11,xmm7
+ shld ecx,ecx,5
+ add ebx,esi
+ vpxor xmm0,xmm0,xmm8
+ xor edi,edx
+ xor edx,ebp
+ add ebx,ecx
+ add eax,DWORD[4+rsp]
+ vpsrld xmm8,xmm0,30
+ vmovdqa XMMWORD[48+rsp],xmm9
+ and edi,edx
+ xor edx,ebp
+ shrd ecx,ecx,7
+ mov esi,ebx
+ vpslld xmm0,xmm0,2
+ xor edi,edx
+ shld ebx,ebx,5
+ add eax,edi
+ xor esi,ecx
+ xor ecx,edx
+ add eax,ebx
+ add ebp,DWORD[8+rsp]
+ and esi,ecx
+ vpor xmm0,xmm0,xmm8
+ xor ecx,edx
+ shrd ebx,ebx,7
+ mov edi,eax
+ xor esi,ecx
+ shld eax,eax,5
+ add ebp,esi
+ xor edi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ add edx,DWORD[12+rsp]
+ and edi,ebx
+ xor ebx,ecx
+ shrd eax,eax,7
+ mov esi,ebp
+ xor edi,ebx
+ shld ebp,ebp,5
+ add edx,edi
+ xor esi,eax
+ xor eax,ebx
+ add edx,ebp
+ vpalignr xmm8,xmm0,xmm7,8
+ vpxor xmm1,xmm1,xmm5
+ add ecx,DWORD[16+rsp]
+ and esi,eax
+ xor eax,ebx
+ shrd ebp,ebp,7
+ vpxor xmm1,xmm1,xmm2
+ mov edi,edx
+ xor esi,eax
+ vpaddd xmm9,xmm11,xmm0
+ shld edx,edx,5
+ add ecx,esi
+ vpxor xmm1,xmm1,xmm8
+ xor edi,ebp
+ xor ebp,eax
+ add ecx,edx
+ add ebx,DWORD[20+rsp]
+ vpsrld xmm8,xmm1,30
+ vmovdqa XMMWORD[rsp],xmm9
+ and edi,ebp
+ xor ebp,eax
+ shrd edx,edx,7
+ mov esi,ecx
+ vpslld xmm1,xmm1,2
+ xor edi,ebp
+ shld ecx,ecx,5
+ add ebx,edi
+ xor esi,edx
+ xor edx,ebp
+ add ebx,ecx
+ add eax,DWORD[24+rsp]
+ and esi,edx
+ vpor xmm1,xmm1,xmm8
+ xor edx,ebp
+ shrd ecx,ecx,7
+ mov edi,ebx
+ xor esi,edx
+ shld ebx,ebx,5
+ add eax,esi
+ xor edi,ecx
+ xor ecx,edx
+ add eax,ebx
+ add ebp,DWORD[28+rsp]
+ and edi,ecx
+ xor ecx,edx
+ shrd ebx,ebx,7
+ mov esi,eax
+ xor edi,ecx
+ shld eax,eax,5
+ add ebp,edi
+ xor esi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ vpalignr xmm8,xmm1,xmm0,8
+ vpxor xmm2,xmm2,xmm6
+ add edx,DWORD[32+rsp]
+ and esi,ebx
+ xor ebx,ecx
+ shrd eax,eax,7
+ vpxor xmm2,xmm2,xmm3
+ mov edi,ebp
+ xor esi,ebx
+ vpaddd xmm9,xmm11,xmm1
+ shld ebp,ebp,5
+ add edx,esi
+ vpxor xmm2,xmm2,xmm8
+ xor edi,eax
+ xor eax,ebx
+ add edx,ebp
+ add ecx,DWORD[36+rsp]
+ vpsrld xmm8,xmm2,30
+ vmovdqa XMMWORD[16+rsp],xmm9
+ and edi,eax
+ xor eax,ebx
+ shrd ebp,ebp,7
+ mov esi,edx
+ vpslld xmm2,xmm2,2
+ xor edi,eax
+ shld edx,edx,5
+ add ecx,edi
+ xor esi,ebp
+ xor ebp,eax
+ add ecx,edx
+ add ebx,DWORD[40+rsp]
+ and esi,ebp
+ vpor xmm2,xmm2,xmm8
+ xor ebp,eax
+ shrd edx,edx,7
+ mov edi,ecx
+ xor esi,ebp
+ shld ecx,ecx,5
+ add ebx,esi
+ xor edi,edx
+ xor edx,ebp
+ add ebx,ecx
+ add eax,DWORD[44+rsp]
+ and edi,edx
+ xor edx,ebp
+ shrd ecx,ecx,7
+ mov esi,ebx
+ xor edi,edx
+ shld ebx,ebx,5
+ add eax,edi
+ xor esi,edx
+ add eax,ebx
+ vpalignr xmm8,xmm2,xmm1,8
+ vpxor xmm3,xmm3,xmm7
+ add ebp,DWORD[48+rsp]
+ xor esi,ecx
+ mov edi,eax
+ shld eax,eax,5
+ vpxor xmm3,xmm3,xmm4
+ add ebp,esi
+ xor edi,ecx
+ vpaddd xmm9,xmm11,xmm2
+ shrd ebx,ebx,7
+ add ebp,eax
+ vpxor xmm3,xmm3,xmm8
+ add edx,DWORD[52+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ shld ebp,ebp,5
+ vpsrld xmm8,xmm3,30
+ vmovdqa XMMWORD[32+rsp],xmm9
+ add edx,edi
+ xor esi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ vpslld xmm3,xmm3,2
+ add ecx,DWORD[56+rsp]
+ xor esi,eax
+ mov edi,edx
+ shld edx,edx,5
+ add ecx,esi
+ xor edi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ vpor xmm3,xmm3,xmm8
+ add ebx,DWORD[60+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ shld ecx,ecx,5
+ add ebx,edi
+ xor esi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD[rsp]
+ vpaddd xmm9,xmm11,xmm3
+ xor esi,edx
+ mov edi,ebx
+ shld ebx,ebx,5
+ add eax,esi
+ vmovdqa XMMWORD[48+rsp],xmm9
+ xor edi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ add ebp,DWORD[4+rsp]
+ xor edi,ecx
+ mov esi,eax
+ shld eax,eax,5
+ add ebp,edi
+ xor esi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ add edx,DWORD[8+rsp]
+ xor esi,ebx
+ mov edi,ebp
+ shld ebp,ebp,5
+ add edx,esi
+ xor edi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ add ecx,DWORD[12+rsp]
+ xor edi,eax
+ mov esi,edx
+ shld edx,edx,5
+ add ecx,edi
+ xor esi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ cmp r9,r10
+ je NEAR $L$done_avx
+ vmovdqa xmm6,XMMWORD[64+r14]
+ vmovdqa xmm11,XMMWORD[((-64))+r14]
+ vmovdqu xmm0,XMMWORD[r9]
+ vmovdqu xmm1,XMMWORD[16+r9]
+ vmovdqu xmm2,XMMWORD[32+r9]
+ vmovdqu xmm3,XMMWORD[48+r9]
+ vpshufb xmm0,xmm0,xmm6
+ add r9,64
+ add ebx,DWORD[16+rsp]
+ xor esi,ebp
+ vpshufb xmm1,xmm1,xmm6
+ mov edi,ecx
+ shld ecx,ecx,5
+ vpaddd xmm4,xmm0,xmm11
+ add ebx,esi
+ xor edi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ vmovdqa XMMWORD[rsp],xmm4
+ add eax,DWORD[20+rsp]
+ xor edi,edx
+ mov esi,ebx
+ shld ebx,ebx,5
+ add eax,edi
+ xor esi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ add ebp,DWORD[24+rsp]
+ xor esi,ecx
+ mov edi,eax
+ shld eax,eax,5
+ add ebp,esi
+ xor edi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ add edx,DWORD[28+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ shld ebp,ebp,5
+ add edx,edi
+ xor esi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ add ecx,DWORD[32+rsp]
+ xor esi,eax
+ vpshufb xmm2,xmm2,xmm6
+ mov edi,edx
+ shld edx,edx,5
+ vpaddd xmm5,xmm1,xmm11
+ add ecx,esi
+ xor edi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ vmovdqa XMMWORD[16+rsp],xmm5
+ add ebx,DWORD[36+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ shld ecx,ecx,5
+ add ebx,edi
+ xor esi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD[40+rsp]
+ xor esi,edx
+ mov edi,ebx
+ shld ebx,ebx,5
+ add eax,esi
+ xor edi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ add ebp,DWORD[44+rsp]
+ xor edi,ecx
+ mov esi,eax
+ shld eax,eax,5
+ add ebp,edi
+ xor esi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ add edx,DWORD[48+rsp]
+ xor esi,ebx
+ vpshufb xmm3,xmm3,xmm6
+ mov edi,ebp
+ shld ebp,ebp,5
+ vpaddd xmm6,xmm2,xmm11
+ add edx,esi
+ xor edi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ vmovdqa XMMWORD[32+rsp],xmm6
+ add ecx,DWORD[52+rsp]
+ xor edi,eax
+ mov esi,edx
+ shld edx,edx,5
+ add ecx,edi
+ xor esi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ add ebx,DWORD[56+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ shld ecx,ecx,5
+ add ebx,esi
+ xor edi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD[60+rsp]
+ xor edi,edx
+ mov esi,ebx
+ shld ebx,ebx,5
+ add eax,edi
+ shrd ecx,ecx,7
+ add eax,ebx
+ add eax,DWORD[r8]
+ add esi,DWORD[4+r8]
+ add ecx,DWORD[8+r8]
+ add edx,DWORD[12+r8]
+ mov DWORD[r8],eax
+ add ebp,DWORD[16+r8]
+ mov DWORD[4+r8],esi
+ mov ebx,esi
+ mov DWORD[8+r8],ecx
+ mov edi,ecx
+ mov DWORD[12+r8],edx
+ xor edi,edx
+ mov DWORD[16+r8],ebp
+ and esi,edi
+ jmp NEAR $L$oop_avx
+
+ALIGN 16
+$L$done_avx:
+ add ebx,DWORD[16+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ shld ecx,ecx,5
+ add ebx,esi
+ xor edi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD[20+rsp]
+ xor edi,edx
+ mov esi,ebx
+ shld ebx,ebx,5
+ add eax,edi
+ xor esi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ add ebp,DWORD[24+rsp]
+ xor esi,ecx
+ mov edi,eax
+ shld eax,eax,5
+ add ebp,esi
+ xor edi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ add edx,DWORD[28+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ shld ebp,ebp,5
+ add edx,edi
+ xor esi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ add ecx,DWORD[32+rsp]
+ xor esi,eax
+ mov edi,edx
+ shld edx,edx,5
+ add ecx,esi
+ xor edi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ add ebx,DWORD[36+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ shld ecx,ecx,5
+ add ebx,edi
+ xor esi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD[40+rsp]
+ xor esi,edx
+ mov edi,ebx
+ shld ebx,ebx,5
+ add eax,esi
+ xor edi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ add ebp,DWORD[44+rsp]
+ xor edi,ecx
+ mov esi,eax
+ shld eax,eax,5
+ add ebp,edi
+ xor esi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ add edx,DWORD[48+rsp]
+ xor esi,ebx
+ mov edi,ebp
+ shld ebp,ebp,5
+ add edx,esi
+ xor edi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ add ecx,DWORD[52+rsp]
+ xor edi,eax
+ mov esi,edx
+ shld edx,edx,5
+ add ecx,edi
+ xor esi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ add ebx,DWORD[56+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ shld ecx,ecx,5
+ add ebx,esi
+ xor edi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD[60+rsp]
+ xor edi,edx
+ mov esi,ebx
+ shld ebx,ebx,5
+ add eax,edi
+ shrd ecx,ecx,7
+ add eax,ebx
+ vzeroupper
+
+ add eax,DWORD[r8]
+ add esi,DWORD[4+r8]
+ add ecx,DWORD[8+r8]
+ mov DWORD[r8],eax
+ add edx,DWORD[12+r8]
+ mov DWORD[4+r8],esi
+ add ebp,DWORD[16+r8]
+ mov DWORD[8+r8],ecx
+ mov DWORD[12+r8],edx
+ mov DWORD[16+r8],ebp
+ movaps xmm6,XMMWORD[((-40-96))+r11]
+ movaps xmm7,XMMWORD[((-40-80))+r11]
+ movaps xmm8,XMMWORD[((-40-64))+r11]
+ movaps xmm9,XMMWORD[((-40-48))+r11]
+ movaps xmm10,XMMWORD[((-40-32))+r11]
+ movaps xmm11,XMMWORD[((-40-16))+r11]
+ mov r14,QWORD[((-40))+r11]
+
+ mov r13,QWORD[((-32))+r11]
+
+ mov r12,QWORD[((-24))+r11]
+
+ mov rbp,QWORD[((-16))+r11]
+
+ mov rbx,QWORD[((-8))+r11]
+
+ lea rsp,[r11]
+
+$L$epilogue_avx:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_sha1_block_data_order_avx:
+global sha1_block_data_order_avx2
+
+ALIGN 16
+sha1_block_data_order_avx2:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha1_block_data_order_avx2:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ mov r11,rsp
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ vzeroupper
+ lea rsp,[((-96))+rsp]
+ vmovaps XMMWORD[(-40-96)+r11],xmm6
+ vmovaps XMMWORD[(-40-80)+r11],xmm7
+ vmovaps XMMWORD[(-40-64)+r11],xmm8
+ vmovaps XMMWORD[(-40-48)+r11],xmm9
+ vmovaps XMMWORD[(-40-32)+r11],xmm10
+ vmovaps XMMWORD[(-40-16)+r11],xmm11
+$L$prologue_avx2:
+ mov r8,rdi
+ mov r9,rsi
+ mov r10,rdx
+
+ lea rsp,[((-640))+rsp]
+ shl r10,6
+ lea r13,[64+r9]
+ and rsp,-128
+ add r10,r9
+ lea r14,[((K_XX_XX+64))]
+
+ mov eax,DWORD[r8]
+ cmp r13,r10
+ cmovae r13,r9
+ mov ebp,DWORD[4+r8]
+ mov ecx,DWORD[8+r8]
+ mov edx,DWORD[12+r8]
+ mov esi,DWORD[16+r8]
+ vmovdqu ymm6,YMMWORD[64+r14]
+
+ vmovdqu xmm0,XMMWORD[r9]
+ vmovdqu xmm1,XMMWORD[16+r9]
+ vmovdqu xmm2,XMMWORD[32+r9]
+ vmovdqu xmm3,XMMWORD[48+r9]
+ lea r9,[64+r9]
+ vinserti128 ymm0,ymm0,XMMWORD[r13],1
+ vinserti128 ymm1,ymm1,XMMWORD[16+r13],1
+ vpshufb ymm0,ymm0,ymm6
+ vinserti128 ymm2,ymm2,XMMWORD[32+r13],1
+ vpshufb ymm1,ymm1,ymm6
+ vinserti128 ymm3,ymm3,XMMWORD[48+r13],1
+ vpshufb ymm2,ymm2,ymm6
+ vmovdqu ymm11,YMMWORD[((-64))+r14]
+ vpshufb ymm3,ymm3,ymm6
+
+ vpaddd ymm4,ymm0,ymm11
+ vpaddd ymm5,ymm1,ymm11
+ vmovdqu YMMWORD[rsp],ymm4
+ vpaddd ymm6,ymm2,ymm11
+ vmovdqu YMMWORD[32+rsp],ymm5
+ vpaddd ymm7,ymm3,ymm11
+ vmovdqu YMMWORD[64+rsp],ymm6
+ vmovdqu YMMWORD[96+rsp],ymm7
+ vpalignr ymm4,ymm1,ymm0,8
+ vpsrldq ymm8,ymm3,4
+ vpxor ymm4,ymm4,ymm0
+ vpxor ymm8,ymm8,ymm2
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm8,ymm4,31
+ vpslldq ymm10,ymm4,12
+ vpaddd ymm4,ymm4,ymm4
+ vpsrld ymm9,ymm10,30
+ vpor ymm4,ymm4,ymm8
+ vpslld ymm10,ymm10,2
+ vpxor ymm4,ymm4,ymm9
+ vpxor ymm4,ymm4,ymm10
+ vpaddd ymm9,ymm4,ymm11
+ vmovdqu YMMWORD[128+rsp],ymm9
+ vpalignr ymm5,ymm2,ymm1,8
+ vpsrldq ymm8,ymm4,4
+ vpxor ymm5,ymm5,ymm1
+ vpxor ymm8,ymm8,ymm3
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm5,31
+ vmovdqu ymm11,YMMWORD[((-32))+r14]
+ vpslldq ymm10,ymm5,12
+ vpaddd ymm5,ymm5,ymm5
+ vpsrld ymm9,ymm10,30
+ vpor ymm5,ymm5,ymm8
+ vpslld ymm10,ymm10,2
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm5,ymm5,ymm10
+ vpaddd ymm9,ymm5,ymm11
+ vmovdqu YMMWORD[160+rsp],ymm9
+ vpalignr ymm6,ymm3,ymm2,8
+ vpsrldq ymm8,ymm5,4
+ vpxor ymm6,ymm6,ymm2
+ vpxor ymm8,ymm8,ymm4
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm6,31
+ vpslldq ymm10,ymm6,12
+ vpaddd ymm6,ymm6,ymm6
+ vpsrld ymm9,ymm10,30
+ vpor ymm6,ymm6,ymm8
+ vpslld ymm10,ymm10,2
+ vpxor ymm6,ymm6,ymm9
+ vpxor ymm6,ymm6,ymm10
+ vpaddd ymm9,ymm6,ymm11
+ vmovdqu YMMWORD[192+rsp],ymm9
+ vpalignr ymm7,ymm4,ymm3,8
+ vpsrldq ymm8,ymm6,4
+ vpxor ymm7,ymm7,ymm3
+ vpxor ymm8,ymm8,ymm5
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm7,31
+ vpslldq ymm10,ymm7,12
+ vpaddd ymm7,ymm7,ymm7
+ vpsrld ymm9,ymm10,30
+ vpor ymm7,ymm7,ymm8
+ vpslld ymm10,ymm10,2
+ vpxor ymm7,ymm7,ymm9
+ vpxor ymm7,ymm7,ymm10
+ vpaddd ymm9,ymm7,ymm11
+ vmovdqu YMMWORD[224+rsp],ymm9
+ lea r13,[128+rsp]
+ jmp NEAR $L$oop_avx2
+ALIGN 32
+$L$oop_avx2:
+ rorx ebx,ebp,2
+ andn edi,ebp,edx
+ and ebp,ecx
+ xor ebp,edi
+ jmp NEAR $L$align32_1
+ALIGN 32
+$L$align32_1:
+ vpalignr ymm8,ymm7,ymm6,8
+ vpxor ymm0,ymm0,ymm4
+ add esi,DWORD[((-128))+r13]
+ andn edi,eax,ecx
+ vpxor ymm0,ymm0,ymm1
+ add esi,ebp
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ vpxor ymm0,ymm0,ymm8
+ and eax,ebx
+ add esi,r12d
+ xor eax,edi
+ vpsrld ymm8,ymm0,30
+ vpslld ymm0,ymm0,2
+ add edx,DWORD[((-124))+r13]
+ andn edi,esi,ebx
+ add edx,eax
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ and esi,ebp
+ vpor ymm0,ymm0,ymm8
+ add edx,r12d
+ xor esi,edi
+ add ecx,DWORD[((-120))+r13]
+ andn edi,edx,ebp
+ vpaddd ymm9,ymm0,ymm11
+ add ecx,esi
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ and edx,eax
+ vmovdqu YMMWORD[256+rsp],ymm9
+ add ecx,r12d
+ xor edx,edi
+ add ebx,DWORD[((-116))+r13]
+ andn edi,ecx,eax
+ add ebx,edx
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ and ecx,esi
+ add ebx,r12d
+ xor ecx,edi
+ add ebp,DWORD[((-96))+r13]
+ andn edi,ebx,esi
+ add ebp,ecx
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ and ebx,edx
+ add ebp,r12d
+ xor ebx,edi
+ vpalignr ymm8,ymm0,ymm7,8
+ vpxor ymm1,ymm1,ymm5
+ add eax,DWORD[((-92))+r13]
+ andn edi,ebp,edx
+ vpxor ymm1,ymm1,ymm2
+ add eax,ebx
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ vpxor ymm1,ymm1,ymm8
+ and ebp,ecx
+ add eax,r12d
+ xor ebp,edi
+ vpsrld ymm8,ymm1,30
+ vpslld ymm1,ymm1,2
+ add esi,DWORD[((-88))+r13]
+ andn edi,eax,ecx
+ add esi,ebp
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ and eax,ebx
+ vpor ymm1,ymm1,ymm8
+ add esi,r12d
+ xor eax,edi
+ add edx,DWORD[((-84))+r13]
+ andn edi,esi,ebx
+ vpaddd ymm9,ymm1,ymm11
+ add edx,eax
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ and esi,ebp
+ vmovdqu YMMWORD[288+rsp],ymm9
+ add edx,r12d
+ xor esi,edi
+ add ecx,DWORD[((-64))+r13]
+ andn edi,edx,ebp
+ add ecx,esi
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ and edx,eax
+ add ecx,r12d
+ xor edx,edi
+ add ebx,DWORD[((-60))+r13]
+ andn edi,ecx,eax
+ add ebx,edx
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ and ecx,esi
+ add ebx,r12d
+ xor ecx,edi
+ vpalignr ymm8,ymm1,ymm0,8
+ vpxor ymm2,ymm2,ymm6
+ add ebp,DWORD[((-56))+r13]
+ andn edi,ebx,esi
+ vpxor ymm2,ymm2,ymm3
+ vmovdqu ymm11,YMMWORD[r14]
+ add ebp,ecx
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ vpxor ymm2,ymm2,ymm8
+ and ebx,edx
+ add ebp,r12d
+ xor ebx,edi
+ vpsrld ymm8,ymm2,30
+ vpslld ymm2,ymm2,2
+ add eax,DWORD[((-52))+r13]
+ andn edi,ebp,edx
+ add eax,ebx
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ and ebp,ecx
+ vpor ymm2,ymm2,ymm8
+ add eax,r12d
+ xor ebp,edi
+ add esi,DWORD[((-32))+r13]
+ andn edi,eax,ecx
+ vpaddd ymm9,ymm2,ymm11
+ add esi,ebp
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ and eax,ebx
+ vmovdqu YMMWORD[320+rsp],ymm9
+ add esi,r12d
+ xor eax,edi
+ add edx,DWORD[((-28))+r13]
+ andn edi,esi,ebx
+ add edx,eax
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ and esi,ebp
+ add edx,r12d
+ xor esi,edi
+ add ecx,DWORD[((-24))+r13]
+ andn edi,edx,ebp
+ add ecx,esi
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ and edx,eax
+ add ecx,r12d
+ xor edx,edi
+ vpalignr ymm8,ymm2,ymm1,8
+ vpxor ymm3,ymm3,ymm7
+ add ebx,DWORD[((-20))+r13]
+ andn edi,ecx,eax
+ vpxor ymm3,ymm3,ymm4
+ add ebx,edx
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ vpxor ymm3,ymm3,ymm8
+ and ecx,esi
+ add ebx,r12d
+ xor ecx,edi
+ vpsrld ymm8,ymm3,30
+ vpslld ymm3,ymm3,2
+ add ebp,DWORD[r13]
+ andn edi,ebx,esi
+ add ebp,ecx
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ and ebx,edx
+ vpor ymm3,ymm3,ymm8
+ add ebp,r12d
+ xor ebx,edi
+ add eax,DWORD[4+r13]
+ andn edi,ebp,edx
+ vpaddd ymm9,ymm3,ymm11
+ add eax,ebx
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ and ebp,ecx
+ vmovdqu YMMWORD[352+rsp],ymm9
+ add eax,r12d
+ xor ebp,edi
+ add esi,DWORD[8+r13]
+ andn edi,eax,ecx
+ add esi,ebp
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ and eax,ebx
+ add esi,r12d
+ xor eax,edi
+ add edx,DWORD[12+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ vpalignr ymm8,ymm3,ymm2,8
+ vpxor ymm4,ymm4,ymm0
+ add ecx,DWORD[32+r13]
+ lea ecx,[rsi*1+rcx]
+ vpxor ymm4,ymm4,ymm5
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ vpxor ymm4,ymm4,ymm8
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[36+r13]
+ vpsrld ymm8,ymm4,30
+ vpslld ymm4,ymm4,2
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ vpor ymm4,ymm4,ymm8
+ add ebp,DWORD[40+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ vpaddd ymm9,ymm4,ymm11
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ add eax,DWORD[44+r13]
+ vmovdqu YMMWORD[384+rsp],ymm9
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ add esi,DWORD[64+r13]
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ vpalignr ymm8,ymm4,ymm3,8
+ vpxor ymm5,ymm5,ymm1
+ add edx,DWORD[68+r13]
+ lea edx,[rax*1+rdx]
+ vpxor ymm5,ymm5,ymm6
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ vpxor ymm5,ymm5,ymm8
+ add edx,r12d
+ xor esi,ebx
+ add ecx,DWORD[72+r13]
+ vpsrld ymm8,ymm5,30
+ vpslld ymm5,ymm5,2
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ vpor ymm5,ymm5,ymm8
+ add ebx,DWORD[76+r13]
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ vpaddd ymm9,ymm5,ymm11
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ add ebp,DWORD[96+r13]
+ vmovdqu YMMWORD[416+rsp],ymm9
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ add eax,DWORD[100+r13]
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ vpalignr ymm8,ymm5,ymm4,8
+ vpxor ymm6,ymm6,ymm2
+ add esi,DWORD[104+r13]
+ lea esi,[rbp*1+rsi]
+ vpxor ymm6,ymm6,ymm7
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ vpxor ymm6,ymm6,ymm8
+ add esi,r12d
+ xor eax,ecx
+ add edx,DWORD[108+r13]
+ lea r13,[256+r13]
+ vpsrld ymm8,ymm6,30
+ vpslld ymm6,ymm6,2
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ vpor ymm6,ymm6,ymm8
+ add ecx,DWORD[((-128))+r13]
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ vpaddd ymm9,ymm6,ymm11
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[((-124))+r13]
+ vmovdqu YMMWORD[448+rsp],ymm9
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ add ebp,DWORD[((-120))+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ vpalignr ymm8,ymm6,ymm5,8
+ vpxor ymm7,ymm7,ymm3
+ add eax,DWORD[((-116))+r13]
+ lea eax,[rbx*1+rax]
+ vpxor ymm7,ymm7,ymm0
+ vmovdqu ymm11,YMMWORD[32+r14]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ vpxor ymm7,ymm7,ymm8
+ add eax,r12d
+ xor ebp,edx
+ add esi,DWORD[((-96))+r13]
+ vpsrld ymm8,ymm7,30
+ vpslld ymm7,ymm7,2
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ vpor ymm7,ymm7,ymm8
+ add edx,DWORD[((-92))+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ vpaddd ymm9,ymm7,ymm11
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ add ecx,DWORD[((-88))+r13]
+ vmovdqu YMMWORD[480+rsp],ymm9
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[((-84))+r13]
+ mov edi,esi
+ xor edi,eax
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ and ecx,edi
+ jmp NEAR $L$align32_2
+ALIGN 32
+$L$align32_2:
+ vpalignr ymm8,ymm7,ymm6,8
+ vpxor ymm0,ymm0,ymm4
+ add ebp,DWORD[((-64))+r13]
+ xor ecx,esi
+ vpxor ymm0,ymm0,ymm1
+ mov edi,edx
+ xor edi,esi
+ lea ebp,[rbp*1+rcx]
+ vpxor ymm0,ymm0,ymm8
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ vpsrld ymm8,ymm0,30
+ vpslld ymm0,ymm0,2
+ add ebp,r12d
+ and ebx,edi
+ add eax,DWORD[((-60))+r13]
+ xor ebx,edx
+ mov edi,ecx
+ xor edi,edx
+ vpor ymm0,ymm0,ymm8
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ vpaddd ymm9,ymm0,ymm11
+ add eax,r12d
+ and ebp,edi
+ add esi,DWORD[((-56))+r13]
+ xor ebp,ecx
+ vmovdqu YMMWORD[512+rsp],ymm9
+ mov edi,ebx
+ xor edi,ecx
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ and eax,edi
+ add edx,DWORD[((-52))+r13]
+ xor eax,ebx
+ mov edi,ebp
+ xor edi,ebx
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ and esi,edi
+ add ecx,DWORD[((-32))+r13]
+ xor esi,ebp
+ mov edi,eax
+ xor edi,ebp
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ and edx,edi
+ vpalignr ymm8,ymm0,ymm7,8
+ vpxor ymm1,ymm1,ymm5
+ add ebx,DWORD[((-28))+r13]
+ xor edx,eax
+ vpxor ymm1,ymm1,ymm2
+ mov edi,esi
+ xor edi,eax
+ lea ebx,[rdx*1+rbx]
+ vpxor ymm1,ymm1,ymm8
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ vpsrld ymm8,ymm1,30
+ vpslld ymm1,ymm1,2
+ add ebx,r12d
+ and ecx,edi
+ add ebp,DWORD[((-24))+r13]
+ xor ecx,esi
+ mov edi,edx
+ xor edi,esi
+ vpor ymm1,ymm1,ymm8
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ vpaddd ymm9,ymm1,ymm11
+ add ebp,r12d
+ and ebx,edi
+ add eax,DWORD[((-20))+r13]
+ xor ebx,edx
+ vmovdqu YMMWORD[544+rsp],ymm9
+ mov edi,ecx
+ xor edi,edx
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ and ebp,edi
+ add esi,DWORD[r13]
+ xor ebp,ecx
+ mov edi,ebx
+ xor edi,ecx
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ and eax,edi
+ add edx,DWORD[4+r13]
+ xor eax,ebx
+ mov edi,ebp
+ xor edi,ebx
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ and esi,edi
+ vpalignr ymm8,ymm1,ymm0,8
+ vpxor ymm2,ymm2,ymm6
+ add ecx,DWORD[8+r13]
+ xor esi,ebp
+ vpxor ymm2,ymm2,ymm3
+ mov edi,eax
+ xor edi,ebp
+ lea ecx,[rsi*1+rcx]
+ vpxor ymm2,ymm2,ymm8
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ vpsrld ymm8,ymm2,30
+ vpslld ymm2,ymm2,2
+ add ecx,r12d
+ and edx,edi
+ add ebx,DWORD[12+r13]
+ xor edx,eax
+ mov edi,esi
+ xor edi,eax
+ vpor ymm2,ymm2,ymm8
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ vpaddd ymm9,ymm2,ymm11
+ add ebx,r12d
+ and ecx,edi
+ add ebp,DWORD[32+r13]
+ xor ecx,esi
+ vmovdqu YMMWORD[576+rsp],ymm9
+ mov edi,edx
+ xor edi,esi
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ and ebx,edi
+ add eax,DWORD[36+r13]
+ xor ebx,edx
+ mov edi,ecx
+ xor edi,edx
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ and ebp,edi
+ add esi,DWORD[40+r13]
+ xor ebp,ecx
+ mov edi,ebx
+ xor edi,ecx
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ and eax,edi
+ vpalignr ymm8,ymm2,ymm1,8
+ vpxor ymm3,ymm3,ymm7
+ add edx,DWORD[44+r13]
+ xor eax,ebx
+ vpxor ymm3,ymm3,ymm4
+ mov edi,ebp
+ xor edi,ebx
+ lea edx,[rax*1+rdx]
+ vpxor ymm3,ymm3,ymm8
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ vpsrld ymm8,ymm3,30
+ vpslld ymm3,ymm3,2
+ add edx,r12d
+ and esi,edi
+ add ecx,DWORD[64+r13]
+ xor esi,ebp
+ mov edi,eax
+ xor edi,ebp
+ vpor ymm3,ymm3,ymm8
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ vpaddd ymm9,ymm3,ymm11
+ add ecx,r12d
+ and edx,edi
+ add ebx,DWORD[68+r13]
+ xor edx,eax
+ vmovdqu YMMWORD[608+rsp],ymm9
+ mov edi,esi
+ xor edi,eax
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ and ecx,edi
+ add ebp,DWORD[72+r13]
+ xor ecx,esi
+ mov edi,edx
+ xor edi,esi
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ and ebx,edi
+ add eax,DWORD[76+r13]
+ xor ebx,edx
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ add esi,DWORD[96+r13]
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ add edx,DWORD[100+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ add ecx,DWORD[104+r13]
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[108+r13]
+ lea r13,[256+r13]
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ add ebp,DWORD[((-128))+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ add eax,DWORD[((-124))+r13]
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ add esi,DWORD[((-120))+r13]
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ add edx,DWORD[((-116))+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ add ecx,DWORD[((-96))+r13]
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[((-92))+r13]
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ add ebp,DWORD[((-88))+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ add eax,DWORD[((-84))+r13]
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ add esi,DWORD[((-64))+r13]
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ add edx,DWORD[((-60))+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ add ecx,DWORD[((-56))+r13]
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[((-52))+r13]
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ add ebp,DWORD[((-32))+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ add eax,DWORD[((-28))+r13]
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ add esi,DWORD[((-24))+r13]
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ add edx,DWORD[((-20))+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ add edx,r12d
+ lea r13,[128+r9]
+ lea rdi,[128+r9]
+ cmp r13,r10
+ cmovae r13,r9
+
+
+ add edx,DWORD[r8]
+ add esi,DWORD[4+r8]
+ add ebp,DWORD[8+r8]
+ mov DWORD[r8],edx
+ add ebx,DWORD[12+r8]
+ mov DWORD[4+r8],esi
+ mov eax,edx
+ add ecx,DWORD[16+r8]
+ mov r12d,ebp
+ mov DWORD[8+r8],ebp
+ mov edx,ebx
+
+ mov DWORD[12+r8],ebx
+ mov ebp,esi
+ mov DWORD[16+r8],ecx
+
+ mov esi,ecx
+ mov ecx,r12d
+
+
+ cmp r9,r10
+ je NEAR $L$done_avx2
+ vmovdqu ymm6,YMMWORD[64+r14]
+ cmp rdi,r10
+ ja NEAR $L$ast_avx2
+
+ vmovdqu xmm0,XMMWORD[((-64))+rdi]
+ vmovdqu xmm1,XMMWORD[((-48))+rdi]
+ vmovdqu xmm2,XMMWORD[((-32))+rdi]
+ vmovdqu xmm3,XMMWORD[((-16))+rdi]
+ vinserti128 ymm0,ymm0,XMMWORD[r13],1
+ vinserti128 ymm1,ymm1,XMMWORD[16+r13],1
+ vinserti128 ymm2,ymm2,XMMWORD[32+r13],1
+ vinserti128 ymm3,ymm3,XMMWORD[48+r13],1
+ jmp NEAR $L$ast_avx2
+
+ALIGN 32
+$L$ast_avx2:
+ lea r13,[((128+16))+rsp]
+ rorx ebx,ebp,2
+ andn edi,ebp,edx
+ and ebp,ecx
+ xor ebp,edi
+ sub r9,-128
+ add esi,DWORD[((-128))+r13]
+ andn edi,eax,ecx
+ add esi,ebp
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ and eax,ebx
+ add esi,r12d
+ xor eax,edi
+ add edx,DWORD[((-124))+r13]
+ andn edi,esi,ebx
+ add edx,eax
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ and esi,ebp
+ add edx,r12d
+ xor esi,edi
+ add ecx,DWORD[((-120))+r13]
+ andn edi,edx,ebp
+ add ecx,esi
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ and edx,eax
+ add ecx,r12d
+ xor edx,edi
+ add ebx,DWORD[((-116))+r13]
+ andn edi,ecx,eax
+ add ebx,edx
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ and ecx,esi
+ add ebx,r12d
+ xor ecx,edi
+ add ebp,DWORD[((-96))+r13]
+ andn edi,ebx,esi
+ add ebp,ecx
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ and ebx,edx
+ add ebp,r12d
+ xor ebx,edi
+ add eax,DWORD[((-92))+r13]
+ andn edi,ebp,edx
+ add eax,ebx
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ and ebp,ecx
+ add eax,r12d
+ xor ebp,edi
+ add esi,DWORD[((-88))+r13]
+ andn edi,eax,ecx
+ add esi,ebp
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ and eax,ebx
+ add esi,r12d
+ xor eax,edi
+ add edx,DWORD[((-84))+r13]
+ andn edi,esi,ebx
+ add edx,eax
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ and esi,ebp
+ add edx,r12d
+ xor esi,edi
+ add ecx,DWORD[((-64))+r13]
+ andn edi,edx,ebp
+ add ecx,esi
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ and edx,eax
+ add ecx,r12d
+ xor edx,edi
+ add ebx,DWORD[((-60))+r13]
+ andn edi,ecx,eax
+ add ebx,edx
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ and ecx,esi
+ add ebx,r12d
+ xor ecx,edi
+ add ebp,DWORD[((-56))+r13]
+ andn edi,ebx,esi
+ add ebp,ecx
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ and ebx,edx
+ add ebp,r12d
+ xor ebx,edi
+ add eax,DWORD[((-52))+r13]
+ andn edi,ebp,edx
+ add eax,ebx
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ and ebp,ecx
+ add eax,r12d
+ xor ebp,edi
+ add esi,DWORD[((-32))+r13]
+ andn edi,eax,ecx
+ add esi,ebp
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ and eax,ebx
+ add esi,r12d
+ xor eax,edi
+ add edx,DWORD[((-28))+r13]
+ andn edi,esi,ebx
+ add edx,eax
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ and esi,ebp
+ add edx,r12d
+ xor esi,edi
+ add ecx,DWORD[((-24))+r13]
+ andn edi,edx,ebp
+ add ecx,esi
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ and edx,eax
+ add ecx,r12d
+ xor edx,edi
+ add ebx,DWORD[((-20))+r13]
+ andn edi,ecx,eax
+ add ebx,edx
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ and ecx,esi
+ add ebx,r12d
+ xor ecx,edi
+ add ebp,DWORD[r13]
+ andn edi,ebx,esi
+ add ebp,ecx
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ and ebx,edx
+ add ebp,r12d
+ xor ebx,edi
+ add eax,DWORD[4+r13]
+ andn edi,ebp,edx
+ add eax,ebx
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ and ebp,ecx
+ add eax,r12d
+ xor ebp,edi
+ add esi,DWORD[8+r13]
+ andn edi,eax,ecx
+ add esi,ebp
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ and eax,ebx
+ add esi,r12d
+ xor eax,edi
+ add edx,DWORD[12+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ add ecx,DWORD[32+r13]
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[36+r13]
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ add ebp,DWORD[40+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ add eax,DWORD[44+r13]
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ add esi,DWORD[64+r13]
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ vmovdqu ymm11,YMMWORD[((-64))+r14]
+ vpshufb ymm0,ymm0,ymm6
+ add edx,DWORD[68+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ add ecx,DWORD[72+r13]
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[76+r13]
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ add ebp,DWORD[96+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ add eax,DWORD[100+r13]
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ vpshufb ymm1,ymm1,ymm6
+ vpaddd ymm8,ymm0,ymm11
+ add esi,DWORD[104+r13]
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ add edx,DWORD[108+r13]
+ lea r13,[256+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ add ecx,DWORD[((-128))+r13]
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[((-124))+r13]
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ add ebp,DWORD[((-120))+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ vmovdqu YMMWORD[rsp],ymm8
+ vpshufb ymm2,ymm2,ymm6
+ vpaddd ymm9,ymm1,ymm11
+ add eax,DWORD[((-116))+r13]
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ add esi,DWORD[((-96))+r13]
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ add edx,DWORD[((-92))+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ add ecx,DWORD[((-88))+r13]
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[((-84))+r13]
+ mov edi,esi
+ xor edi,eax
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ and ecx,edi
+ vmovdqu YMMWORD[32+rsp],ymm9
+ vpshufb ymm3,ymm3,ymm6
+ vpaddd ymm6,ymm2,ymm11
+ add ebp,DWORD[((-64))+r13]
+ xor ecx,esi
+ mov edi,edx
+ xor edi,esi
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ and ebx,edi
+ add eax,DWORD[((-60))+r13]
+ xor ebx,edx
+ mov edi,ecx
+ xor edi,edx
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ and ebp,edi
+ add esi,DWORD[((-56))+r13]
+ xor ebp,ecx
+ mov edi,ebx
+ xor edi,ecx
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ and eax,edi
+ add edx,DWORD[((-52))+r13]
+ xor eax,ebx
+ mov edi,ebp
+ xor edi,ebx
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ and esi,edi
+ add ecx,DWORD[((-32))+r13]
+ xor esi,ebp
+ mov edi,eax
+ xor edi,ebp
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ and edx,edi
+ jmp NEAR $L$align32_3
+ALIGN 32
+$L$align32_3:
+ vmovdqu YMMWORD[64+rsp],ymm6
+ vpaddd ymm7,ymm3,ymm11
+ add ebx,DWORD[((-28))+r13]
+ xor edx,eax
+ mov edi,esi
+ xor edi,eax
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ and ecx,edi
+ add ebp,DWORD[((-24))+r13]
+ xor ecx,esi
+ mov edi,edx
+ xor edi,esi
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ and ebx,edi
+ add eax,DWORD[((-20))+r13]
+ xor ebx,edx
+ mov edi,ecx
+ xor edi,edx
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ and ebp,edi
+ add esi,DWORD[r13]
+ xor ebp,ecx
+ mov edi,ebx
+ xor edi,ecx
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ and eax,edi
+ add edx,DWORD[4+r13]
+ xor eax,ebx
+ mov edi,ebp
+ xor edi,ebx
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ and esi,edi
+ vmovdqu YMMWORD[96+rsp],ymm7
+ add ecx,DWORD[8+r13]
+ xor esi,ebp
+ mov edi,eax
+ xor edi,ebp
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ and edx,edi
+ add ebx,DWORD[12+r13]
+ xor edx,eax
+ mov edi,esi
+ xor edi,eax
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ and ecx,edi
+ add ebp,DWORD[32+r13]
+ xor ecx,esi
+ mov edi,edx
+ xor edi,esi
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ and ebx,edi
+ add eax,DWORD[36+r13]
+ xor ebx,edx
+ mov edi,ecx
+ xor edi,edx
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ and ebp,edi
+ add esi,DWORD[40+r13]
+ xor ebp,ecx
+ mov edi,ebx
+ xor edi,ecx
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ and eax,edi
+ vpalignr ymm4,ymm1,ymm0,8
+ add edx,DWORD[44+r13]
+ xor eax,ebx
+ mov edi,ebp
+ xor edi,ebx
+ vpsrldq ymm8,ymm3,4
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ vpxor ymm4,ymm4,ymm0
+ vpxor ymm8,ymm8,ymm2
+ xor esi,ebp
+ add edx,r12d
+ vpxor ymm4,ymm4,ymm8
+ and esi,edi
+ add ecx,DWORD[64+r13]
+ xor esi,ebp
+ mov edi,eax
+ vpsrld ymm8,ymm4,31
+ xor edi,ebp
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ vpslldq ymm10,ymm4,12
+ vpaddd ymm4,ymm4,ymm4
+ rorx esi,edx,2
+ xor edx,eax
+ vpsrld ymm9,ymm10,30
+ vpor ymm4,ymm4,ymm8
+ add ecx,r12d
+ and edx,edi
+ vpslld ymm10,ymm10,2
+ vpxor ymm4,ymm4,ymm9
+ add ebx,DWORD[68+r13]
+ xor edx,eax
+ vpxor ymm4,ymm4,ymm10
+ mov edi,esi
+ xor edi,eax
+ lea ebx,[rdx*1+rbx]
+ vpaddd ymm9,ymm4,ymm11
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ vmovdqu YMMWORD[128+rsp],ymm9
+ add ebx,r12d
+ and ecx,edi
+ add ebp,DWORD[72+r13]
+ xor ecx,esi
+ mov edi,edx
+ xor edi,esi
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ and ebx,edi
+ add eax,DWORD[76+r13]
+ xor ebx,edx
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ vpalignr ymm5,ymm2,ymm1,8
+ add esi,DWORD[96+r13]
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ vpsrldq ymm8,ymm4,4
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ vpxor ymm5,ymm5,ymm1
+ vpxor ymm8,ymm8,ymm3
+ add edx,DWORD[100+r13]
+ lea edx,[rax*1+rdx]
+ vpxor ymm5,ymm5,ymm8
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ xor esi,ebp
+ add edx,r12d
+ vpsrld ymm8,ymm5,31
+ vmovdqu ymm11,YMMWORD[((-32))+r14]
+ xor esi,ebx
+ add ecx,DWORD[104+r13]
+ lea ecx,[rsi*1+rcx]
+ vpslldq ymm10,ymm5,12
+ vpaddd ymm5,ymm5,ymm5
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ vpsrld ymm9,ymm10,30
+ vpor ymm5,ymm5,ymm8
+ xor edx,eax
+ add ecx,r12d
+ vpslld ymm10,ymm10,2
+ vpxor ymm5,ymm5,ymm9
+ xor edx,ebp
+ add ebx,DWORD[108+r13]
+ lea r13,[256+r13]
+ vpxor ymm5,ymm5,ymm10
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ vpaddd ymm9,ymm5,ymm11
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ vmovdqu YMMWORD[160+rsp],ymm9
+ add ebp,DWORD[((-128))+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ vpalignr ymm6,ymm3,ymm2,8
+ add eax,DWORD[((-124))+r13]
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ vpsrldq ymm8,ymm5,4
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ vpxor ymm6,ymm6,ymm2
+ vpxor ymm8,ymm8,ymm4
+ add esi,DWORD[((-120))+r13]
+ lea esi,[rbp*1+rsi]
+ vpxor ymm6,ymm6,ymm8
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ vpsrld ymm8,ymm6,31
+ xor eax,ecx
+ add edx,DWORD[((-116))+r13]
+ lea edx,[rax*1+rdx]
+ vpslldq ymm10,ymm6,12
+ vpaddd ymm6,ymm6,ymm6
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ vpsrld ymm9,ymm10,30
+ vpor ymm6,ymm6,ymm8
+ xor esi,ebp
+ add edx,r12d
+ vpslld ymm10,ymm10,2
+ vpxor ymm6,ymm6,ymm9
+ xor esi,ebx
+ add ecx,DWORD[((-96))+r13]
+ vpxor ymm6,ymm6,ymm10
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ vpaddd ymm9,ymm6,ymm11
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ vmovdqu YMMWORD[192+rsp],ymm9
+ add ebx,DWORD[((-92))+r13]
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ vpalignr ymm7,ymm4,ymm3,8
+ add ebp,DWORD[((-88))+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ vpsrldq ymm8,ymm6,4
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ vpxor ymm7,ymm7,ymm3
+ vpxor ymm8,ymm8,ymm5
+ add eax,DWORD[((-84))+r13]
+ lea eax,[rbx*1+rax]
+ vpxor ymm7,ymm7,ymm8
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ vpsrld ymm8,ymm7,31
+ xor ebp,edx
+ add esi,DWORD[((-64))+r13]
+ lea esi,[rbp*1+rsi]
+ vpslldq ymm10,ymm7,12
+ vpaddd ymm7,ymm7,ymm7
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ vpsrld ymm9,ymm10,30
+ vpor ymm7,ymm7,ymm8
+ xor eax,ebx
+ add esi,r12d
+ vpslld ymm10,ymm10,2
+ vpxor ymm7,ymm7,ymm9
+ xor eax,ecx
+ add edx,DWORD[((-60))+r13]
+ vpxor ymm7,ymm7,ymm10
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ rorx eax,esi,2
+ vpaddd ymm9,ymm7,ymm11
+ xor esi,ebp
+ add edx,r12d
+ xor esi,ebx
+ vmovdqu YMMWORD[224+rsp],ymm9
+ add ecx,DWORD[((-56))+r13]
+ lea ecx,[rsi*1+rcx]
+ rorx r12d,edx,27
+ rorx esi,edx,2
+ xor edx,eax
+ add ecx,r12d
+ xor edx,ebp
+ add ebx,DWORD[((-52))+r13]
+ lea ebx,[rdx*1+rbx]
+ rorx r12d,ecx,27
+ rorx edx,ecx,2
+ xor ecx,esi
+ add ebx,r12d
+ xor ecx,eax
+ add ebp,DWORD[((-32))+r13]
+ lea ebp,[rbp*1+rcx]
+ rorx r12d,ebx,27
+ rorx ecx,ebx,2
+ xor ebx,edx
+ add ebp,r12d
+ xor ebx,esi
+ add eax,DWORD[((-28))+r13]
+ lea eax,[rbx*1+rax]
+ rorx r12d,ebp,27
+ rorx ebx,ebp,2
+ xor ebp,ecx
+ add eax,r12d
+ xor ebp,edx
+ add esi,DWORD[((-24))+r13]
+ lea esi,[rbp*1+rsi]
+ rorx r12d,eax,27
+ rorx ebp,eax,2
+ xor eax,ebx
+ add esi,r12d
+ xor eax,ecx
+ add edx,DWORD[((-20))+r13]
+ lea edx,[rax*1+rdx]
+ rorx r12d,esi,27
+ add edx,r12d
+ lea r13,[128+rsp]
+
+
+ add edx,DWORD[r8]
+ add esi,DWORD[4+r8]
+ add ebp,DWORD[8+r8]
+ mov DWORD[r8],edx
+ add ebx,DWORD[12+r8]
+ mov DWORD[4+r8],esi
+ mov eax,edx
+ add ecx,DWORD[16+r8]
+ mov r12d,ebp
+ mov DWORD[8+r8],ebp
+ mov edx,ebx
+
+ mov DWORD[12+r8],ebx
+ mov ebp,esi
+ mov DWORD[16+r8],ecx
+
+ mov esi,ecx
+ mov ecx,r12d
+
+
+ cmp r9,r10
+ jbe NEAR $L$oop_avx2
+
+$L$done_avx2:
+ vzeroupper
+ movaps xmm6,XMMWORD[((-40-96))+r11]
+ movaps xmm7,XMMWORD[((-40-80))+r11]
+ movaps xmm8,XMMWORD[((-40-64))+r11]
+ movaps xmm9,XMMWORD[((-40-48))+r11]
+ movaps xmm10,XMMWORD[((-40-32))+r11]
+ movaps xmm11,XMMWORD[((-40-16))+r11]
+ mov r14,QWORD[((-40))+r11]
+
+ mov r13,QWORD[((-32))+r11]
+
+ mov r12,QWORD[((-24))+r11]
+
+ mov rbp,QWORD[((-16))+r11]
+
+ mov rbx,QWORD[((-8))+r11]
+
+ lea rsp,[r11]
+
+$L$epilogue_avx2:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_sha1_block_data_order_avx2:
+section .rdata rdata align=8
+ALIGN 64
+K_XX_XX:
+ DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+ DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+ DD 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+ DD 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+ DD 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+ DD 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+ DD 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+ DD 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+ DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+ DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+ DB 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
+ DB 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
+ DB 102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44
+ DB 32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60
+ DB 97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114
+ DB 103,62,0
+ALIGN 64
+section .text
+
+EXTERN __imp_RtlVirtualUnwind
+
+ALIGN 16
+se_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ lea r10,[$L$prologue]
+ cmp rbx,r10
+ jb NEAR $L$common_seh_tail
+
+ mov rax,QWORD[152+r8]
+
+ lea r10,[$L$epilogue]
+ cmp rbx,r10
+ jae NEAR $L$common_seh_tail
+
+ mov rax,QWORD[64+rax]
+
+ mov rbx,QWORD[((-8))+rax]
+ mov rbp,QWORD[((-16))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov QWORD[144+r8],rbx
+ mov QWORD[160+r8],rbp
+ mov QWORD[216+r8],r12
+ mov QWORD[224+r8],r13
+ mov QWORD[232+r8],r14
+
+ jmp NEAR $L$common_seh_tail
+
+
+ALIGN 16
+shaext_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ lea r10,[$L$prologue_shaext]
+ cmp rbx,r10
+ jb NEAR $L$common_seh_tail
+
+ lea r10,[$L$epilogue_shaext]
+ cmp rbx,r10
+ jae NEAR $L$common_seh_tail
+
+ lea rsi,[((-8-64))+rax]
+ lea rdi,[512+r8]
+ mov ecx,8
+ DD 0xa548f3fc
+
+ jmp NEAR $L$common_seh_tail
+
+
+ALIGN 16
+ssse3_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ mov rsi,QWORD[8+r9]
+ mov r11,QWORD[56+r9]
+
+ mov r10d,DWORD[r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$common_seh_tail
+
+ mov rax,QWORD[208+r8]
+
+ mov r10d,DWORD[4+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$common_seh_tail
+
+ lea rsi,[((-40-96))+rax]
+ lea rdi,[512+r8]
+ mov ecx,12
+ DD 0xa548f3fc
+
+ mov rbx,QWORD[((-8))+rax]
+ mov rbp,QWORD[((-16))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov QWORD[144+r8],rbx
+ mov QWORD[160+r8],rbp
+ mov QWORD[216+r8],r12
+ mov QWORD[224+r8],r13
+ mov QWORD[232+r8],r14
+
+$L$common_seh_tail:
+ mov rdi,QWORD[8+rax]
+ mov rsi,QWORD[16+rax]
+ mov QWORD[152+r8],rax
+ mov QWORD[168+r8],rsi
+ mov QWORD[176+r8],rdi
+
+ mov rdi,QWORD[40+r9]
+ mov rsi,r8
+ mov ecx,154
+ DD 0xa548f3fc
+
+ mov rsi,r9
+ xor rcx,rcx
+ mov rdx,QWORD[8+rsi]
+ mov r8,QWORD[rsi]
+ mov r9,QWORD[16+rsi]
+ mov r10,QWORD[40+rsi]
+ lea r11,[56+rsi]
+ lea r12,[24+rsi]
+ mov QWORD[32+rsp],r10
+ mov QWORD[40+rsp],r11
+ mov QWORD[48+rsp],r12
+ mov QWORD[56+rsp],rcx
+ call QWORD[__imp_RtlVirtualUnwind]
+
+ mov eax,1
+ add rsp,64
+ popfq
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ ret
+
+
+section .pdata rdata align=4
+ALIGN 4
+ DD $L$SEH_begin_sha1_block_data_order_nohw wrt ..imagebase
+ DD $L$SEH_end_sha1_block_data_order_nohw wrt ..imagebase
+ DD $L$SEH_info_sha1_block_data_order_nohw wrt ..imagebase
+ DD $L$SEH_begin_sha1_block_data_order_hw wrt ..imagebase
+ DD $L$SEH_end_sha1_block_data_order_hw wrt ..imagebase
+ DD $L$SEH_info_sha1_block_data_order_hw wrt ..imagebase
+ DD $L$SEH_begin_sha1_block_data_order_ssse3 wrt ..imagebase
+ DD $L$SEH_end_sha1_block_data_order_ssse3 wrt ..imagebase
+ DD $L$SEH_info_sha1_block_data_order_ssse3 wrt ..imagebase
+ DD $L$SEH_begin_sha1_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_end_sha1_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_info_sha1_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_begin_sha1_block_data_order_avx2 wrt ..imagebase
+ DD $L$SEH_end_sha1_block_data_order_avx2 wrt ..imagebase
+ DD $L$SEH_info_sha1_block_data_order_avx2 wrt ..imagebase
+section .xdata rdata align=8
+ALIGN 8
+$L$SEH_info_sha1_block_data_order_nohw:
+ DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+$L$SEH_info_sha1_block_data_order_hw:
+ DB 9,0,0,0
+ DD shaext_handler wrt ..imagebase
+$L$SEH_info_sha1_block_data_order_ssse3:
+ DB 9,0,0,0
+ DD ssse3_handler wrt ..imagebase
+ DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase
+$L$SEH_info_sha1_block_data_order_avx:
+ DB 9,0,0,0
+ DD ssse3_handler wrt ..imagebase
+ DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase
+$L$SEH_info_sha1_block_data_order_avx2:
+ DB 9,0,0,0
+ DD ssse3_handler wrt ..imagebase
+ DD $L$prologue_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/sha256-586-apple.S b/gen/bcm/sha256-586-apple.S
new file mode 100644
index 0000000..8e74e68
--- /dev/null
+++ b/gen/bcm/sha256-586-apple.S
@@ -0,0 +1,5593 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl _sha256_block_data_order_nohw
+.private_extern _sha256_block_data_order_nohw
+.align 4
+_sha256_block_data_order_nohw:
+L_sha256_block_data_order_nohw_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl %esp,%ebx
+ call L000pic_point
+L000pic_point:
+ popl %ebp
+ leal LK256-L000pic_point(%ebp),%ebp
+ subl $16,%esp
+ andl $-64,%esp
+ shll $6,%eax
+ addl %edi,%eax
+ movl %esi,(%esp)
+ movl %edi,4(%esp)
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+L001no_xmm:
+ subl %edi,%eax
+ cmpl $256,%eax
+ jae L002unrolled
+ jmp L003loop
+.align 4,0x90
+L003loop:
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ bswap %eax
+ movl 12(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ movl 16(%edi),%eax
+ movl 20(%edi),%ebx
+ movl 24(%edi),%ecx
+ bswap %eax
+ movl 28(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ movl 32(%edi),%eax
+ movl 36(%edi),%ebx
+ movl 40(%edi),%ecx
+ bswap %eax
+ movl 44(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ movl 48(%edi),%eax
+ movl 52(%edi),%ebx
+ movl 56(%edi),%ecx
+ bswap %eax
+ movl 60(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ addl $64,%edi
+ leal -36(%esp),%esp
+ movl %edi,104(%esp)
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,8(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,12(%esp)
+ movl %edi,16(%esp)
+ movl %ebx,(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edi
+ movl %ebx,24(%esp)
+ movl %ecx,28(%esp)
+ movl %edi,32(%esp)
+.align 4,0x90
+L00400_15:
+ movl %edx,%ecx
+ movl 24(%esp),%esi
+ rorl $14,%ecx
+ movl 28(%esp),%edi
+ xorl %edx,%ecx
+ xorl %edi,%esi
+ movl 96(%esp),%ebx
+ rorl $5,%ecx
+ andl %edx,%esi
+ movl %edx,20(%esp)
+ xorl %ecx,%edx
+ addl 32(%esp),%ebx
+ xorl %edi,%esi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %esi,%ebx
+ rorl $9,%ecx
+ addl %edx,%ebx
+ movl 8(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,4(%esp)
+ leal -4(%esp),%esp
+ rorl $11,%ecx
+ movl (%ebp),%esi
+ xorl %eax,%ecx
+ movl 20(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %esi,%ebx
+ movl %eax,(%esp)
+ addl %ebx,%edx
+ andl 4(%esp),%eax
+ addl %ecx,%ebx
+ xorl %edi,%eax
+ addl $4,%ebp
+ addl %ebx,%eax
+ cmpl $3248222580,%esi
+ jne L00400_15
+ movl 156(%esp),%ecx
+ jmp L00516_63
+.align 4,0x90
+L00516_63:
+ movl %ecx,%ebx
+ movl 104(%esp),%esi
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 160(%esp),%ebx
+ shrl $10,%edi
+ addl 124(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 24(%esp),%esi
+ rorl $14,%ecx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %edx,%ecx
+ xorl %edi,%esi
+ movl %ebx,96(%esp)
+ rorl $5,%ecx
+ andl %edx,%esi
+ movl %edx,20(%esp)
+ xorl %ecx,%edx
+ addl 32(%esp),%ebx
+ xorl %edi,%esi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %esi,%ebx
+ rorl $9,%ecx
+ addl %edx,%ebx
+ movl 8(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,4(%esp)
+ leal -4(%esp),%esp
+ rorl $11,%ecx
+ movl (%ebp),%esi
+ xorl %eax,%ecx
+ movl 20(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %esi,%ebx
+ movl %eax,(%esp)
+ addl %ebx,%edx
+ andl 4(%esp),%eax
+ addl %ecx,%ebx
+ xorl %edi,%eax
+ movl 156(%esp),%ecx
+ addl $4,%ebp
+ addl %ebx,%eax
+ cmpl $3329325298,%esi
+ jne L00516_63
+ movl 356(%esp),%esi
+ movl 8(%esp),%ebx
+ movl 16(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl 24(%esp),%eax
+ movl 28(%esp),%ebx
+ movl 32(%esp),%ecx
+ movl 360(%esp),%edi
+ addl 16(%esi),%edx
+ addl 20(%esi),%eax
+ addl 24(%esi),%ebx
+ addl 28(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %eax,20(%esi)
+ movl %ebx,24(%esi)
+ movl %ecx,28(%esi)
+ leal 356(%esp),%esp
+ subl $256,%ebp
+ cmpl 8(%esp),%edi
+ jb L003loop
+ movl 12(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 6,0x90
+LK256:
+.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
+.long 66051,67438087,134810123,202182159
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
+.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte 62,0
+.align 4,0x90
+L002unrolled:
+ leal -96(%esp),%esp
+ movl (%esi),%eax
+ movl 4(%esi),%ebp
+ movl 8(%esi),%ecx
+ movl 12(%esi),%ebx
+ movl %ebp,4(%esp)
+ xorl %ecx,%ebp
+ movl %ecx,8(%esp)
+ movl %ebx,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %ebx,20(%esp)
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ jmp L006grand_loop
+.align 4,0x90
+L006grand_loop:
+ movl (%edi),%ebx
+ movl 4(%edi),%ecx
+ bswap %ebx
+ movl 8(%edi),%esi
+ bswap %ecx
+ movl %ebx,32(%esp)
+ bswap %esi
+ movl %ecx,36(%esp)
+ movl %esi,40(%esp)
+ movl 12(%edi),%ebx
+ movl 16(%edi),%ecx
+ bswap %ebx
+ movl 20(%edi),%esi
+ bswap %ecx
+ movl %ebx,44(%esp)
+ bswap %esi
+ movl %ecx,48(%esp)
+ movl %esi,52(%esp)
+ movl 24(%edi),%ebx
+ movl 28(%edi),%ecx
+ bswap %ebx
+ movl 32(%edi),%esi
+ bswap %ecx
+ movl %ebx,56(%esp)
+ bswap %esi
+ movl %ecx,60(%esp)
+ movl %esi,64(%esp)
+ movl 36(%edi),%ebx
+ movl 40(%edi),%ecx
+ bswap %ebx
+ movl 44(%edi),%esi
+ bswap %ecx
+ movl %ebx,68(%esp)
+ bswap %esi
+ movl %ecx,72(%esp)
+ movl %esi,76(%esp)
+ movl 48(%edi),%ebx
+ movl 52(%edi),%ecx
+ bswap %ebx
+ movl 56(%edi),%esi
+ bswap %ecx
+ movl %ebx,80(%esp)
+ bswap %esi
+ movl %ecx,84(%esp)
+ movl %esi,88(%esp)
+ movl 60(%edi),%ebx
+ addl $64,%edi
+ bswap %ebx
+ movl %edi,100(%esp)
+ movl %ebx,92(%esp)
+ movl %edx,%ecx
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl 32(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1116352408(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl 36(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1899447441(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl 40(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3049323471(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl 44(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3921009573(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl 48(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 961987163(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl 52(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1508970993(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl 56(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2453635748(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl 60(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2870763221(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl 64(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3624381080(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl 68(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 310598401(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl 72(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 607225278(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl 76(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1426881987(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl 80(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1925078388(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl 84(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2162078206(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl 88(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2614888103(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl 92(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3248222580(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 36(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 88(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 32(%esp),%ebx
+ shrl $10,%edi
+ addl 68(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,32(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3835390401(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 40(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 92(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 36(%esp),%ebx
+ shrl $10,%edi
+ addl 72(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,36(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 4022224774(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 44(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 32(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 40(%esp),%ebx
+ shrl $10,%edi
+ addl 76(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,40(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 264347078(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 48(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 36(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 44(%esp),%ebx
+ shrl $10,%edi
+ addl 80(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,44(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 604807628(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 52(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 40(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 48(%esp),%ebx
+ shrl $10,%edi
+ addl 84(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,48(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 770255983(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 56(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 44(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 52(%esp),%ebx
+ shrl $10,%edi
+ addl 88(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,52(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1249150122(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 60(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 48(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 56(%esp),%ebx
+ shrl $10,%edi
+ addl 92(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,56(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1555081692(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 64(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 52(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 60(%esp),%ebx
+ shrl $10,%edi
+ addl 32(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,60(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1996064986(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 68(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 56(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 64(%esp),%ebx
+ shrl $10,%edi
+ addl 36(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,64(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2554220882(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 72(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 60(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 68(%esp),%ebx
+ shrl $10,%edi
+ addl 40(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,68(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2821834349(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 76(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 64(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 72(%esp),%ebx
+ shrl $10,%edi
+ addl 44(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,72(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2952996808(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 80(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 68(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 76(%esp),%ebx
+ shrl $10,%edi
+ addl 48(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,76(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3210313671(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 84(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 72(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 80(%esp),%ebx
+ shrl $10,%edi
+ addl 52(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,80(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3336571891(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 88(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 76(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 84(%esp),%ebx
+ shrl $10,%edi
+ addl 56(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,84(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3584528711(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 92(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 80(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 88(%esp),%ebx
+ shrl $10,%edi
+ addl 60(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,88(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 113926993(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 32(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 84(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 92(%esp),%ebx
+ shrl $10,%edi
+ addl 64(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,92(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 338241895(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 36(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 88(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 32(%esp),%ebx
+ shrl $10,%edi
+ addl 68(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,32(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 666307205(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 40(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 92(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 36(%esp),%ebx
+ shrl $10,%edi
+ addl 72(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,36(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 773529912(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 44(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 32(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 40(%esp),%ebx
+ shrl $10,%edi
+ addl 76(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,40(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1294757372(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 48(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 36(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 44(%esp),%ebx
+ shrl $10,%edi
+ addl 80(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,44(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1396182291(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 52(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 40(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 48(%esp),%ebx
+ shrl $10,%edi
+ addl 84(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,48(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1695183700(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 56(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 44(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 52(%esp),%ebx
+ shrl $10,%edi
+ addl 88(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,52(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1986661051(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 60(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 48(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 56(%esp),%ebx
+ shrl $10,%edi
+ addl 92(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,56(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2177026350(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 64(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 52(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 60(%esp),%ebx
+ shrl $10,%edi
+ addl 32(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,60(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2456956037(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 68(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 56(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 64(%esp),%ebx
+ shrl $10,%edi
+ addl 36(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,64(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2730485921(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 72(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 60(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 68(%esp),%ebx
+ shrl $10,%edi
+ addl 40(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,68(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2820302411(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 76(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 64(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 72(%esp),%ebx
+ shrl $10,%edi
+ addl 44(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,72(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3259730800(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 80(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 68(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 76(%esp),%ebx
+ shrl $10,%edi
+ addl 48(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,76(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3345764771(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 84(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 72(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 80(%esp),%ebx
+ shrl $10,%edi
+ addl 52(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,80(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3516065817(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 88(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 76(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 84(%esp),%ebx
+ shrl $10,%edi
+ addl 56(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,84(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3600352804(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 92(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 80(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 88(%esp),%ebx
+ shrl $10,%edi
+ addl 60(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,88(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 4094571909(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 32(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 84(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 92(%esp),%ebx
+ shrl $10,%edi
+ addl 64(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,92(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 275423344(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 36(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 88(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 32(%esp),%ebx
+ shrl $10,%edi
+ addl 68(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,32(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 430227734(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 40(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 92(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 36(%esp),%ebx
+ shrl $10,%edi
+ addl 72(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,36(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 506948616(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 44(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 32(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 40(%esp),%ebx
+ shrl $10,%edi
+ addl 76(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,40(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 659060556(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 48(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 36(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 44(%esp),%ebx
+ shrl $10,%edi
+ addl 80(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,44(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 883997877(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 52(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 40(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 48(%esp),%ebx
+ shrl $10,%edi
+ addl 84(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,48(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 958139571(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 56(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 44(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 52(%esp),%ebx
+ shrl $10,%edi
+ addl 88(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,52(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1322822218(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 60(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 48(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 56(%esp),%ebx
+ shrl $10,%edi
+ addl 92(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,56(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1537002063(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 64(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 52(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 60(%esp),%ebx
+ shrl $10,%edi
+ addl 32(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,60(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1747873779(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 68(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 56(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 64(%esp),%ebx
+ shrl $10,%edi
+ addl 36(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,64(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1955562222(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 72(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 60(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 68(%esp),%ebx
+ shrl $10,%edi
+ addl 40(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,68(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2024104815(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 76(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 64(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 72(%esp),%ebx
+ shrl $10,%edi
+ addl 44(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,72(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2227730452(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 80(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 68(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 76(%esp),%ebx
+ shrl $10,%edi
+ addl 48(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,76(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2361852424(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 84(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 72(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 80(%esp),%ebx
+ shrl $10,%edi
+ addl 52(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,80(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2428436474(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 88(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 76(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 84(%esp),%ebx
+ shrl $10,%edi
+ addl 56(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,84(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2756734187(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 92(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 80(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 88(%esp),%ebx
+ shrl $10,%edi
+ addl 60(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3204031479(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 32(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 84(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 92(%esp),%ebx
+ shrl $10,%edi
+ addl 64(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3329325298(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebp
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebp
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebp,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebx
+ movl 28(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ebx
+ addl 28(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %ebx,24(%esi)
+ movl %ecx,28(%esi)
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ebx,24(%esp)
+ movl %ecx,28(%esp)
+ cmpl 104(%esp),%edi
+ jb L006grand_loop
+ movl 108(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _sha256_block_data_order_ssse3
+.private_extern _sha256_block_data_order_ssse3
+.align 4
+_sha256_block_data_order_ssse3:
+L_sha256_block_data_order_ssse3_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl %esp,%ebx
+ call L007pic_point
+L007pic_point:
+ popl %ebp
+ leal LK256-L007pic_point(%ebp),%ebp
+ subl $16,%esp
+ andl $-64,%esp
+ shll $6,%eax
+ addl %edi,%eax
+ movl %esi,(%esp)
+ movl %edi,4(%esp)
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ leal -96(%esp),%esp
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%edi
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ movdqa 256(%ebp),%xmm7
+ jmp L008grand_ssse3
+.align 4,0x90
+L008grand_ssse3:
+ movdqu (%edi),%xmm0
+ movdqu 16(%edi),%xmm1
+ movdqu 32(%edi),%xmm2
+ movdqu 48(%edi),%xmm3
+ addl $64,%edi
+.byte 102,15,56,0,199
+ movl %edi,100(%esp)
+.byte 102,15,56,0,207
+ movdqa (%ebp),%xmm4
+.byte 102,15,56,0,215
+ movdqa 16(%ebp),%xmm5
+ paddd %xmm0,%xmm4
+.byte 102,15,56,0,223
+ movdqa 32(%ebp),%xmm6
+ paddd %xmm1,%xmm5
+ movdqa 48(%ebp),%xmm7
+ movdqa %xmm4,32(%esp)
+ paddd %xmm2,%xmm6
+ movdqa %xmm5,48(%esp)
+ paddd %xmm3,%xmm7
+ movdqa %xmm6,64(%esp)
+ movdqa %xmm7,80(%esp)
+ jmp L009ssse3_00_47
+.align 4,0x90
+L009ssse3_00_47:
+ addl $64,%ebp
+ movl %edx,%ecx
+ movdqa %xmm1,%xmm4
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ movdqa %xmm3,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+.byte 102,15,58,15,224,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,250,4
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm0
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm3,%xmm7
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm0
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm0
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ pshufd $80,%xmm0,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa (%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm0
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ paddd %xmm0,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,32(%esp)
+ movl %edx,%ecx
+ movdqa %xmm2,%xmm4
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ movdqa %xmm0,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+.byte 102,15,58,15,225,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,251,4
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm1
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm0,%xmm7
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm1
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm1
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ pshufd $80,%xmm1,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa 16(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm1
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ paddd %xmm1,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,48(%esp)
+ movl %edx,%ecx
+ movdqa %xmm3,%xmm4
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ movdqa %xmm1,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+.byte 102,15,58,15,226,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,248,4
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm2
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm1,%xmm7
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm2
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm2
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ pshufd $80,%xmm2,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa 32(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm2
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ paddd %xmm2,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,64(%esp)
+ movl %edx,%ecx
+ movdqa %xmm0,%xmm4
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ movdqa %xmm2,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+.byte 102,15,58,15,227,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,249,4
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm3
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm2,%xmm7
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm3
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm3
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ pshufd $80,%xmm3,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa 48(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm3
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ paddd %xmm3,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne L009ssse3_00_47
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebx
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp)
+ xorl %edi,%ebx
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi
+ movdqa 64(%ebp),%xmm7
+ subl $192,%ebp
+ cmpl 104(%esp),%edi
+ jb L008grand_ssse3
+ movl 108(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _sha256_block_data_order_avx
+.private_extern _sha256_block_data_order_avx
+.align 4
+_sha256_block_data_order_avx:
+L_sha256_block_data_order_avx_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl %esp,%ebx
+ call L010pic_point
+L010pic_point:
+ popl %ebp
+ leal LK256-L010pic_point(%ebp),%ebp
+ subl $16,%esp
+ andl $-64,%esp
+ shll $6,%eax
+ addl %edi,%eax
+ movl %esi,(%esp)
+ movl %edi,4(%esp)
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ leal -96(%esp),%esp
+ vzeroall
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%edi
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ vmovdqa 256(%ebp),%xmm7
+ jmp L011grand_avx
+.align 5,0x90
+L011grand_avx:
+ vmovdqu (%edi),%xmm0
+ vmovdqu 16(%edi),%xmm1
+ vmovdqu 32(%edi),%xmm2
+ vmovdqu 48(%edi),%xmm3
+ addl $64,%edi
+ vpshufb %xmm7,%xmm0,%xmm0
+ movl %edi,100(%esp)
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd (%ebp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 16(%ebp),%xmm1,%xmm5
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ vpaddd 48(%ebp),%xmm3,%xmm7
+ vmovdqa %xmm4,32(%esp)
+ vmovdqa %xmm5,48(%esp)
+ vmovdqa %xmm6,64(%esp)
+ vmovdqa %xmm7,80(%esp)
+ jmp L012avx_00_47
+.align 4,0x90
+L012avx_00_47:
+ addl $64,%ebp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ vpshufd $250,%xmm3,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ vpaddd %xmm4,%xmm0,%xmm0
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm0,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm0,%xmm0
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ vpaddd (%ebp),%xmm0,%xmm6
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,32(%esp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ vpshufd $250,%xmm0,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ vpaddd %xmm4,%xmm1,%xmm1
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm1,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm1,%xmm1
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ vpaddd 16(%ebp),%xmm1,%xmm6
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,48(%esp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ vpshufd $250,%xmm1,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ vpaddd %xmm4,%xmm2,%xmm2
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm2,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm2,%xmm2
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,64(%esp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ vpshufd $250,%xmm2,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ vpaddd %xmm4,%xmm3,%xmm3
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm3,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm3,%xmm3
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ vpaddd 48(%ebp),%xmm3,%xmm6
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne L012avx_00_47
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebx
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp)
+ xorl %edi,%ebx
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi
+ vmovdqa 64(%ebp),%xmm7
+ subl $192,%ebp
+ cmpl 104(%esp),%edi
+ jb L011grand_avx
+ movl 108(%esp),%esp
+ vzeroall
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/sha256-586-linux.S b/gen/bcm/sha256-586-linux.S
new file mode 100644
index 0000000..41b3759
--- /dev/null
+++ b/gen/bcm/sha256-586-linux.S
@@ -0,0 +1,5599 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl sha256_block_data_order_nohw
+.hidden sha256_block_data_order_nohw
+.type sha256_block_data_order_nohw,@function
+.align 16
+sha256_block_data_order_nohw:
+.L_sha256_block_data_order_nohw_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl %esp,%ebx
+ call .L000pic_point
+.L000pic_point:
+ popl %ebp
+ leal .LK256-.L000pic_point(%ebp),%ebp
+ subl $16,%esp
+ andl $-64,%esp
+ shll $6,%eax
+ addl %edi,%eax
+ movl %esi,(%esp)
+ movl %edi,4(%esp)
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+.L001no_xmm:
+ subl %edi,%eax
+ cmpl $256,%eax
+ jae .L002unrolled
+ jmp .L003loop
+.align 16
+.L003loop:
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ bswap %eax
+ movl 12(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ movl 16(%edi),%eax
+ movl 20(%edi),%ebx
+ movl 24(%edi),%ecx
+ bswap %eax
+ movl 28(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ movl 32(%edi),%eax
+ movl 36(%edi),%ebx
+ movl 40(%edi),%ecx
+ bswap %eax
+ movl 44(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ movl 48(%edi),%eax
+ movl 52(%edi),%ebx
+ movl 56(%edi),%ecx
+ bswap %eax
+ movl 60(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ addl $64,%edi
+ leal -36(%esp),%esp
+ movl %edi,104(%esp)
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,8(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,12(%esp)
+ movl %edi,16(%esp)
+ movl %ebx,(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edi
+ movl %ebx,24(%esp)
+ movl %ecx,28(%esp)
+ movl %edi,32(%esp)
+.align 16
+.L00400_15:
+ movl %edx,%ecx
+ movl 24(%esp),%esi
+ rorl $14,%ecx
+ movl 28(%esp),%edi
+ xorl %edx,%ecx
+ xorl %edi,%esi
+ movl 96(%esp),%ebx
+ rorl $5,%ecx
+ andl %edx,%esi
+ movl %edx,20(%esp)
+ xorl %ecx,%edx
+ addl 32(%esp),%ebx
+ xorl %edi,%esi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %esi,%ebx
+ rorl $9,%ecx
+ addl %edx,%ebx
+ movl 8(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,4(%esp)
+ leal -4(%esp),%esp
+ rorl $11,%ecx
+ movl (%ebp),%esi
+ xorl %eax,%ecx
+ movl 20(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %esi,%ebx
+ movl %eax,(%esp)
+ addl %ebx,%edx
+ andl 4(%esp),%eax
+ addl %ecx,%ebx
+ xorl %edi,%eax
+ addl $4,%ebp
+ addl %ebx,%eax
+ cmpl $3248222580,%esi
+ jne .L00400_15
+ movl 156(%esp),%ecx
+ jmp .L00516_63
+.align 16
+.L00516_63:
+ movl %ecx,%ebx
+ movl 104(%esp),%esi
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 160(%esp),%ebx
+ shrl $10,%edi
+ addl 124(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 24(%esp),%esi
+ rorl $14,%ecx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %edx,%ecx
+ xorl %edi,%esi
+ movl %ebx,96(%esp)
+ rorl $5,%ecx
+ andl %edx,%esi
+ movl %edx,20(%esp)
+ xorl %ecx,%edx
+ addl 32(%esp),%ebx
+ xorl %edi,%esi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %esi,%ebx
+ rorl $9,%ecx
+ addl %edx,%ebx
+ movl 8(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,4(%esp)
+ leal -4(%esp),%esp
+ rorl $11,%ecx
+ movl (%ebp),%esi
+ xorl %eax,%ecx
+ movl 20(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %esi,%ebx
+ movl %eax,(%esp)
+ addl %ebx,%edx
+ andl 4(%esp),%eax
+ addl %ecx,%ebx
+ xorl %edi,%eax
+ movl 156(%esp),%ecx
+ addl $4,%ebp
+ addl %ebx,%eax
+ cmpl $3329325298,%esi
+ jne .L00516_63
+ movl 356(%esp),%esi
+ movl 8(%esp),%ebx
+ movl 16(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl 24(%esp),%eax
+ movl 28(%esp),%ebx
+ movl 32(%esp),%ecx
+ movl 360(%esp),%edi
+ addl 16(%esi),%edx
+ addl 20(%esi),%eax
+ addl 24(%esi),%ebx
+ addl 28(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %eax,20(%esi)
+ movl %ebx,24(%esi)
+ movl %ecx,28(%esi)
+ leal 356(%esp),%esp
+ subl $256,%ebp
+ cmpl 8(%esp),%edi
+ jb .L003loop
+ movl 12(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 64
+.LK256:
+.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
+.long 66051,67438087,134810123,202182159
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
+.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte 62,0
+.align 16
+.L002unrolled:
+ leal -96(%esp),%esp
+ movl (%esi),%eax
+ movl 4(%esi),%ebp
+ movl 8(%esi),%ecx
+ movl 12(%esi),%ebx
+ movl %ebp,4(%esp)
+ xorl %ecx,%ebp
+ movl %ecx,8(%esp)
+ movl %ebx,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %ebx,20(%esp)
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ jmp .L006grand_loop
+.align 16
+.L006grand_loop:
+ movl (%edi),%ebx
+ movl 4(%edi),%ecx
+ bswap %ebx
+ movl 8(%edi),%esi
+ bswap %ecx
+ movl %ebx,32(%esp)
+ bswap %esi
+ movl %ecx,36(%esp)
+ movl %esi,40(%esp)
+ movl 12(%edi),%ebx
+ movl 16(%edi),%ecx
+ bswap %ebx
+ movl 20(%edi),%esi
+ bswap %ecx
+ movl %ebx,44(%esp)
+ bswap %esi
+ movl %ecx,48(%esp)
+ movl %esi,52(%esp)
+ movl 24(%edi),%ebx
+ movl 28(%edi),%ecx
+ bswap %ebx
+ movl 32(%edi),%esi
+ bswap %ecx
+ movl %ebx,56(%esp)
+ bswap %esi
+ movl %ecx,60(%esp)
+ movl %esi,64(%esp)
+ movl 36(%edi),%ebx
+ movl 40(%edi),%ecx
+ bswap %ebx
+ movl 44(%edi),%esi
+ bswap %ecx
+ movl %ebx,68(%esp)
+ bswap %esi
+ movl %ecx,72(%esp)
+ movl %esi,76(%esp)
+ movl 48(%edi),%ebx
+ movl 52(%edi),%ecx
+ bswap %ebx
+ movl 56(%edi),%esi
+ bswap %ecx
+ movl %ebx,80(%esp)
+ bswap %esi
+ movl %ecx,84(%esp)
+ movl %esi,88(%esp)
+ movl 60(%edi),%ebx
+ addl $64,%edi
+ bswap %ebx
+ movl %edi,100(%esp)
+ movl %ebx,92(%esp)
+ movl %edx,%ecx
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl 32(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1116352408(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl 36(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1899447441(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl 40(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3049323471(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl 44(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3921009573(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl 48(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 961987163(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl 52(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1508970993(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl 56(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2453635748(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl 60(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2870763221(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl 64(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3624381080(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl 68(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 310598401(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl 72(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 607225278(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl 76(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1426881987(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl 80(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1925078388(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl 84(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2162078206(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl 88(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2614888103(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl 92(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3248222580(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 36(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 88(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 32(%esp),%ebx
+ shrl $10,%edi
+ addl 68(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,32(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3835390401(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 40(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 92(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 36(%esp),%ebx
+ shrl $10,%edi
+ addl 72(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,36(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 4022224774(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 44(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 32(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 40(%esp),%ebx
+ shrl $10,%edi
+ addl 76(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,40(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 264347078(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 48(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 36(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 44(%esp),%ebx
+ shrl $10,%edi
+ addl 80(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,44(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 604807628(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 52(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 40(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 48(%esp),%ebx
+ shrl $10,%edi
+ addl 84(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,48(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 770255983(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 56(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 44(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 52(%esp),%ebx
+ shrl $10,%edi
+ addl 88(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,52(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1249150122(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 60(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 48(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 56(%esp),%ebx
+ shrl $10,%edi
+ addl 92(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,56(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1555081692(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 64(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 52(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 60(%esp),%ebx
+ shrl $10,%edi
+ addl 32(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,60(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1996064986(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 68(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 56(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 64(%esp),%ebx
+ shrl $10,%edi
+ addl 36(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,64(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2554220882(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 72(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 60(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 68(%esp),%ebx
+ shrl $10,%edi
+ addl 40(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,68(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2821834349(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 76(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 64(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 72(%esp),%ebx
+ shrl $10,%edi
+ addl 44(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,72(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2952996808(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 80(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 68(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 76(%esp),%ebx
+ shrl $10,%edi
+ addl 48(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,76(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3210313671(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 84(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 72(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 80(%esp),%ebx
+ shrl $10,%edi
+ addl 52(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,80(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3336571891(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 88(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 76(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 84(%esp),%ebx
+ shrl $10,%edi
+ addl 56(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,84(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3584528711(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 92(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 80(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 88(%esp),%ebx
+ shrl $10,%edi
+ addl 60(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,88(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 113926993(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 32(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 84(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 92(%esp),%ebx
+ shrl $10,%edi
+ addl 64(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,92(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 338241895(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 36(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 88(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 32(%esp),%ebx
+ shrl $10,%edi
+ addl 68(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,32(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 666307205(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 40(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 92(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 36(%esp),%ebx
+ shrl $10,%edi
+ addl 72(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,36(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 773529912(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 44(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 32(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 40(%esp),%ebx
+ shrl $10,%edi
+ addl 76(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,40(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1294757372(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 48(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 36(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 44(%esp),%ebx
+ shrl $10,%edi
+ addl 80(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,44(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1396182291(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 52(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 40(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 48(%esp),%ebx
+ shrl $10,%edi
+ addl 84(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,48(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1695183700(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 56(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 44(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 52(%esp),%ebx
+ shrl $10,%edi
+ addl 88(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,52(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1986661051(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 60(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 48(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 56(%esp),%ebx
+ shrl $10,%edi
+ addl 92(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,56(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2177026350(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 64(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 52(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 60(%esp),%ebx
+ shrl $10,%edi
+ addl 32(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,60(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2456956037(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 68(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 56(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 64(%esp),%ebx
+ shrl $10,%edi
+ addl 36(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,64(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2730485921(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 72(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 60(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 68(%esp),%ebx
+ shrl $10,%edi
+ addl 40(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,68(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2820302411(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 76(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 64(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 72(%esp),%ebx
+ shrl $10,%edi
+ addl 44(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,72(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3259730800(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 80(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 68(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 76(%esp),%ebx
+ shrl $10,%edi
+ addl 48(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,76(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3345764771(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 84(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 72(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 80(%esp),%ebx
+ shrl $10,%edi
+ addl 52(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,80(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3516065817(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 88(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 76(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 84(%esp),%ebx
+ shrl $10,%edi
+ addl 56(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,84(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3600352804(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 92(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 80(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 88(%esp),%ebx
+ shrl $10,%edi
+ addl 60(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,88(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 4094571909(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 32(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 84(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 92(%esp),%ebx
+ shrl $10,%edi
+ addl 64(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,92(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 275423344(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 36(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 88(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 32(%esp),%ebx
+ shrl $10,%edi
+ addl 68(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,32(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 430227734(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 40(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 92(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 36(%esp),%ebx
+ shrl $10,%edi
+ addl 72(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,36(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 506948616(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 44(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 32(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 40(%esp),%ebx
+ shrl $10,%edi
+ addl 76(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,40(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 659060556(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 48(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 36(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 44(%esp),%ebx
+ shrl $10,%edi
+ addl 80(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,44(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 883997877(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 52(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 40(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 48(%esp),%ebx
+ shrl $10,%edi
+ addl 84(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,48(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 958139571(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 56(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 44(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 52(%esp),%ebx
+ shrl $10,%edi
+ addl 88(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,52(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1322822218(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 60(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 48(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 56(%esp),%ebx
+ shrl $10,%edi
+ addl 92(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,56(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1537002063(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 64(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 52(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 60(%esp),%ebx
+ shrl $10,%edi
+ addl 32(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,60(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1747873779(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 68(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 56(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 64(%esp),%ebx
+ shrl $10,%edi
+ addl 36(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,64(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1955562222(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 72(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 60(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 68(%esp),%ebx
+ shrl $10,%edi
+ addl 40(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,68(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2024104815(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 76(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 64(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 72(%esp),%ebx
+ shrl $10,%edi
+ addl 44(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,72(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2227730452(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 80(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 68(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 76(%esp),%ebx
+ shrl $10,%edi
+ addl 48(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,76(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2361852424(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 84(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 72(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 80(%esp),%ebx
+ shrl $10,%edi
+ addl 52(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,80(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2428436474(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 88(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 76(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 84(%esp),%ebx
+ shrl $10,%edi
+ addl 56(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,84(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2756734187(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 92(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 80(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 88(%esp),%ebx
+ shrl $10,%edi
+ addl 60(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3204031479(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 32(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 84(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 92(%esp),%ebx
+ shrl $10,%edi
+ addl 64(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3329325298(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebp
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebp
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebp,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebx
+ movl 28(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ebx
+ addl 28(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %ebx,24(%esi)
+ movl %ecx,28(%esi)
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ebx,24(%esp)
+ movl %ecx,28(%esp)
+ cmpl 104(%esp),%edi
+ jb .L006grand_loop
+ movl 108(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size sha256_block_data_order_nohw,.-.L_sha256_block_data_order_nohw_begin
+.globl sha256_block_data_order_ssse3
+.hidden sha256_block_data_order_ssse3
+.type sha256_block_data_order_ssse3,@function
+.align 16
+sha256_block_data_order_ssse3:
+.L_sha256_block_data_order_ssse3_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl %esp,%ebx
+ call .L007pic_point
+.L007pic_point:
+ popl %ebp
+ leal .LK256-.L007pic_point(%ebp),%ebp
+ subl $16,%esp
+ andl $-64,%esp
+ shll $6,%eax
+ addl %edi,%eax
+ movl %esi,(%esp)
+ movl %edi,4(%esp)
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ leal -96(%esp),%esp
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%edi
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ movdqa 256(%ebp),%xmm7
+ jmp .L008grand_ssse3
+.align 16
+.L008grand_ssse3:
+ movdqu (%edi),%xmm0
+ movdqu 16(%edi),%xmm1
+ movdqu 32(%edi),%xmm2
+ movdqu 48(%edi),%xmm3
+ addl $64,%edi
+.byte 102,15,56,0,199
+ movl %edi,100(%esp)
+.byte 102,15,56,0,207
+ movdqa (%ebp),%xmm4
+.byte 102,15,56,0,215
+ movdqa 16(%ebp),%xmm5
+ paddd %xmm0,%xmm4
+.byte 102,15,56,0,223
+ movdqa 32(%ebp),%xmm6
+ paddd %xmm1,%xmm5
+ movdqa 48(%ebp),%xmm7
+ movdqa %xmm4,32(%esp)
+ paddd %xmm2,%xmm6
+ movdqa %xmm5,48(%esp)
+ paddd %xmm3,%xmm7
+ movdqa %xmm6,64(%esp)
+ movdqa %xmm7,80(%esp)
+ jmp .L009ssse3_00_47
+.align 16
+.L009ssse3_00_47:
+ addl $64,%ebp
+ movl %edx,%ecx
+ movdqa %xmm1,%xmm4
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ movdqa %xmm3,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+.byte 102,15,58,15,224,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,250,4
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm0
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm3,%xmm7
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm0
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm0
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ pshufd $80,%xmm0,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa (%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm0
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ paddd %xmm0,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,32(%esp)
+ movl %edx,%ecx
+ movdqa %xmm2,%xmm4
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ movdqa %xmm0,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+.byte 102,15,58,15,225,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,251,4
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm1
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm0,%xmm7
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm1
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm1
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ pshufd $80,%xmm1,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa 16(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm1
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ paddd %xmm1,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,48(%esp)
+ movl %edx,%ecx
+ movdqa %xmm3,%xmm4
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ movdqa %xmm1,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+.byte 102,15,58,15,226,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,248,4
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm2
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm1,%xmm7
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm2
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm2
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ pshufd $80,%xmm2,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa 32(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm2
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ paddd %xmm2,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,64(%esp)
+ movl %edx,%ecx
+ movdqa %xmm0,%xmm4
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ movdqa %xmm2,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+.byte 102,15,58,15,227,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,249,4
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm3
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm2,%xmm7
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm3
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm3
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ pshufd $80,%xmm3,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa 48(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm3
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ paddd %xmm3,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne .L009ssse3_00_47
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebx
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp)
+ xorl %edi,%ebx
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi
+ movdqa 64(%ebp),%xmm7
+ subl $192,%ebp
+ cmpl 104(%esp),%edi
+ jb .L008grand_ssse3
+ movl 108(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size sha256_block_data_order_ssse3,.-.L_sha256_block_data_order_ssse3_begin
+.globl sha256_block_data_order_avx
+.hidden sha256_block_data_order_avx
+.type sha256_block_data_order_avx,@function
+.align 16
+sha256_block_data_order_avx:
+.L_sha256_block_data_order_avx_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl %esp,%ebx
+ call .L010pic_point
+.L010pic_point:
+ popl %ebp
+ leal .LK256-.L010pic_point(%ebp),%ebp
+ subl $16,%esp
+ andl $-64,%esp
+ shll $6,%eax
+ addl %edi,%eax
+ movl %esi,(%esp)
+ movl %edi,4(%esp)
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ leal -96(%esp),%esp
+ vzeroall
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%edi
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ vmovdqa 256(%ebp),%xmm7
+ jmp .L011grand_avx
+.align 32
+.L011grand_avx:
+ vmovdqu (%edi),%xmm0
+ vmovdqu 16(%edi),%xmm1
+ vmovdqu 32(%edi),%xmm2
+ vmovdqu 48(%edi),%xmm3
+ addl $64,%edi
+ vpshufb %xmm7,%xmm0,%xmm0
+ movl %edi,100(%esp)
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd (%ebp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 16(%ebp),%xmm1,%xmm5
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ vpaddd 48(%ebp),%xmm3,%xmm7
+ vmovdqa %xmm4,32(%esp)
+ vmovdqa %xmm5,48(%esp)
+ vmovdqa %xmm6,64(%esp)
+ vmovdqa %xmm7,80(%esp)
+ jmp .L012avx_00_47
+.align 16
+.L012avx_00_47:
+ addl $64,%ebp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ vpshufd $250,%xmm3,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ vpaddd %xmm4,%xmm0,%xmm0
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm0,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm0,%xmm0
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ vpaddd (%ebp),%xmm0,%xmm6
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,32(%esp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ vpshufd $250,%xmm0,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ vpaddd %xmm4,%xmm1,%xmm1
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm1,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm1,%xmm1
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ vpaddd 16(%ebp),%xmm1,%xmm6
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,48(%esp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ vpshufd $250,%xmm1,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ vpaddd %xmm4,%xmm2,%xmm2
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm2,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm2,%xmm2
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,64(%esp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ vpshufd $250,%xmm2,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ vpaddd %xmm4,%xmm3,%xmm3
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm3,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm3,%xmm3
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ vpaddd 48(%ebp),%xmm3,%xmm6
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne .L012avx_00_47
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebx
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp)
+ xorl %edi,%ebx
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi
+ vmovdqa 64(%ebp),%xmm7
+ subl $192,%ebp
+ cmpl 104(%esp),%edi
+ jb .L011grand_avx
+ movl 108(%esp),%esp
+ vzeroall
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size sha256_block_data_order_avx,.-.L_sha256_block_data_order_avx_begin
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/sha256-586-win.asm b/gen/bcm/sha256-586-win.asm
new file mode 100644
index 0000000..0ef244d
--- /dev/null
+++ b/gen/bcm/sha256-586-win.asm
@@ -0,0 +1,5601 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section code use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section .text code align=64
+%else
+section .text code
+%endif
+global _sha256_block_data_order_nohw
+align 16
+_sha256_block_data_order_nohw:
+L$_sha256_block_data_order_nohw_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov esi,DWORD [20+esp]
+ mov edi,DWORD [24+esp]
+ mov eax,DWORD [28+esp]
+ mov ebx,esp
+ call L$000pic_point
+L$000pic_point:
+ pop ebp
+ lea ebp,[(L$K256-L$000pic_point)+ebp]
+ sub esp,16
+ and esp,-64
+ shl eax,6
+ add eax,edi
+ mov DWORD [esp],esi
+ mov DWORD [4+esp],edi
+ mov DWORD [8+esp],eax
+ mov DWORD [12+esp],ebx
+L$001no_xmm:
+ sub eax,edi
+ cmp eax,256
+ jae NEAR L$002unrolled
+ jmp NEAR L$003loop
+align 16
+L$003loop:
+ mov eax,DWORD [edi]
+ mov ebx,DWORD [4+edi]
+ mov ecx,DWORD [8+edi]
+ bswap eax
+ mov edx,DWORD [12+edi]
+ bswap ebx
+ push eax
+ bswap ecx
+ push ebx
+ bswap edx
+ push ecx
+ push edx
+ mov eax,DWORD [16+edi]
+ mov ebx,DWORD [20+edi]
+ mov ecx,DWORD [24+edi]
+ bswap eax
+ mov edx,DWORD [28+edi]
+ bswap ebx
+ push eax
+ bswap ecx
+ push ebx
+ bswap edx
+ push ecx
+ push edx
+ mov eax,DWORD [32+edi]
+ mov ebx,DWORD [36+edi]
+ mov ecx,DWORD [40+edi]
+ bswap eax
+ mov edx,DWORD [44+edi]
+ bswap ebx
+ push eax
+ bswap ecx
+ push ebx
+ bswap edx
+ push ecx
+ push edx
+ mov eax,DWORD [48+edi]
+ mov ebx,DWORD [52+edi]
+ mov ecx,DWORD [56+edi]
+ bswap eax
+ mov edx,DWORD [60+edi]
+ bswap ebx
+ push eax
+ bswap ecx
+ push ebx
+ bswap edx
+ push ecx
+ push edx
+ add edi,64
+ lea esp,[esp-36]
+ mov DWORD [104+esp],edi
+ mov eax,DWORD [esi]
+ mov ebx,DWORD [4+esi]
+ mov ecx,DWORD [8+esi]
+ mov edi,DWORD [12+esi]
+ mov DWORD [8+esp],ebx
+ xor ebx,ecx
+ mov DWORD [12+esp],ecx
+ mov DWORD [16+esp],edi
+ mov DWORD [esp],ebx
+ mov edx,DWORD [16+esi]
+ mov ebx,DWORD [20+esi]
+ mov ecx,DWORD [24+esi]
+ mov edi,DWORD [28+esi]
+ mov DWORD [24+esp],ebx
+ mov DWORD [28+esp],ecx
+ mov DWORD [32+esp],edi
+align 16
+L$00400_15:
+ mov ecx,edx
+ mov esi,DWORD [24+esp]
+ ror ecx,14
+ mov edi,DWORD [28+esp]
+ xor ecx,edx
+ xor esi,edi
+ mov ebx,DWORD [96+esp]
+ ror ecx,5
+ and esi,edx
+ mov DWORD [20+esp],edx
+ xor edx,ecx
+ add ebx,DWORD [32+esp]
+ xor esi,edi
+ ror edx,6
+ mov ecx,eax
+ add ebx,esi
+ ror ecx,9
+ add ebx,edx
+ mov edi,DWORD [8+esp]
+ xor ecx,eax
+ mov DWORD [4+esp],eax
+ lea esp,[esp-4]
+ ror ecx,11
+ mov esi,DWORD [ebp]
+ xor ecx,eax
+ mov edx,DWORD [20+esp]
+ xor eax,edi
+ ror ecx,2
+ add ebx,esi
+ mov DWORD [esp],eax
+ add edx,ebx
+ and eax,DWORD [4+esp]
+ add ebx,ecx
+ xor eax,edi
+ add ebp,4
+ add eax,ebx
+ cmp esi,3248222580
+ jne NEAR L$00400_15
+ mov ecx,DWORD [156+esp]
+ jmp NEAR L$00516_63
+align 16
+L$00516_63:
+ mov ebx,ecx
+ mov esi,DWORD [104+esp]
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [160+esp]
+ shr edi,10
+ add ebx,DWORD [124+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [24+esp]
+ ror ecx,14
+ add ebx,edi
+ mov edi,DWORD [28+esp]
+ xor ecx,edx
+ xor esi,edi
+ mov DWORD [96+esp],ebx
+ ror ecx,5
+ and esi,edx
+ mov DWORD [20+esp],edx
+ xor edx,ecx
+ add ebx,DWORD [32+esp]
+ xor esi,edi
+ ror edx,6
+ mov ecx,eax
+ add ebx,esi
+ ror ecx,9
+ add ebx,edx
+ mov edi,DWORD [8+esp]
+ xor ecx,eax
+ mov DWORD [4+esp],eax
+ lea esp,[esp-4]
+ ror ecx,11
+ mov esi,DWORD [ebp]
+ xor ecx,eax
+ mov edx,DWORD [20+esp]
+ xor eax,edi
+ ror ecx,2
+ add ebx,esi
+ mov DWORD [esp],eax
+ add edx,ebx
+ and eax,DWORD [4+esp]
+ add ebx,ecx
+ xor eax,edi
+ mov ecx,DWORD [156+esp]
+ add ebp,4
+ add eax,ebx
+ cmp esi,3329325298
+ jne NEAR L$00516_63
+ mov esi,DWORD [356+esp]
+ mov ebx,DWORD [8+esp]
+ mov ecx,DWORD [16+esp]
+ add eax,DWORD [esi]
+ add ebx,DWORD [4+esi]
+ add edi,DWORD [8+esi]
+ add ecx,DWORD [12+esi]
+ mov DWORD [esi],eax
+ mov DWORD [4+esi],ebx
+ mov DWORD [8+esi],edi
+ mov DWORD [12+esi],ecx
+ mov eax,DWORD [24+esp]
+ mov ebx,DWORD [28+esp]
+ mov ecx,DWORD [32+esp]
+ mov edi,DWORD [360+esp]
+ add edx,DWORD [16+esi]
+ add eax,DWORD [20+esi]
+ add ebx,DWORD [24+esi]
+ add ecx,DWORD [28+esi]
+ mov DWORD [16+esi],edx
+ mov DWORD [20+esi],eax
+ mov DWORD [24+esi],ebx
+ mov DWORD [28+esi],ecx
+ lea esp,[356+esp]
+ sub ebp,256
+ cmp edi,DWORD [8+esp]
+ jb NEAR L$003loop
+ mov esp,DWORD [12+esp]
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+align 64
+L$K256:
+dd 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
+dd 66051,67438087,134810123,202182159
+db 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
+db 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+db 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+db 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+db 62,0
+align 16
+L$002unrolled:
+ lea esp,[esp-96]
+ mov eax,DWORD [esi]
+ mov ebp,DWORD [4+esi]
+ mov ecx,DWORD [8+esi]
+ mov ebx,DWORD [12+esi]
+ mov DWORD [4+esp],ebp
+ xor ebp,ecx
+ mov DWORD [8+esp],ecx
+ mov DWORD [12+esp],ebx
+ mov edx,DWORD [16+esi]
+ mov ebx,DWORD [20+esi]
+ mov ecx,DWORD [24+esi]
+ mov esi,DWORD [28+esi]
+ mov DWORD [20+esp],ebx
+ mov DWORD [24+esp],ecx
+ mov DWORD [28+esp],esi
+ jmp NEAR L$006grand_loop
+align 16
+L$006grand_loop:
+ mov ebx,DWORD [edi]
+ mov ecx,DWORD [4+edi]
+ bswap ebx
+ mov esi,DWORD [8+edi]
+ bswap ecx
+ mov DWORD [32+esp],ebx
+ bswap esi
+ mov DWORD [36+esp],ecx
+ mov DWORD [40+esp],esi
+ mov ebx,DWORD [12+edi]
+ mov ecx,DWORD [16+edi]
+ bswap ebx
+ mov esi,DWORD [20+edi]
+ bswap ecx
+ mov DWORD [44+esp],ebx
+ bswap esi
+ mov DWORD [48+esp],ecx
+ mov DWORD [52+esp],esi
+ mov ebx,DWORD [24+edi]
+ mov ecx,DWORD [28+edi]
+ bswap ebx
+ mov esi,DWORD [32+edi]
+ bswap ecx
+ mov DWORD [56+esp],ebx
+ bswap esi
+ mov DWORD [60+esp],ecx
+ mov DWORD [64+esp],esi
+ mov ebx,DWORD [36+edi]
+ mov ecx,DWORD [40+edi]
+ bswap ebx
+ mov esi,DWORD [44+edi]
+ bswap ecx
+ mov DWORD [68+esp],ebx
+ bswap esi
+ mov DWORD [72+esp],ecx
+ mov DWORD [76+esp],esi
+ mov ebx,DWORD [48+edi]
+ mov ecx,DWORD [52+edi]
+ bswap ebx
+ mov esi,DWORD [56+edi]
+ bswap ecx
+ mov DWORD [80+esp],ebx
+ bswap esi
+ mov DWORD [84+esp],ecx
+ mov DWORD [88+esp],esi
+ mov ebx,DWORD [60+edi]
+ add edi,64
+ bswap ebx
+ mov DWORD [100+esp],edi
+ mov DWORD [92+esp],ebx
+ mov ecx,edx
+ mov esi,DWORD [20+esp]
+ ror edx,14
+ mov edi,DWORD [24+esp]
+ xor edx,ecx
+ mov ebx,DWORD [32+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [16+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [28+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [4+esp]
+ xor ecx,eax
+ mov DWORD [esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[1116352408+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [12+esp]
+ add ebp,ecx
+ mov esi,edx
+ mov ecx,DWORD [16+esp]
+ ror edx,14
+ mov edi,DWORD [20+esp]
+ xor edx,esi
+ mov ebx,DWORD [36+esp]
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [12+esp],esi
+ xor edx,esi
+ add ebx,DWORD [24+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [esp]
+ xor esi,ebp
+ mov DWORD [28+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[1899447441+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [8+esp]
+ add eax,esi
+ mov ecx,edx
+ mov esi,DWORD [12+esp]
+ ror edx,14
+ mov edi,DWORD [16+esp]
+ xor edx,ecx
+ mov ebx,DWORD [40+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [8+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [20+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [28+esp]
+ xor ecx,eax
+ mov DWORD [24+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[3049323471+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [4+esp]
+ add ebp,ecx
+ mov esi,edx
+ mov ecx,DWORD [8+esp]
+ ror edx,14
+ mov edi,DWORD [12+esp]
+ xor edx,esi
+ mov ebx,DWORD [44+esp]
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [4+esp],esi
+ xor edx,esi
+ add ebx,DWORD [16+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [24+esp]
+ xor esi,ebp
+ mov DWORD [20+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[3921009573+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [esp]
+ add eax,esi
+ mov ecx,edx
+ mov esi,DWORD [4+esp]
+ ror edx,14
+ mov edi,DWORD [8+esp]
+ xor edx,ecx
+ mov ebx,DWORD [48+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [12+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [20+esp]
+ xor ecx,eax
+ mov DWORD [16+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[961987163+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [28+esp]
+ add ebp,ecx
+ mov esi,edx
+ mov ecx,DWORD [esp]
+ ror edx,14
+ mov edi,DWORD [4+esp]
+ xor edx,esi
+ mov ebx,DWORD [52+esp]
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [28+esp],esi
+ xor edx,esi
+ add ebx,DWORD [8+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [16+esp]
+ xor esi,ebp
+ mov DWORD [12+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[1508970993+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [24+esp]
+ add eax,esi
+ mov ecx,edx
+ mov esi,DWORD [28+esp]
+ ror edx,14
+ mov edi,DWORD [esp]
+ xor edx,ecx
+ mov ebx,DWORD [56+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [24+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [4+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [12+esp]
+ xor ecx,eax
+ mov DWORD [8+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[2453635748+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [20+esp]
+ add ebp,ecx
+ mov esi,edx
+ mov ecx,DWORD [24+esp]
+ ror edx,14
+ mov edi,DWORD [28+esp]
+ xor edx,esi
+ mov ebx,DWORD [60+esp]
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [20+esp],esi
+ xor edx,esi
+ add ebx,DWORD [esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [8+esp]
+ xor esi,ebp
+ mov DWORD [4+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[2870763221+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [16+esp]
+ add eax,esi
+ mov ecx,edx
+ mov esi,DWORD [20+esp]
+ ror edx,14
+ mov edi,DWORD [24+esp]
+ xor edx,ecx
+ mov ebx,DWORD [64+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [16+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [28+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [4+esp]
+ xor ecx,eax
+ mov DWORD [esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[3624381080+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [12+esp]
+ add ebp,ecx
+ mov esi,edx
+ mov ecx,DWORD [16+esp]
+ ror edx,14
+ mov edi,DWORD [20+esp]
+ xor edx,esi
+ mov ebx,DWORD [68+esp]
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [12+esp],esi
+ xor edx,esi
+ add ebx,DWORD [24+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [esp]
+ xor esi,ebp
+ mov DWORD [28+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[310598401+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [8+esp]
+ add eax,esi
+ mov ecx,edx
+ mov esi,DWORD [12+esp]
+ ror edx,14
+ mov edi,DWORD [16+esp]
+ xor edx,ecx
+ mov ebx,DWORD [72+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [8+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [20+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [28+esp]
+ xor ecx,eax
+ mov DWORD [24+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[607225278+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [4+esp]
+ add ebp,ecx
+ mov esi,edx
+ mov ecx,DWORD [8+esp]
+ ror edx,14
+ mov edi,DWORD [12+esp]
+ xor edx,esi
+ mov ebx,DWORD [76+esp]
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [4+esp],esi
+ xor edx,esi
+ add ebx,DWORD [16+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [24+esp]
+ xor esi,ebp
+ mov DWORD [20+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[1426881987+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [esp]
+ add eax,esi
+ mov ecx,edx
+ mov esi,DWORD [4+esp]
+ ror edx,14
+ mov edi,DWORD [8+esp]
+ xor edx,ecx
+ mov ebx,DWORD [80+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [12+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [20+esp]
+ xor ecx,eax
+ mov DWORD [16+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[1925078388+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [28+esp]
+ add ebp,ecx
+ mov esi,edx
+ mov ecx,DWORD [esp]
+ ror edx,14
+ mov edi,DWORD [4+esp]
+ xor edx,esi
+ mov ebx,DWORD [84+esp]
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [28+esp],esi
+ xor edx,esi
+ add ebx,DWORD [8+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [16+esp]
+ xor esi,ebp
+ mov DWORD [12+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[2162078206+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [24+esp]
+ add eax,esi
+ mov ecx,edx
+ mov esi,DWORD [28+esp]
+ ror edx,14
+ mov edi,DWORD [esp]
+ xor edx,ecx
+ mov ebx,DWORD [88+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [24+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [4+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [12+esp]
+ xor ecx,eax
+ mov DWORD [8+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[2614888103+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [20+esp]
+ add ebp,ecx
+ mov esi,edx
+ mov ecx,DWORD [24+esp]
+ ror edx,14
+ mov edi,DWORD [28+esp]
+ xor edx,esi
+ mov ebx,DWORD [92+esp]
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [20+esp],esi
+ xor edx,esi
+ add ebx,DWORD [esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [8+esp]
+ xor esi,ebp
+ mov DWORD [4+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[3248222580+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [36+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [16+esp]
+ add eax,esi
+ mov esi,DWORD [88+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [32+esp]
+ shr edi,10
+ add ebx,DWORD [68+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [20+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [24+esp]
+ xor edx,ecx
+ mov DWORD [32+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [16+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [28+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [4+esp]
+ xor ecx,eax
+ mov DWORD [esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[3835390401+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [40+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [12+esp]
+ add ebp,ecx
+ mov ecx,DWORD [92+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [36+esp]
+ shr edi,10
+ add ebx,DWORD [72+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [16+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [20+esp]
+ xor edx,esi
+ mov DWORD [36+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [12+esp],esi
+ xor edx,esi
+ add ebx,DWORD [24+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [esp]
+ xor esi,ebp
+ mov DWORD [28+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[4022224774+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [44+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [8+esp]
+ add eax,esi
+ mov esi,DWORD [32+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [40+esp]
+ shr edi,10
+ add ebx,DWORD [76+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [12+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [16+esp]
+ xor edx,ecx
+ mov DWORD [40+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [8+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [20+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [28+esp]
+ xor ecx,eax
+ mov DWORD [24+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[264347078+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [48+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [4+esp]
+ add ebp,ecx
+ mov ecx,DWORD [36+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [44+esp]
+ shr edi,10
+ add ebx,DWORD [80+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [8+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [12+esp]
+ xor edx,esi
+ mov DWORD [44+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [4+esp],esi
+ xor edx,esi
+ add ebx,DWORD [16+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [24+esp]
+ xor esi,ebp
+ mov DWORD [20+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[604807628+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [52+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [esp]
+ add eax,esi
+ mov esi,DWORD [40+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [48+esp]
+ shr edi,10
+ add ebx,DWORD [84+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [4+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [8+esp]
+ xor edx,ecx
+ mov DWORD [48+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [12+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [20+esp]
+ xor ecx,eax
+ mov DWORD [16+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[770255983+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [56+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [28+esp]
+ add ebp,ecx
+ mov ecx,DWORD [44+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [52+esp]
+ shr edi,10
+ add ebx,DWORD [88+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [4+esp]
+ xor edx,esi
+ mov DWORD [52+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [28+esp],esi
+ xor edx,esi
+ add ebx,DWORD [8+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [16+esp]
+ xor esi,ebp
+ mov DWORD [12+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[1249150122+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [60+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [24+esp]
+ add eax,esi
+ mov esi,DWORD [48+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [56+esp]
+ shr edi,10
+ add ebx,DWORD [92+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [28+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [esp]
+ xor edx,ecx
+ mov DWORD [56+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [24+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [4+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [12+esp]
+ xor ecx,eax
+ mov DWORD [8+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[1555081692+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [64+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [20+esp]
+ add ebp,ecx
+ mov ecx,DWORD [52+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [60+esp]
+ shr edi,10
+ add ebx,DWORD [32+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [24+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [28+esp]
+ xor edx,esi
+ mov DWORD [60+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [20+esp],esi
+ xor edx,esi
+ add ebx,DWORD [esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [8+esp]
+ xor esi,ebp
+ mov DWORD [4+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[1996064986+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [68+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [16+esp]
+ add eax,esi
+ mov esi,DWORD [56+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [64+esp]
+ shr edi,10
+ add ebx,DWORD [36+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [20+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [24+esp]
+ xor edx,ecx
+ mov DWORD [64+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [16+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [28+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [4+esp]
+ xor ecx,eax
+ mov DWORD [esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[2554220882+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [72+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [12+esp]
+ add ebp,ecx
+ mov ecx,DWORD [60+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [68+esp]
+ shr edi,10
+ add ebx,DWORD [40+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [16+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [20+esp]
+ xor edx,esi
+ mov DWORD [68+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [12+esp],esi
+ xor edx,esi
+ add ebx,DWORD [24+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [esp]
+ xor esi,ebp
+ mov DWORD [28+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[2821834349+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [76+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [8+esp]
+ add eax,esi
+ mov esi,DWORD [64+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [72+esp]
+ shr edi,10
+ add ebx,DWORD [44+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [12+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [16+esp]
+ xor edx,ecx
+ mov DWORD [72+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [8+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [20+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [28+esp]
+ xor ecx,eax
+ mov DWORD [24+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[2952996808+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [80+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [4+esp]
+ add ebp,ecx
+ mov ecx,DWORD [68+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [76+esp]
+ shr edi,10
+ add ebx,DWORD [48+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [8+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [12+esp]
+ xor edx,esi
+ mov DWORD [76+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [4+esp],esi
+ xor edx,esi
+ add ebx,DWORD [16+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [24+esp]
+ xor esi,ebp
+ mov DWORD [20+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[3210313671+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [84+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [esp]
+ add eax,esi
+ mov esi,DWORD [72+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [80+esp]
+ shr edi,10
+ add ebx,DWORD [52+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [4+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [8+esp]
+ xor edx,ecx
+ mov DWORD [80+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [12+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [20+esp]
+ xor ecx,eax
+ mov DWORD [16+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[3336571891+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [88+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [28+esp]
+ add ebp,ecx
+ mov ecx,DWORD [76+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [84+esp]
+ shr edi,10
+ add ebx,DWORD [56+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [4+esp]
+ xor edx,esi
+ mov DWORD [84+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [28+esp],esi
+ xor edx,esi
+ add ebx,DWORD [8+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [16+esp]
+ xor esi,ebp
+ mov DWORD [12+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[3584528711+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [92+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [24+esp]
+ add eax,esi
+ mov esi,DWORD [80+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [88+esp]
+ shr edi,10
+ add ebx,DWORD [60+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [28+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [esp]
+ xor edx,ecx
+ mov DWORD [88+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [24+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [4+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [12+esp]
+ xor ecx,eax
+ mov DWORD [8+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[113926993+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [32+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [20+esp]
+ add ebp,ecx
+ mov ecx,DWORD [84+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [92+esp]
+ shr edi,10
+ add ebx,DWORD [64+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [24+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [28+esp]
+ xor edx,esi
+ mov DWORD [92+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [20+esp],esi
+ xor edx,esi
+ add ebx,DWORD [esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [8+esp]
+ xor esi,ebp
+ mov DWORD [4+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[338241895+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [36+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [16+esp]
+ add eax,esi
+ mov esi,DWORD [88+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [32+esp]
+ shr edi,10
+ add ebx,DWORD [68+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [20+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [24+esp]
+ xor edx,ecx
+ mov DWORD [32+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [16+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [28+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [4+esp]
+ xor ecx,eax
+ mov DWORD [esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[666307205+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [40+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [12+esp]
+ add ebp,ecx
+ mov ecx,DWORD [92+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [36+esp]
+ shr edi,10
+ add ebx,DWORD [72+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [16+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [20+esp]
+ xor edx,esi
+ mov DWORD [36+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [12+esp],esi
+ xor edx,esi
+ add ebx,DWORD [24+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [esp]
+ xor esi,ebp
+ mov DWORD [28+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[773529912+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [44+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [8+esp]
+ add eax,esi
+ mov esi,DWORD [32+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [40+esp]
+ shr edi,10
+ add ebx,DWORD [76+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [12+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [16+esp]
+ xor edx,ecx
+ mov DWORD [40+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [8+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [20+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [28+esp]
+ xor ecx,eax
+ mov DWORD [24+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[1294757372+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [48+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [4+esp]
+ add ebp,ecx
+ mov ecx,DWORD [36+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [44+esp]
+ shr edi,10
+ add ebx,DWORD [80+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [8+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [12+esp]
+ xor edx,esi
+ mov DWORD [44+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [4+esp],esi
+ xor edx,esi
+ add ebx,DWORD [16+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [24+esp]
+ xor esi,ebp
+ mov DWORD [20+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[1396182291+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [52+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [esp]
+ add eax,esi
+ mov esi,DWORD [40+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [48+esp]
+ shr edi,10
+ add ebx,DWORD [84+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [4+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [8+esp]
+ xor edx,ecx
+ mov DWORD [48+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [12+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [20+esp]
+ xor ecx,eax
+ mov DWORD [16+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[1695183700+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [56+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [28+esp]
+ add ebp,ecx
+ mov ecx,DWORD [44+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [52+esp]
+ shr edi,10
+ add ebx,DWORD [88+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [4+esp]
+ xor edx,esi
+ mov DWORD [52+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [28+esp],esi
+ xor edx,esi
+ add ebx,DWORD [8+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [16+esp]
+ xor esi,ebp
+ mov DWORD [12+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[1986661051+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [60+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [24+esp]
+ add eax,esi
+ mov esi,DWORD [48+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [56+esp]
+ shr edi,10
+ add ebx,DWORD [92+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [28+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [esp]
+ xor edx,ecx
+ mov DWORD [56+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [24+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [4+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [12+esp]
+ xor ecx,eax
+ mov DWORD [8+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[2177026350+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [64+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [20+esp]
+ add ebp,ecx
+ mov ecx,DWORD [52+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [60+esp]
+ shr edi,10
+ add ebx,DWORD [32+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [24+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [28+esp]
+ xor edx,esi
+ mov DWORD [60+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [20+esp],esi
+ xor edx,esi
+ add ebx,DWORD [esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [8+esp]
+ xor esi,ebp
+ mov DWORD [4+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[2456956037+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [68+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [16+esp]
+ add eax,esi
+ mov esi,DWORD [56+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [64+esp]
+ shr edi,10
+ add ebx,DWORD [36+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [20+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [24+esp]
+ xor edx,ecx
+ mov DWORD [64+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [16+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [28+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [4+esp]
+ xor ecx,eax
+ mov DWORD [esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[2730485921+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [72+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [12+esp]
+ add ebp,ecx
+ mov ecx,DWORD [60+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [68+esp]
+ shr edi,10
+ add ebx,DWORD [40+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [16+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [20+esp]
+ xor edx,esi
+ mov DWORD [68+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [12+esp],esi
+ xor edx,esi
+ add ebx,DWORD [24+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [esp]
+ xor esi,ebp
+ mov DWORD [28+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[2820302411+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [76+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [8+esp]
+ add eax,esi
+ mov esi,DWORD [64+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [72+esp]
+ shr edi,10
+ add ebx,DWORD [44+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [12+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [16+esp]
+ xor edx,ecx
+ mov DWORD [72+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [8+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [20+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [28+esp]
+ xor ecx,eax
+ mov DWORD [24+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[3259730800+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [80+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [4+esp]
+ add ebp,ecx
+ mov ecx,DWORD [68+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [76+esp]
+ shr edi,10
+ add ebx,DWORD [48+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [8+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [12+esp]
+ xor edx,esi
+ mov DWORD [76+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [4+esp],esi
+ xor edx,esi
+ add ebx,DWORD [16+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [24+esp]
+ xor esi,ebp
+ mov DWORD [20+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[3345764771+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [84+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [esp]
+ add eax,esi
+ mov esi,DWORD [72+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [80+esp]
+ shr edi,10
+ add ebx,DWORD [52+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [4+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [8+esp]
+ xor edx,ecx
+ mov DWORD [80+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [12+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [20+esp]
+ xor ecx,eax
+ mov DWORD [16+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[3516065817+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [88+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [28+esp]
+ add ebp,ecx
+ mov ecx,DWORD [76+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [84+esp]
+ shr edi,10
+ add ebx,DWORD [56+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [4+esp]
+ xor edx,esi
+ mov DWORD [84+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [28+esp],esi
+ xor edx,esi
+ add ebx,DWORD [8+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [16+esp]
+ xor esi,ebp
+ mov DWORD [12+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[3600352804+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [92+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [24+esp]
+ add eax,esi
+ mov esi,DWORD [80+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [88+esp]
+ shr edi,10
+ add ebx,DWORD [60+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [28+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [esp]
+ xor edx,ecx
+ mov DWORD [88+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [24+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [4+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [12+esp]
+ xor ecx,eax
+ mov DWORD [8+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[4094571909+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [32+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [20+esp]
+ add ebp,ecx
+ mov ecx,DWORD [84+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [92+esp]
+ shr edi,10
+ add ebx,DWORD [64+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [24+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [28+esp]
+ xor edx,esi
+ mov DWORD [92+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [20+esp],esi
+ xor edx,esi
+ add ebx,DWORD [esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [8+esp]
+ xor esi,ebp
+ mov DWORD [4+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[275423344+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [36+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [16+esp]
+ add eax,esi
+ mov esi,DWORD [88+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [32+esp]
+ shr edi,10
+ add ebx,DWORD [68+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [20+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [24+esp]
+ xor edx,ecx
+ mov DWORD [32+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [16+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [28+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [4+esp]
+ xor ecx,eax
+ mov DWORD [esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[430227734+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [40+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [12+esp]
+ add ebp,ecx
+ mov ecx,DWORD [92+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [36+esp]
+ shr edi,10
+ add ebx,DWORD [72+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [16+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [20+esp]
+ xor edx,esi
+ mov DWORD [36+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [12+esp],esi
+ xor edx,esi
+ add ebx,DWORD [24+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [esp]
+ xor esi,ebp
+ mov DWORD [28+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[506948616+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [44+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [8+esp]
+ add eax,esi
+ mov esi,DWORD [32+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [40+esp]
+ shr edi,10
+ add ebx,DWORD [76+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [12+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [16+esp]
+ xor edx,ecx
+ mov DWORD [40+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [8+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [20+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [28+esp]
+ xor ecx,eax
+ mov DWORD [24+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[659060556+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [48+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [4+esp]
+ add ebp,ecx
+ mov ecx,DWORD [36+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [44+esp]
+ shr edi,10
+ add ebx,DWORD [80+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [8+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [12+esp]
+ xor edx,esi
+ mov DWORD [44+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [4+esp],esi
+ xor edx,esi
+ add ebx,DWORD [16+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [24+esp]
+ xor esi,ebp
+ mov DWORD [20+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[883997877+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [52+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [esp]
+ add eax,esi
+ mov esi,DWORD [40+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [48+esp]
+ shr edi,10
+ add ebx,DWORD [84+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [4+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [8+esp]
+ xor edx,ecx
+ mov DWORD [48+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [12+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [20+esp]
+ xor ecx,eax
+ mov DWORD [16+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[958139571+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [56+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [28+esp]
+ add ebp,ecx
+ mov ecx,DWORD [44+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [52+esp]
+ shr edi,10
+ add ebx,DWORD [88+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [4+esp]
+ xor edx,esi
+ mov DWORD [52+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [28+esp],esi
+ xor edx,esi
+ add ebx,DWORD [8+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [16+esp]
+ xor esi,ebp
+ mov DWORD [12+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[1322822218+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [60+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [24+esp]
+ add eax,esi
+ mov esi,DWORD [48+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [56+esp]
+ shr edi,10
+ add ebx,DWORD [92+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [28+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [esp]
+ xor edx,ecx
+ mov DWORD [56+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [24+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [4+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [12+esp]
+ xor ecx,eax
+ mov DWORD [8+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[1537002063+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [64+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [20+esp]
+ add ebp,ecx
+ mov ecx,DWORD [52+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [60+esp]
+ shr edi,10
+ add ebx,DWORD [32+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [24+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [28+esp]
+ xor edx,esi
+ mov DWORD [60+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [20+esp],esi
+ xor edx,esi
+ add ebx,DWORD [esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [8+esp]
+ xor esi,ebp
+ mov DWORD [4+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[1747873779+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [68+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [16+esp]
+ add eax,esi
+ mov esi,DWORD [56+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [64+esp]
+ shr edi,10
+ add ebx,DWORD [36+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [20+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [24+esp]
+ xor edx,ecx
+ mov DWORD [64+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [16+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [28+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [4+esp]
+ xor ecx,eax
+ mov DWORD [esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[1955562222+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [72+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [12+esp]
+ add ebp,ecx
+ mov ecx,DWORD [60+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [68+esp]
+ shr edi,10
+ add ebx,DWORD [40+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [16+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [20+esp]
+ xor edx,esi
+ mov DWORD [68+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [12+esp],esi
+ xor edx,esi
+ add ebx,DWORD [24+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [esp]
+ xor esi,ebp
+ mov DWORD [28+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[2024104815+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [76+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [8+esp]
+ add eax,esi
+ mov esi,DWORD [64+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [72+esp]
+ shr edi,10
+ add ebx,DWORD [44+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [12+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [16+esp]
+ xor edx,ecx
+ mov DWORD [72+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [8+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [20+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [28+esp]
+ xor ecx,eax
+ mov DWORD [24+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[2227730452+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [80+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [4+esp]
+ add ebp,ecx
+ mov ecx,DWORD [68+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [76+esp]
+ shr edi,10
+ add ebx,DWORD [48+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [8+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [12+esp]
+ xor edx,esi
+ mov DWORD [76+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [4+esp],esi
+ xor edx,esi
+ add ebx,DWORD [16+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [24+esp]
+ xor esi,ebp
+ mov DWORD [20+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[2361852424+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [84+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [esp]
+ add eax,esi
+ mov esi,DWORD [72+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [80+esp]
+ shr edi,10
+ add ebx,DWORD [52+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [4+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [8+esp]
+ xor edx,ecx
+ mov DWORD [80+esp],ebx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [12+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [20+esp]
+ xor ecx,eax
+ mov DWORD [16+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[2428436474+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [88+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [28+esp]
+ add ebp,ecx
+ mov ecx,DWORD [76+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [84+esp]
+ shr edi,10
+ add ebx,DWORD [56+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [4+esp]
+ xor edx,esi
+ mov DWORD [84+esp],ebx
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [28+esp],esi
+ xor edx,esi
+ add ebx,DWORD [8+esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [16+esp]
+ xor esi,ebp
+ mov DWORD [12+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[2756734187+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ mov ecx,DWORD [92+esp]
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [24+esp]
+ add eax,esi
+ mov esi,DWORD [80+esp]
+ mov ebx,ecx
+ ror ecx,11
+ mov edi,esi
+ ror esi,2
+ xor ecx,ebx
+ shr ebx,3
+ ror ecx,7
+ xor esi,edi
+ xor ebx,ecx
+ ror esi,17
+ add ebx,DWORD [88+esp]
+ shr edi,10
+ add ebx,DWORD [60+esp]
+ mov ecx,edx
+ xor edi,esi
+ mov esi,DWORD [28+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [esp]
+ xor edx,ecx
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [24+esp],ecx
+ xor edx,ecx
+ add ebx,DWORD [4+esp]
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add ebx,edi
+ ror ecx,9
+ mov esi,eax
+ mov edi,DWORD [12+esp]
+ xor ecx,eax
+ mov DWORD [8+esp],eax
+ xor eax,edi
+ ror ecx,11
+ and ebp,eax
+ lea edx,[3204031479+edx*1+ebx]
+ xor ecx,esi
+ xor ebp,edi
+ mov esi,DWORD [32+esp]
+ ror ecx,2
+ add ebp,edx
+ add edx,DWORD [20+esp]
+ add ebp,ecx
+ mov ecx,DWORD [84+esp]
+ mov ebx,esi
+ ror esi,11
+ mov edi,ecx
+ ror ecx,2
+ xor esi,ebx
+ shr ebx,3
+ ror esi,7
+ xor ecx,edi
+ xor ebx,esi
+ ror ecx,17
+ add ebx,DWORD [92+esp]
+ shr edi,10
+ add ebx,DWORD [64+esp]
+ mov esi,edx
+ xor edi,ecx
+ mov ecx,DWORD [24+esp]
+ ror edx,14
+ add ebx,edi
+ mov edi,DWORD [28+esp]
+ xor edx,esi
+ xor ecx,edi
+ ror edx,5
+ and ecx,esi
+ mov DWORD [20+esp],esi
+ xor edx,esi
+ add ebx,DWORD [esp]
+ xor edi,ecx
+ ror edx,6
+ mov esi,ebp
+ add ebx,edi
+ ror esi,9
+ mov ecx,ebp
+ mov edi,DWORD [8+esp]
+ xor esi,ebp
+ mov DWORD [4+esp],ebp
+ xor ebp,edi
+ ror esi,11
+ and eax,ebp
+ lea edx,[3329325298+edx*1+ebx]
+ xor esi,ecx
+ xor eax,edi
+ ror esi,2
+ add eax,edx
+ add edx,DWORD [16+esp]
+ add eax,esi
+ mov esi,DWORD [96+esp]
+ xor ebp,edi
+ mov ecx,DWORD [12+esp]
+ add eax,DWORD [esi]
+ add ebp,DWORD [4+esi]
+ add edi,DWORD [8+esi]
+ add ecx,DWORD [12+esi]
+ mov DWORD [esi],eax
+ mov DWORD [4+esi],ebp
+ mov DWORD [8+esi],edi
+ mov DWORD [12+esi],ecx
+ mov DWORD [4+esp],ebp
+ xor ebp,edi
+ mov DWORD [8+esp],edi
+ mov DWORD [12+esp],ecx
+ mov edi,DWORD [20+esp]
+ mov ebx,DWORD [24+esp]
+ mov ecx,DWORD [28+esp]
+ add edx,DWORD [16+esi]
+ add edi,DWORD [20+esi]
+ add ebx,DWORD [24+esi]
+ add ecx,DWORD [28+esi]
+ mov DWORD [16+esi],edx
+ mov DWORD [20+esi],edi
+ mov DWORD [24+esi],ebx
+ mov DWORD [28+esi],ecx
+ mov DWORD [20+esp],edi
+ mov edi,DWORD [100+esp]
+ mov DWORD [24+esp],ebx
+ mov DWORD [28+esp],ecx
+ cmp edi,DWORD [104+esp]
+ jb NEAR L$006grand_loop
+ mov esp,DWORD [108+esp]
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _sha256_block_data_order_ssse3
+align 16
+_sha256_block_data_order_ssse3:
+L$_sha256_block_data_order_ssse3_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov esi,DWORD [20+esp]
+ mov edi,DWORD [24+esp]
+ mov eax,DWORD [28+esp]
+ mov ebx,esp
+ call L$007pic_point
+L$007pic_point:
+ pop ebp
+ lea ebp,[(L$K256-L$007pic_point)+ebp]
+ sub esp,16
+ and esp,-64
+ shl eax,6
+ add eax,edi
+ mov DWORD [esp],esi
+ mov DWORD [4+esp],edi
+ mov DWORD [8+esp],eax
+ mov DWORD [12+esp],ebx
+ lea esp,[esp-96]
+ mov eax,DWORD [esi]
+ mov ebx,DWORD [4+esi]
+ mov ecx,DWORD [8+esi]
+ mov edi,DWORD [12+esi]
+ mov DWORD [4+esp],ebx
+ xor ebx,ecx
+ mov DWORD [8+esp],ecx
+ mov DWORD [12+esp],edi
+ mov edx,DWORD [16+esi]
+ mov edi,DWORD [20+esi]
+ mov ecx,DWORD [24+esi]
+ mov esi,DWORD [28+esi]
+ mov DWORD [20+esp],edi
+ mov edi,DWORD [100+esp]
+ mov DWORD [24+esp],ecx
+ mov DWORD [28+esp],esi
+ movdqa xmm7,[256+ebp]
+ jmp NEAR L$008grand_ssse3
+align 16
+L$008grand_ssse3:
+ movdqu xmm0,[edi]
+ movdqu xmm1,[16+edi]
+ movdqu xmm2,[32+edi]
+ movdqu xmm3,[48+edi]
+ add edi,64
+db 102,15,56,0,199
+ mov DWORD [100+esp],edi
+db 102,15,56,0,207
+ movdqa xmm4,[ebp]
+db 102,15,56,0,215
+ movdqa xmm5,[16+ebp]
+ paddd xmm4,xmm0
+db 102,15,56,0,223
+ movdqa xmm6,[32+ebp]
+ paddd xmm5,xmm1
+ movdqa xmm7,[48+ebp]
+ movdqa [32+esp],xmm4
+ paddd xmm6,xmm2
+ movdqa [48+esp],xmm5
+ paddd xmm7,xmm3
+ movdqa [64+esp],xmm6
+ movdqa [80+esp],xmm7
+ jmp NEAR L$009ssse3_00_47
+align 16
+L$009ssse3_00_47:
+ add ebp,64
+ mov ecx,edx
+ movdqa xmm4,xmm1
+ ror edx,14
+ mov esi,DWORD [20+esp]
+ movdqa xmm7,xmm3
+ xor edx,ecx
+ mov edi,DWORD [24+esp]
+db 102,15,58,15,224,4
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+db 102,15,58,15,250,4
+ mov DWORD [16+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ movdqa xmm5,xmm4
+ ror edx,6
+ mov ecx,eax
+ movdqa xmm6,xmm4
+ add edx,edi
+ mov edi,DWORD [4+esp]
+ psrld xmm4,3
+ mov esi,eax
+ ror ecx,9
+ paddd xmm0,xmm7
+ mov DWORD [esp],eax
+ xor ecx,eax
+ psrld xmm6,7
+ xor eax,edi
+ add edx,DWORD [28+esp]
+ ror ecx,11
+ and ebx,eax
+ pshufd xmm7,xmm3,250
+ xor ecx,esi
+ add edx,DWORD [32+esp]
+ pslld xmm5,14
+ xor ebx,edi
+ ror ecx,2
+ pxor xmm4,xmm6
+ add ebx,edx
+ add edx,DWORD [12+esp]
+ psrld xmm6,11
+ add ebx,ecx
+ mov ecx,edx
+ ror edx,14
+ pxor xmm4,xmm5
+ mov esi,DWORD [16+esp]
+ xor edx,ecx
+ pslld xmm5,11
+ mov edi,DWORD [20+esp]
+ xor esi,edi
+ ror edx,5
+ pxor xmm4,xmm6
+ and esi,ecx
+ mov DWORD [12+esp],ecx
+ movdqa xmm6,xmm7
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ pxor xmm4,xmm5
+ mov ecx,ebx
+ add edx,edi
+ psrld xmm7,10
+ mov edi,DWORD [esp]
+ mov esi,ebx
+ ror ecx,9
+ paddd xmm0,xmm4
+ mov DWORD [28+esp],ebx
+ xor ecx,ebx
+ psrlq xmm6,17
+ xor ebx,edi
+ add edx,DWORD [24+esp]
+ ror ecx,11
+ pxor xmm7,xmm6
+ and eax,ebx
+ xor ecx,esi
+ psrlq xmm6,2
+ add edx,DWORD [36+esp]
+ xor eax,edi
+ ror ecx,2
+ pxor xmm7,xmm6
+ add eax,edx
+ add edx,DWORD [8+esp]
+ pshufd xmm7,xmm7,128
+ add eax,ecx
+ mov ecx,edx
+ ror edx,14
+ mov esi,DWORD [12+esp]
+ xor edx,ecx
+ mov edi,DWORD [16+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ psrldq xmm7,8
+ mov DWORD [8+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ paddd xmm0,xmm7
+ ror edx,6
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [28+esp]
+ mov esi,eax
+ ror ecx,9
+ mov DWORD [24+esp],eax
+ pshufd xmm7,xmm0,80
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [20+esp]
+ movdqa xmm6,xmm7
+ ror ecx,11
+ psrld xmm7,10
+ and ebx,eax
+ psrlq xmm6,17
+ xor ecx,esi
+ add edx,DWORD [40+esp]
+ xor ebx,edi
+ ror ecx,2
+ pxor xmm7,xmm6
+ add ebx,edx
+ add edx,DWORD [4+esp]
+ psrlq xmm6,2
+ add ebx,ecx
+ mov ecx,edx
+ ror edx,14
+ pxor xmm7,xmm6
+ mov esi,DWORD [8+esp]
+ xor edx,ecx
+ mov edi,DWORD [12+esp]
+ pshufd xmm7,xmm7,8
+ xor esi,edi
+ ror edx,5
+ movdqa xmm6,[ebp]
+ and esi,ecx
+ mov DWORD [4+esp],ecx
+ pslldq xmm7,8
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [24+esp]
+ mov esi,ebx
+ ror ecx,9
+ paddd xmm0,xmm7
+ mov DWORD [20+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [16+esp]
+ paddd xmm6,xmm0
+ ror ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [44+esp]
+ xor eax,edi
+ ror ecx,2
+ add eax,edx
+ add edx,DWORD [esp]
+ add eax,ecx
+ movdqa [32+esp],xmm6
+ mov ecx,edx
+ movdqa xmm4,xmm2
+ ror edx,14
+ mov esi,DWORD [4+esp]
+ movdqa xmm7,xmm0
+ xor edx,ecx
+ mov edi,DWORD [8+esp]
+db 102,15,58,15,225,4
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+db 102,15,58,15,251,4
+ mov DWORD [esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ movdqa xmm5,xmm4
+ ror edx,6
+ mov ecx,eax
+ movdqa xmm6,xmm4
+ add edx,edi
+ mov edi,DWORD [20+esp]
+ psrld xmm4,3
+ mov esi,eax
+ ror ecx,9
+ paddd xmm1,xmm7
+ mov DWORD [16+esp],eax
+ xor ecx,eax
+ psrld xmm6,7
+ xor eax,edi
+ add edx,DWORD [12+esp]
+ ror ecx,11
+ and ebx,eax
+ pshufd xmm7,xmm0,250
+ xor ecx,esi
+ add edx,DWORD [48+esp]
+ pslld xmm5,14
+ xor ebx,edi
+ ror ecx,2
+ pxor xmm4,xmm6
+ add ebx,edx
+ add edx,DWORD [28+esp]
+ psrld xmm6,11
+ add ebx,ecx
+ mov ecx,edx
+ ror edx,14
+ pxor xmm4,xmm5
+ mov esi,DWORD [esp]
+ xor edx,ecx
+ pslld xmm5,11
+ mov edi,DWORD [4+esp]
+ xor esi,edi
+ ror edx,5
+ pxor xmm4,xmm6
+ and esi,ecx
+ mov DWORD [28+esp],ecx
+ movdqa xmm6,xmm7
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ pxor xmm4,xmm5
+ mov ecx,ebx
+ add edx,edi
+ psrld xmm7,10
+ mov edi,DWORD [16+esp]
+ mov esi,ebx
+ ror ecx,9
+ paddd xmm1,xmm4
+ mov DWORD [12+esp],ebx
+ xor ecx,ebx
+ psrlq xmm6,17
+ xor ebx,edi
+ add edx,DWORD [8+esp]
+ ror ecx,11
+ pxor xmm7,xmm6
+ and eax,ebx
+ xor ecx,esi
+ psrlq xmm6,2
+ add edx,DWORD [52+esp]
+ xor eax,edi
+ ror ecx,2
+ pxor xmm7,xmm6
+ add eax,edx
+ add edx,DWORD [24+esp]
+ pshufd xmm7,xmm7,128
+ add eax,ecx
+ mov ecx,edx
+ ror edx,14
+ mov esi,DWORD [28+esp]
+ xor edx,ecx
+ mov edi,DWORD [esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ psrldq xmm7,8
+ mov DWORD [24+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ paddd xmm1,xmm7
+ ror edx,6
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [12+esp]
+ mov esi,eax
+ ror ecx,9
+ mov DWORD [8+esp],eax
+ pshufd xmm7,xmm1,80
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [4+esp]
+ movdqa xmm6,xmm7
+ ror ecx,11
+ psrld xmm7,10
+ and ebx,eax
+ psrlq xmm6,17
+ xor ecx,esi
+ add edx,DWORD [56+esp]
+ xor ebx,edi
+ ror ecx,2
+ pxor xmm7,xmm6
+ add ebx,edx
+ add edx,DWORD [20+esp]
+ psrlq xmm6,2
+ add ebx,ecx
+ mov ecx,edx
+ ror edx,14
+ pxor xmm7,xmm6
+ mov esi,DWORD [24+esp]
+ xor edx,ecx
+ mov edi,DWORD [28+esp]
+ pshufd xmm7,xmm7,8
+ xor esi,edi
+ ror edx,5
+ movdqa xmm6,[16+ebp]
+ and esi,ecx
+ mov DWORD [20+esp],ecx
+ pslldq xmm7,8
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [8+esp]
+ mov esi,ebx
+ ror ecx,9
+ paddd xmm1,xmm7
+ mov DWORD [4+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [esp]
+ paddd xmm6,xmm1
+ ror ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [60+esp]
+ xor eax,edi
+ ror ecx,2
+ add eax,edx
+ add edx,DWORD [16+esp]
+ add eax,ecx
+ movdqa [48+esp],xmm6
+ mov ecx,edx
+ movdqa xmm4,xmm3
+ ror edx,14
+ mov esi,DWORD [20+esp]
+ movdqa xmm7,xmm1
+ xor edx,ecx
+ mov edi,DWORD [24+esp]
+db 102,15,58,15,226,4
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+db 102,15,58,15,248,4
+ mov DWORD [16+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ movdqa xmm5,xmm4
+ ror edx,6
+ mov ecx,eax
+ movdqa xmm6,xmm4
+ add edx,edi
+ mov edi,DWORD [4+esp]
+ psrld xmm4,3
+ mov esi,eax
+ ror ecx,9
+ paddd xmm2,xmm7
+ mov DWORD [esp],eax
+ xor ecx,eax
+ psrld xmm6,7
+ xor eax,edi
+ add edx,DWORD [28+esp]
+ ror ecx,11
+ and ebx,eax
+ pshufd xmm7,xmm1,250
+ xor ecx,esi
+ add edx,DWORD [64+esp]
+ pslld xmm5,14
+ xor ebx,edi
+ ror ecx,2
+ pxor xmm4,xmm6
+ add ebx,edx
+ add edx,DWORD [12+esp]
+ psrld xmm6,11
+ add ebx,ecx
+ mov ecx,edx
+ ror edx,14
+ pxor xmm4,xmm5
+ mov esi,DWORD [16+esp]
+ xor edx,ecx
+ pslld xmm5,11
+ mov edi,DWORD [20+esp]
+ xor esi,edi
+ ror edx,5
+ pxor xmm4,xmm6
+ and esi,ecx
+ mov DWORD [12+esp],ecx
+ movdqa xmm6,xmm7
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ pxor xmm4,xmm5
+ mov ecx,ebx
+ add edx,edi
+ psrld xmm7,10
+ mov edi,DWORD [esp]
+ mov esi,ebx
+ ror ecx,9
+ paddd xmm2,xmm4
+ mov DWORD [28+esp],ebx
+ xor ecx,ebx
+ psrlq xmm6,17
+ xor ebx,edi
+ add edx,DWORD [24+esp]
+ ror ecx,11
+ pxor xmm7,xmm6
+ and eax,ebx
+ xor ecx,esi
+ psrlq xmm6,2
+ add edx,DWORD [68+esp]
+ xor eax,edi
+ ror ecx,2
+ pxor xmm7,xmm6
+ add eax,edx
+ add edx,DWORD [8+esp]
+ pshufd xmm7,xmm7,128
+ add eax,ecx
+ mov ecx,edx
+ ror edx,14
+ mov esi,DWORD [12+esp]
+ xor edx,ecx
+ mov edi,DWORD [16+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ psrldq xmm7,8
+ mov DWORD [8+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ paddd xmm2,xmm7
+ ror edx,6
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [28+esp]
+ mov esi,eax
+ ror ecx,9
+ mov DWORD [24+esp],eax
+ pshufd xmm7,xmm2,80
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [20+esp]
+ movdqa xmm6,xmm7
+ ror ecx,11
+ psrld xmm7,10
+ and ebx,eax
+ psrlq xmm6,17
+ xor ecx,esi
+ add edx,DWORD [72+esp]
+ xor ebx,edi
+ ror ecx,2
+ pxor xmm7,xmm6
+ add ebx,edx
+ add edx,DWORD [4+esp]
+ psrlq xmm6,2
+ add ebx,ecx
+ mov ecx,edx
+ ror edx,14
+ pxor xmm7,xmm6
+ mov esi,DWORD [8+esp]
+ xor edx,ecx
+ mov edi,DWORD [12+esp]
+ pshufd xmm7,xmm7,8
+ xor esi,edi
+ ror edx,5
+ movdqa xmm6,[32+ebp]
+ and esi,ecx
+ mov DWORD [4+esp],ecx
+ pslldq xmm7,8
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [24+esp]
+ mov esi,ebx
+ ror ecx,9
+ paddd xmm2,xmm7
+ mov DWORD [20+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [16+esp]
+ paddd xmm6,xmm2
+ ror ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [76+esp]
+ xor eax,edi
+ ror ecx,2
+ add eax,edx
+ add edx,DWORD [esp]
+ add eax,ecx
+ movdqa [64+esp],xmm6
+ mov ecx,edx
+ movdqa xmm4,xmm0
+ ror edx,14
+ mov esi,DWORD [4+esp]
+ movdqa xmm7,xmm2
+ xor edx,ecx
+ mov edi,DWORD [8+esp]
+db 102,15,58,15,227,4
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+db 102,15,58,15,249,4
+ mov DWORD [esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ movdqa xmm5,xmm4
+ ror edx,6
+ mov ecx,eax
+ movdqa xmm6,xmm4
+ add edx,edi
+ mov edi,DWORD [20+esp]
+ psrld xmm4,3
+ mov esi,eax
+ ror ecx,9
+ paddd xmm3,xmm7
+ mov DWORD [16+esp],eax
+ xor ecx,eax
+ psrld xmm6,7
+ xor eax,edi
+ add edx,DWORD [12+esp]
+ ror ecx,11
+ and ebx,eax
+ pshufd xmm7,xmm2,250
+ xor ecx,esi
+ add edx,DWORD [80+esp]
+ pslld xmm5,14
+ xor ebx,edi
+ ror ecx,2
+ pxor xmm4,xmm6
+ add ebx,edx
+ add edx,DWORD [28+esp]
+ psrld xmm6,11
+ add ebx,ecx
+ mov ecx,edx
+ ror edx,14
+ pxor xmm4,xmm5
+ mov esi,DWORD [esp]
+ xor edx,ecx
+ pslld xmm5,11
+ mov edi,DWORD [4+esp]
+ xor esi,edi
+ ror edx,5
+ pxor xmm4,xmm6
+ and esi,ecx
+ mov DWORD [28+esp],ecx
+ movdqa xmm6,xmm7
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ pxor xmm4,xmm5
+ mov ecx,ebx
+ add edx,edi
+ psrld xmm7,10
+ mov edi,DWORD [16+esp]
+ mov esi,ebx
+ ror ecx,9
+ paddd xmm3,xmm4
+ mov DWORD [12+esp],ebx
+ xor ecx,ebx
+ psrlq xmm6,17
+ xor ebx,edi
+ add edx,DWORD [8+esp]
+ ror ecx,11
+ pxor xmm7,xmm6
+ and eax,ebx
+ xor ecx,esi
+ psrlq xmm6,2
+ add edx,DWORD [84+esp]
+ xor eax,edi
+ ror ecx,2
+ pxor xmm7,xmm6
+ add eax,edx
+ add edx,DWORD [24+esp]
+ pshufd xmm7,xmm7,128
+ add eax,ecx
+ mov ecx,edx
+ ror edx,14
+ mov esi,DWORD [28+esp]
+ xor edx,ecx
+ mov edi,DWORD [esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ psrldq xmm7,8
+ mov DWORD [24+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ paddd xmm3,xmm7
+ ror edx,6
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [12+esp]
+ mov esi,eax
+ ror ecx,9
+ mov DWORD [8+esp],eax
+ pshufd xmm7,xmm3,80
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [4+esp]
+ movdqa xmm6,xmm7
+ ror ecx,11
+ psrld xmm7,10
+ and ebx,eax
+ psrlq xmm6,17
+ xor ecx,esi
+ add edx,DWORD [88+esp]
+ xor ebx,edi
+ ror ecx,2
+ pxor xmm7,xmm6
+ add ebx,edx
+ add edx,DWORD [20+esp]
+ psrlq xmm6,2
+ add ebx,ecx
+ mov ecx,edx
+ ror edx,14
+ pxor xmm7,xmm6
+ mov esi,DWORD [24+esp]
+ xor edx,ecx
+ mov edi,DWORD [28+esp]
+ pshufd xmm7,xmm7,8
+ xor esi,edi
+ ror edx,5
+ movdqa xmm6,[48+ebp]
+ and esi,ecx
+ mov DWORD [20+esp],ecx
+ pslldq xmm7,8
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [8+esp]
+ mov esi,ebx
+ ror ecx,9
+ paddd xmm3,xmm7
+ mov DWORD [4+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [esp]
+ paddd xmm6,xmm3
+ ror ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [92+esp]
+ xor eax,edi
+ ror ecx,2
+ add eax,edx
+ add edx,DWORD [16+esp]
+ add eax,ecx
+ movdqa [80+esp],xmm6
+ cmp DWORD [64+ebp],66051
+ jne NEAR L$009ssse3_00_47
+ mov ecx,edx
+ ror edx,14
+ mov esi,DWORD [20+esp]
+ xor edx,ecx
+ mov edi,DWORD [24+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [16+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [4+esp]
+ mov esi,eax
+ ror ecx,9
+ mov DWORD [esp],eax
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [28+esp]
+ ror ecx,11
+ and ebx,eax
+ xor ecx,esi
+ add edx,DWORD [32+esp]
+ xor ebx,edi
+ ror ecx,2
+ add ebx,edx
+ add edx,DWORD [12+esp]
+ add ebx,ecx
+ mov ecx,edx
+ ror edx,14
+ mov esi,DWORD [16+esp]
+ xor edx,ecx
+ mov edi,DWORD [20+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [12+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [esp]
+ mov esi,ebx
+ ror ecx,9
+ mov DWORD [28+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [24+esp]
+ ror ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [36+esp]
+ xor eax,edi
+ ror ecx,2
+ add eax,edx
+ add edx,DWORD [8+esp]
+ add eax,ecx
+ mov ecx,edx
+ ror edx,14
+ mov esi,DWORD [12+esp]
+ xor edx,ecx
+ mov edi,DWORD [16+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [8+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [28+esp]
+ mov esi,eax
+ ror ecx,9
+ mov DWORD [24+esp],eax
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [20+esp]
+ ror ecx,11
+ and ebx,eax
+ xor ecx,esi
+ add edx,DWORD [40+esp]
+ xor ebx,edi
+ ror ecx,2
+ add ebx,edx
+ add edx,DWORD [4+esp]
+ add ebx,ecx
+ mov ecx,edx
+ ror edx,14
+ mov esi,DWORD [8+esp]
+ xor edx,ecx
+ mov edi,DWORD [12+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [4+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [24+esp]
+ mov esi,ebx
+ ror ecx,9
+ mov DWORD [20+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [16+esp]
+ ror ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [44+esp]
+ xor eax,edi
+ ror ecx,2
+ add eax,edx
+ add edx,DWORD [esp]
+ add eax,ecx
+ mov ecx,edx
+ ror edx,14
+ mov esi,DWORD [4+esp]
+ xor edx,ecx
+ mov edi,DWORD [8+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [20+esp]
+ mov esi,eax
+ ror ecx,9
+ mov DWORD [16+esp],eax
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [12+esp]
+ ror ecx,11
+ and ebx,eax
+ xor ecx,esi
+ add edx,DWORD [48+esp]
+ xor ebx,edi
+ ror ecx,2
+ add ebx,edx
+ add edx,DWORD [28+esp]
+ add ebx,ecx
+ mov ecx,edx
+ ror edx,14
+ mov esi,DWORD [esp]
+ xor edx,ecx
+ mov edi,DWORD [4+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [28+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [16+esp]
+ mov esi,ebx
+ ror ecx,9
+ mov DWORD [12+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [8+esp]
+ ror ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [52+esp]
+ xor eax,edi
+ ror ecx,2
+ add eax,edx
+ add edx,DWORD [24+esp]
+ add eax,ecx
+ mov ecx,edx
+ ror edx,14
+ mov esi,DWORD [28+esp]
+ xor edx,ecx
+ mov edi,DWORD [esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [24+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [12+esp]
+ mov esi,eax
+ ror ecx,9
+ mov DWORD [8+esp],eax
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [4+esp]
+ ror ecx,11
+ and ebx,eax
+ xor ecx,esi
+ add edx,DWORD [56+esp]
+ xor ebx,edi
+ ror ecx,2
+ add ebx,edx
+ add edx,DWORD [20+esp]
+ add ebx,ecx
+ mov ecx,edx
+ ror edx,14
+ mov esi,DWORD [24+esp]
+ xor edx,ecx
+ mov edi,DWORD [28+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [20+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [8+esp]
+ mov esi,ebx
+ ror ecx,9
+ mov DWORD [4+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [esp]
+ ror ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [60+esp]
+ xor eax,edi
+ ror ecx,2
+ add eax,edx
+ add edx,DWORD [16+esp]
+ add eax,ecx
+ mov ecx,edx
+ ror edx,14
+ mov esi,DWORD [20+esp]
+ xor edx,ecx
+ mov edi,DWORD [24+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [16+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [4+esp]
+ mov esi,eax
+ ror ecx,9
+ mov DWORD [esp],eax
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [28+esp]
+ ror ecx,11
+ and ebx,eax
+ xor ecx,esi
+ add edx,DWORD [64+esp]
+ xor ebx,edi
+ ror ecx,2
+ add ebx,edx
+ add edx,DWORD [12+esp]
+ add ebx,ecx
+ mov ecx,edx
+ ror edx,14
+ mov esi,DWORD [16+esp]
+ xor edx,ecx
+ mov edi,DWORD [20+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [12+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [esp]
+ mov esi,ebx
+ ror ecx,9
+ mov DWORD [28+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [24+esp]
+ ror ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [68+esp]
+ xor eax,edi
+ ror ecx,2
+ add eax,edx
+ add edx,DWORD [8+esp]
+ add eax,ecx
+ mov ecx,edx
+ ror edx,14
+ mov esi,DWORD [12+esp]
+ xor edx,ecx
+ mov edi,DWORD [16+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [8+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [28+esp]
+ mov esi,eax
+ ror ecx,9
+ mov DWORD [24+esp],eax
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [20+esp]
+ ror ecx,11
+ and ebx,eax
+ xor ecx,esi
+ add edx,DWORD [72+esp]
+ xor ebx,edi
+ ror ecx,2
+ add ebx,edx
+ add edx,DWORD [4+esp]
+ add ebx,ecx
+ mov ecx,edx
+ ror edx,14
+ mov esi,DWORD [8+esp]
+ xor edx,ecx
+ mov edi,DWORD [12+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [4+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [24+esp]
+ mov esi,ebx
+ ror ecx,9
+ mov DWORD [20+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [16+esp]
+ ror ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [76+esp]
+ xor eax,edi
+ ror ecx,2
+ add eax,edx
+ add edx,DWORD [esp]
+ add eax,ecx
+ mov ecx,edx
+ ror edx,14
+ mov esi,DWORD [4+esp]
+ xor edx,ecx
+ mov edi,DWORD [8+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [20+esp]
+ mov esi,eax
+ ror ecx,9
+ mov DWORD [16+esp],eax
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [12+esp]
+ ror ecx,11
+ and ebx,eax
+ xor ecx,esi
+ add edx,DWORD [80+esp]
+ xor ebx,edi
+ ror ecx,2
+ add ebx,edx
+ add edx,DWORD [28+esp]
+ add ebx,ecx
+ mov ecx,edx
+ ror edx,14
+ mov esi,DWORD [esp]
+ xor edx,ecx
+ mov edi,DWORD [4+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [28+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [16+esp]
+ mov esi,ebx
+ ror ecx,9
+ mov DWORD [12+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [8+esp]
+ ror ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [84+esp]
+ xor eax,edi
+ ror ecx,2
+ add eax,edx
+ add edx,DWORD [24+esp]
+ add eax,ecx
+ mov ecx,edx
+ ror edx,14
+ mov esi,DWORD [28+esp]
+ xor edx,ecx
+ mov edi,DWORD [esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [24+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [12+esp]
+ mov esi,eax
+ ror ecx,9
+ mov DWORD [8+esp],eax
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [4+esp]
+ ror ecx,11
+ and ebx,eax
+ xor ecx,esi
+ add edx,DWORD [88+esp]
+ xor ebx,edi
+ ror ecx,2
+ add ebx,edx
+ add edx,DWORD [20+esp]
+ add ebx,ecx
+ mov ecx,edx
+ ror edx,14
+ mov esi,DWORD [24+esp]
+ xor edx,ecx
+ mov edi,DWORD [28+esp]
+ xor esi,edi
+ ror edx,5
+ and esi,ecx
+ mov DWORD [20+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ ror edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [8+esp]
+ mov esi,ebx
+ ror ecx,9
+ mov DWORD [4+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [esp]
+ ror ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [92+esp]
+ xor eax,edi
+ ror ecx,2
+ add eax,edx
+ add edx,DWORD [16+esp]
+ add eax,ecx
+ mov esi,DWORD [96+esp]
+ xor ebx,edi
+ mov ecx,DWORD [12+esp]
+ add eax,DWORD [esi]
+ add ebx,DWORD [4+esi]
+ add edi,DWORD [8+esi]
+ add ecx,DWORD [12+esi]
+ mov DWORD [esi],eax
+ mov DWORD [4+esi],ebx
+ mov DWORD [8+esi],edi
+ mov DWORD [12+esi],ecx
+ mov DWORD [4+esp],ebx
+ xor ebx,edi
+ mov DWORD [8+esp],edi
+ mov DWORD [12+esp],ecx
+ mov edi,DWORD [20+esp]
+ mov ecx,DWORD [24+esp]
+ add edx,DWORD [16+esi]
+ add edi,DWORD [20+esi]
+ add ecx,DWORD [24+esi]
+ mov DWORD [16+esi],edx
+ mov DWORD [20+esi],edi
+ mov DWORD [20+esp],edi
+ mov edi,DWORD [28+esp]
+ mov DWORD [24+esi],ecx
+ add edi,DWORD [28+esi]
+ mov DWORD [24+esp],ecx
+ mov DWORD [28+esi],edi
+ mov DWORD [28+esp],edi
+ mov edi,DWORD [100+esp]
+ movdqa xmm7,[64+ebp]
+ sub ebp,192
+ cmp edi,DWORD [104+esp]
+ jb NEAR L$008grand_ssse3
+ mov esp,DWORD [108+esp]
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _sha256_block_data_order_avx
+align 16
+_sha256_block_data_order_avx:
+L$_sha256_block_data_order_avx_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov esi,DWORD [20+esp]
+ mov edi,DWORD [24+esp]
+ mov eax,DWORD [28+esp]
+ mov ebx,esp
+ call L$010pic_point
+L$010pic_point:
+ pop ebp
+ lea ebp,[(L$K256-L$010pic_point)+ebp]
+ sub esp,16
+ and esp,-64
+ shl eax,6
+ add eax,edi
+ mov DWORD [esp],esi
+ mov DWORD [4+esp],edi
+ mov DWORD [8+esp],eax
+ mov DWORD [12+esp],ebx
+ lea esp,[esp-96]
+ vzeroall
+ mov eax,DWORD [esi]
+ mov ebx,DWORD [4+esi]
+ mov ecx,DWORD [8+esi]
+ mov edi,DWORD [12+esi]
+ mov DWORD [4+esp],ebx
+ xor ebx,ecx
+ mov DWORD [8+esp],ecx
+ mov DWORD [12+esp],edi
+ mov edx,DWORD [16+esi]
+ mov edi,DWORD [20+esi]
+ mov ecx,DWORD [24+esi]
+ mov esi,DWORD [28+esi]
+ mov DWORD [20+esp],edi
+ mov edi,DWORD [100+esp]
+ mov DWORD [24+esp],ecx
+ mov DWORD [28+esp],esi
+ vmovdqa xmm7,[256+ebp]
+ jmp NEAR L$011grand_avx
+align 32
+L$011grand_avx:
+ vmovdqu xmm0,[edi]
+ vmovdqu xmm1,[16+edi]
+ vmovdqu xmm2,[32+edi]
+ vmovdqu xmm3,[48+edi]
+ add edi,64
+ vpshufb xmm0,xmm0,xmm7
+ mov DWORD [100+esp],edi
+ vpshufb xmm1,xmm1,xmm7
+ vpshufb xmm2,xmm2,xmm7
+ vpaddd xmm4,xmm0,[ebp]
+ vpshufb xmm3,xmm3,xmm7
+ vpaddd xmm5,xmm1,[16+ebp]
+ vpaddd xmm6,xmm2,[32+ebp]
+ vpaddd xmm7,xmm3,[48+ebp]
+ vmovdqa [32+esp],xmm4
+ vmovdqa [48+esp],xmm5
+ vmovdqa [64+esp],xmm6
+ vmovdqa [80+esp],xmm7
+ jmp NEAR L$012avx_00_47
+align 16
+L$012avx_00_47:
+ add ebp,64
+ vpalignr xmm4,xmm1,xmm0,4
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [20+esp]
+ vpalignr xmm7,xmm3,xmm2,4
+ xor edx,ecx
+ mov edi,DWORD [24+esp]
+ xor esi,edi
+ vpsrld xmm6,xmm4,7
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [16+esp],ecx
+ vpaddd xmm0,xmm0,xmm7
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ vpsrld xmm7,xmm4,3
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [4+esp]
+ vpslld xmm5,xmm4,14
+ mov esi,eax
+ shrd ecx,ecx,9
+ mov DWORD [esp],eax
+ vpxor xmm4,xmm7,xmm6
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [28+esp]
+ vpshufd xmm7,xmm3,250
+ shrd ecx,ecx,11
+ and ebx,eax
+ xor ecx,esi
+ vpsrld xmm6,xmm6,11
+ add edx,DWORD [32+esp]
+ xor ebx,edi
+ shrd ecx,ecx,2
+ vpxor xmm4,xmm4,xmm5
+ add ebx,edx
+ add edx,DWORD [12+esp]
+ add ebx,ecx
+ vpslld xmm5,xmm5,11
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [16+esp]
+ vpxor xmm4,xmm4,xmm6
+ xor edx,ecx
+ mov edi,DWORD [20+esp]
+ xor esi,edi
+ vpsrld xmm6,xmm7,10
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [12+esp],ecx
+ vpxor xmm4,xmm4,xmm5
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ vpsrlq xmm5,xmm7,17
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [esp]
+ vpaddd xmm0,xmm0,xmm4
+ mov esi,ebx
+ shrd ecx,ecx,9
+ mov DWORD [28+esp],ebx
+ vpxor xmm6,xmm6,xmm5
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [24+esp]
+ vpsrlq xmm7,xmm7,19
+ shrd ecx,ecx,11
+ and eax,ebx
+ xor ecx,esi
+ vpxor xmm6,xmm6,xmm7
+ add edx,DWORD [36+esp]
+ xor eax,edi
+ shrd ecx,ecx,2
+ vpshufd xmm7,xmm6,132
+ add eax,edx
+ add edx,DWORD [8+esp]
+ add eax,ecx
+ vpsrldq xmm7,xmm7,8
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [12+esp]
+ vpaddd xmm0,xmm0,xmm7
+ xor edx,ecx
+ mov edi,DWORD [16+esp]
+ xor esi,edi
+ vpshufd xmm7,xmm0,80
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [8+esp],ecx
+ vpsrld xmm6,xmm7,10
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ vpsrlq xmm5,xmm7,17
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [28+esp]
+ vpxor xmm6,xmm6,xmm5
+ mov esi,eax
+ shrd ecx,ecx,9
+ mov DWORD [24+esp],eax
+ vpsrlq xmm7,xmm7,19
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [20+esp]
+ vpxor xmm6,xmm6,xmm7
+ shrd ecx,ecx,11
+ and ebx,eax
+ xor ecx,esi
+ vpshufd xmm7,xmm6,232
+ add edx,DWORD [40+esp]
+ xor ebx,edi
+ shrd ecx,ecx,2
+ vpslldq xmm7,xmm7,8
+ add ebx,edx
+ add edx,DWORD [4+esp]
+ add ebx,ecx
+ vpaddd xmm0,xmm0,xmm7
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [8+esp]
+ vpaddd xmm6,xmm0,[ebp]
+ xor edx,ecx
+ mov edi,DWORD [12+esp]
+ xor esi,edi
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [4+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [24+esp]
+ mov esi,ebx
+ shrd ecx,ecx,9
+ mov DWORD [20+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [16+esp]
+ shrd ecx,ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [44+esp]
+ xor eax,edi
+ shrd ecx,ecx,2
+ add eax,edx
+ add edx,DWORD [esp]
+ add eax,ecx
+ vmovdqa [32+esp],xmm6
+ vpalignr xmm4,xmm2,xmm1,4
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [4+esp]
+ vpalignr xmm7,xmm0,xmm3,4
+ xor edx,ecx
+ mov edi,DWORD [8+esp]
+ xor esi,edi
+ vpsrld xmm6,xmm4,7
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [esp],ecx
+ vpaddd xmm1,xmm1,xmm7
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ vpsrld xmm7,xmm4,3
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [20+esp]
+ vpslld xmm5,xmm4,14
+ mov esi,eax
+ shrd ecx,ecx,9
+ mov DWORD [16+esp],eax
+ vpxor xmm4,xmm7,xmm6
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [12+esp]
+ vpshufd xmm7,xmm0,250
+ shrd ecx,ecx,11
+ and ebx,eax
+ xor ecx,esi
+ vpsrld xmm6,xmm6,11
+ add edx,DWORD [48+esp]
+ xor ebx,edi
+ shrd ecx,ecx,2
+ vpxor xmm4,xmm4,xmm5
+ add ebx,edx
+ add edx,DWORD [28+esp]
+ add ebx,ecx
+ vpslld xmm5,xmm5,11
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [esp]
+ vpxor xmm4,xmm4,xmm6
+ xor edx,ecx
+ mov edi,DWORD [4+esp]
+ xor esi,edi
+ vpsrld xmm6,xmm7,10
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [28+esp],ecx
+ vpxor xmm4,xmm4,xmm5
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ vpsrlq xmm5,xmm7,17
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [16+esp]
+ vpaddd xmm1,xmm1,xmm4
+ mov esi,ebx
+ shrd ecx,ecx,9
+ mov DWORD [12+esp],ebx
+ vpxor xmm6,xmm6,xmm5
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [8+esp]
+ vpsrlq xmm7,xmm7,19
+ shrd ecx,ecx,11
+ and eax,ebx
+ xor ecx,esi
+ vpxor xmm6,xmm6,xmm7
+ add edx,DWORD [52+esp]
+ xor eax,edi
+ shrd ecx,ecx,2
+ vpshufd xmm7,xmm6,132
+ add eax,edx
+ add edx,DWORD [24+esp]
+ add eax,ecx
+ vpsrldq xmm7,xmm7,8
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [28+esp]
+ vpaddd xmm1,xmm1,xmm7
+ xor edx,ecx
+ mov edi,DWORD [esp]
+ xor esi,edi
+ vpshufd xmm7,xmm1,80
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [24+esp],ecx
+ vpsrld xmm6,xmm7,10
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ vpsrlq xmm5,xmm7,17
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [12+esp]
+ vpxor xmm6,xmm6,xmm5
+ mov esi,eax
+ shrd ecx,ecx,9
+ mov DWORD [8+esp],eax
+ vpsrlq xmm7,xmm7,19
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [4+esp]
+ vpxor xmm6,xmm6,xmm7
+ shrd ecx,ecx,11
+ and ebx,eax
+ xor ecx,esi
+ vpshufd xmm7,xmm6,232
+ add edx,DWORD [56+esp]
+ xor ebx,edi
+ shrd ecx,ecx,2
+ vpslldq xmm7,xmm7,8
+ add ebx,edx
+ add edx,DWORD [20+esp]
+ add ebx,ecx
+ vpaddd xmm1,xmm1,xmm7
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [24+esp]
+ vpaddd xmm6,xmm1,[16+ebp]
+ xor edx,ecx
+ mov edi,DWORD [28+esp]
+ xor esi,edi
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [20+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [8+esp]
+ mov esi,ebx
+ shrd ecx,ecx,9
+ mov DWORD [4+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [esp]
+ shrd ecx,ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [60+esp]
+ xor eax,edi
+ shrd ecx,ecx,2
+ add eax,edx
+ add edx,DWORD [16+esp]
+ add eax,ecx
+ vmovdqa [48+esp],xmm6
+ vpalignr xmm4,xmm3,xmm2,4
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [20+esp]
+ vpalignr xmm7,xmm1,xmm0,4
+ xor edx,ecx
+ mov edi,DWORD [24+esp]
+ xor esi,edi
+ vpsrld xmm6,xmm4,7
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [16+esp],ecx
+ vpaddd xmm2,xmm2,xmm7
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ vpsrld xmm7,xmm4,3
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [4+esp]
+ vpslld xmm5,xmm4,14
+ mov esi,eax
+ shrd ecx,ecx,9
+ mov DWORD [esp],eax
+ vpxor xmm4,xmm7,xmm6
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [28+esp]
+ vpshufd xmm7,xmm1,250
+ shrd ecx,ecx,11
+ and ebx,eax
+ xor ecx,esi
+ vpsrld xmm6,xmm6,11
+ add edx,DWORD [64+esp]
+ xor ebx,edi
+ shrd ecx,ecx,2
+ vpxor xmm4,xmm4,xmm5
+ add ebx,edx
+ add edx,DWORD [12+esp]
+ add ebx,ecx
+ vpslld xmm5,xmm5,11
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [16+esp]
+ vpxor xmm4,xmm4,xmm6
+ xor edx,ecx
+ mov edi,DWORD [20+esp]
+ xor esi,edi
+ vpsrld xmm6,xmm7,10
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [12+esp],ecx
+ vpxor xmm4,xmm4,xmm5
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ vpsrlq xmm5,xmm7,17
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [esp]
+ vpaddd xmm2,xmm2,xmm4
+ mov esi,ebx
+ shrd ecx,ecx,9
+ mov DWORD [28+esp],ebx
+ vpxor xmm6,xmm6,xmm5
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [24+esp]
+ vpsrlq xmm7,xmm7,19
+ shrd ecx,ecx,11
+ and eax,ebx
+ xor ecx,esi
+ vpxor xmm6,xmm6,xmm7
+ add edx,DWORD [68+esp]
+ xor eax,edi
+ shrd ecx,ecx,2
+ vpshufd xmm7,xmm6,132
+ add eax,edx
+ add edx,DWORD [8+esp]
+ add eax,ecx
+ vpsrldq xmm7,xmm7,8
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [12+esp]
+ vpaddd xmm2,xmm2,xmm7
+ xor edx,ecx
+ mov edi,DWORD [16+esp]
+ xor esi,edi
+ vpshufd xmm7,xmm2,80
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [8+esp],ecx
+ vpsrld xmm6,xmm7,10
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ vpsrlq xmm5,xmm7,17
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [28+esp]
+ vpxor xmm6,xmm6,xmm5
+ mov esi,eax
+ shrd ecx,ecx,9
+ mov DWORD [24+esp],eax
+ vpsrlq xmm7,xmm7,19
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [20+esp]
+ vpxor xmm6,xmm6,xmm7
+ shrd ecx,ecx,11
+ and ebx,eax
+ xor ecx,esi
+ vpshufd xmm7,xmm6,232
+ add edx,DWORD [72+esp]
+ xor ebx,edi
+ shrd ecx,ecx,2
+ vpslldq xmm7,xmm7,8
+ add ebx,edx
+ add edx,DWORD [4+esp]
+ add ebx,ecx
+ vpaddd xmm2,xmm2,xmm7
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [8+esp]
+ vpaddd xmm6,xmm2,[32+ebp]
+ xor edx,ecx
+ mov edi,DWORD [12+esp]
+ xor esi,edi
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [4+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [24+esp]
+ mov esi,ebx
+ shrd ecx,ecx,9
+ mov DWORD [20+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [16+esp]
+ shrd ecx,ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [76+esp]
+ xor eax,edi
+ shrd ecx,ecx,2
+ add eax,edx
+ add edx,DWORD [esp]
+ add eax,ecx
+ vmovdqa [64+esp],xmm6
+ vpalignr xmm4,xmm0,xmm3,4
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [4+esp]
+ vpalignr xmm7,xmm2,xmm1,4
+ xor edx,ecx
+ mov edi,DWORD [8+esp]
+ xor esi,edi
+ vpsrld xmm6,xmm4,7
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [esp],ecx
+ vpaddd xmm3,xmm3,xmm7
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ vpsrld xmm7,xmm4,3
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [20+esp]
+ vpslld xmm5,xmm4,14
+ mov esi,eax
+ shrd ecx,ecx,9
+ mov DWORD [16+esp],eax
+ vpxor xmm4,xmm7,xmm6
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [12+esp]
+ vpshufd xmm7,xmm2,250
+ shrd ecx,ecx,11
+ and ebx,eax
+ xor ecx,esi
+ vpsrld xmm6,xmm6,11
+ add edx,DWORD [80+esp]
+ xor ebx,edi
+ shrd ecx,ecx,2
+ vpxor xmm4,xmm4,xmm5
+ add ebx,edx
+ add edx,DWORD [28+esp]
+ add ebx,ecx
+ vpslld xmm5,xmm5,11
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [esp]
+ vpxor xmm4,xmm4,xmm6
+ xor edx,ecx
+ mov edi,DWORD [4+esp]
+ xor esi,edi
+ vpsrld xmm6,xmm7,10
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [28+esp],ecx
+ vpxor xmm4,xmm4,xmm5
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ vpsrlq xmm5,xmm7,17
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [16+esp]
+ vpaddd xmm3,xmm3,xmm4
+ mov esi,ebx
+ shrd ecx,ecx,9
+ mov DWORD [12+esp],ebx
+ vpxor xmm6,xmm6,xmm5
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [8+esp]
+ vpsrlq xmm7,xmm7,19
+ shrd ecx,ecx,11
+ and eax,ebx
+ xor ecx,esi
+ vpxor xmm6,xmm6,xmm7
+ add edx,DWORD [84+esp]
+ xor eax,edi
+ shrd ecx,ecx,2
+ vpshufd xmm7,xmm6,132
+ add eax,edx
+ add edx,DWORD [24+esp]
+ add eax,ecx
+ vpsrldq xmm7,xmm7,8
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [28+esp]
+ vpaddd xmm3,xmm3,xmm7
+ xor edx,ecx
+ mov edi,DWORD [esp]
+ xor esi,edi
+ vpshufd xmm7,xmm3,80
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [24+esp],ecx
+ vpsrld xmm6,xmm7,10
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ vpsrlq xmm5,xmm7,17
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [12+esp]
+ vpxor xmm6,xmm6,xmm5
+ mov esi,eax
+ shrd ecx,ecx,9
+ mov DWORD [8+esp],eax
+ vpsrlq xmm7,xmm7,19
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [4+esp]
+ vpxor xmm6,xmm6,xmm7
+ shrd ecx,ecx,11
+ and ebx,eax
+ xor ecx,esi
+ vpshufd xmm7,xmm6,232
+ add edx,DWORD [88+esp]
+ xor ebx,edi
+ shrd ecx,ecx,2
+ vpslldq xmm7,xmm7,8
+ add ebx,edx
+ add edx,DWORD [20+esp]
+ add ebx,ecx
+ vpaddd xmm3,xmm3,xmm7
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [24+esp]
+ vpaddd xmm6,xmm3,[48+ebp]
+ xor edx,ecx
+ mov edi,DWORD [28+esp]
+ xor esi,edi
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [20+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [8+esp]
+ mov esi,ebx
+ shrd ecx,ecx,9
+ mov DWORD [4+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [esp]
+ shrd ecx,ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [92+esp]
+ xor eax,edi
+ shrd ecx,ecx,2
+ add eax,edx
+ add edx,DWORD [16+esp]
+ add eax,ecx
+ vmovdqa [80+esp],xmm6
+ cmp DWORD [64+ebp],66051
+ jne NEAR L$012avx_00_47
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [20+esp]
+ xor edx,ecx
+ mov edi,DWORD [24+esp]
+ xor esi,edi
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [16+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [4+esp]
+ mov esi,eax
+ shrd ecx,ecx,9
+ mov DWORD [esp],eax
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [28+esp]
+ shrd ecx,ecx,11
+ and ebx,eax
+ xor ecx,esi
+ add edx,DWORD [32+esp]
+ xor ebx,edi
+ shrd ecx,ecx,2
+ add ebx,edx
+ add edx,DWORD [12+esp]
+ add ebx,ecx
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [16+esp]
+ xor edx,ecx
+ mov edi,DWORD [20+esp]
+ xor esi,edi
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [12+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [esp]
+ mov esi,ebx
+ shrd ecx,ecx,9
+ mov DWORD [28+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [24+esp]
+ shrd ecx,ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [36+esp]
+ xor eax,edi
+ shrd ecx,ecx,2
+ add eax,edx
+ add edx,DWORD [8+esp]
+ add eax,ecx
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [12+esp]
+ xor edx,ecx
+ mov edi,DWORD [16+esp]
+ xor esi,edi
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [8+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [28+esp]
+ mov esi,eax
+ shrd ecx,ecx,9
+ mov DWORD [24+esp],eax
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [20+esp]
+ shrd ecx,ecx,11
+ and ebx,eax
+ xor ecx,esi
+ add edx,DWORD [40+esp]
+ xor ebx,edi
+ shrd ecx,ecx,2
+ add ebx,edx
+ add edx,DWORD [4+esp]
+ add ebx,ecx
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [8+esp]
+ xor edx,ecx
+ mov edi,DWORD [12+esp]
+ xor esi,edi
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [4+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [24+esp]
+ mov esi,ebx
+ shrd ecx,ecx,9
+ mov DWORD [20+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [16+esp]
+ shrd ecx,ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [44+esp]
+ xor eax,edi
+ shrd ecx,ecx,2
+ add eax,edx
+ add edx,DWORD [esp]
+ add eax,ecx
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [4+esp]
+ xor edx,ecx
+ mov edi,DWORD [8+esp]
+ xor esi,edi
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [20+esp]
+ mov esi,eax
+ shrd ecx,ecx,9
+ mov DWORD [16+esp],eax
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [12+esp]
+ shrd ecx,ecx,11
+ and ebx,eax
+ xor ecx,esi
+ add edx,DWORD [48+esp]
+ xor ebx,edi
+ shrd ecx,ecx,2
+ add ebx,edx
+ add edx,DWORD [28+esp]
+ add ebx,ecx
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [esp]
+ xor edx,ecx
+ mov edi,DWORD [4+esp]
+ xor esi,edi
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [28+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [16+esp]
+ mov esi,ebx
+ shrd ecx,ecx,9
+ mov DWORD [12+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [8+esp]
+ shrd ecx,ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [52+esp]
+ xor eax,edi
+ shrd ecx,ecx,2
+ add eax,edx
+ add edx,DWORD [24+esp]
+ add eax,ecx
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [28+esp]
+ xor edx,ecx
+ mov edi,DWORD [esp]
+ xor esi,edi
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [24+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [12+esp]
+ mov esi,eax
+ shrd ecx,ecx,9
+ mov DWORD [8+esp],eax
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [4+esp]
+ shrd ecx,ecx,11
+ and ebx,eax
+ xor ecx,esi
+ add edx,DWORD [56+esp]
+ xor ebx,edi
+ shrd ecx,ecx,2
+ add ebx,edx
+ add edx,DWORD [20+esp]
+ add ebx,ecx
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [24+esp]
+ xor edx,ecx
+ mov edi,DWORD [28+esp]
+ xor esi,edi
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [20+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [8+esp]
+ mov esi,ebx
+ shrd ecx,ecx,9
+ mov DWORD [4+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [esp]
+ shrd ecx,ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [60+esp]
+ xor eax,edi
+ shrd ecx,ecx,2
+ add eax,edx
+ add edx,DWORD [16+esp]
+ add eax,ecx
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [20+esp]
+ xor edx,ecx
+ mov edi,DWORD [24+esp]
+ xor esi,edi
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [16+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [4+esp]
+ mov esi,eax
+ shrd ecx,ecx,9
+ mov DWORD [esp],eax
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [28+esp]
+ shrd ecx,ecx,11
+ and ebx,eax
+ xor ecx,esi
+ add edx,DWORD [64+esp]
+ xor ebx,edi
+ shrd ecx,ecx,2
+ add ebx,edx
+ add edx,DWORD [12+esp]
+ add ebx,ecx
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [16+esp]
+ xor edx,ecx
+ mov edi,DWORD [20+esp]
+ xor esi,edi
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [12+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [esp]
+ mov esi,ebx
+ shrd ecx,ecx,9
+ mov DWORD [28+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [24+esp]
+ shrd ecx,ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [68+esp]
+ xor eax,edi
+ shrd ecx,ecx,2
+ add eax,edx
+ add edx,DWORD [8+esp]
+ add eax,ecx
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [12+esp]
+ xor edx,ecx
+ mov edi,DWORD [16+esp]
+ xor esi,edi
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [8+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [28+esp]
+ mov esi,eax
+ shrd ecx,ecx,9
+ mov DWORD [24+esp],eax
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [20+esp]
+ shrd ecx,ecx,11
+ and ebx,eax
+ xor ecx,esi
+ add edx,DWORD [72+esp]
+ xor ebx,edi
+ shrd ecx,ecx,2
+ add ebx,edx
+ add edx,DWORD [4+esp]
+ add ebx,ecx
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [8+esp]
+ xor edx,ecx
+ mov edi,DWORD [12+esp]
+ xor esi,edi
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [4+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [24+esp]
+ mov esi,ebx
+ shrd ecx,ecx,9
+ mov DWORD [20+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [16+esp]
+ shrd ecx,ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [76+esp]
+ xor eax,edi
+ shrd ecx,ecx,2
+ add eax,edx
+ add edx,DWORD [esp]
+ add eax,ecx
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [4+esp]
+ xor edx,ecx
+ mov edi,DWORD [8+esp]
+ xor esi,edi
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [20+esp]
+ mov esi,eax
+ shrd ecx,ecx,9
+ mov DWORD [16+esp],eax
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [12+esp]
+ shrd ecx,ecx,11
+ and ebx,eax
+ xor ecx,esi
+ add edx,DWORD [80+esp]
+ xor ebx,edi
+ shrd ecx,ecx,2
+ add ebx,edx
+ add edx,DWORD [28+esp]
+ add ebx,ecx
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [esp]
+ xor edx,ecx
+ mov edi,DWORD [4+esp]
+ xor esi,edi
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [28+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [16+esp]
+ mov esi,ebx
+ shrd ecx,ecx,9
+ mov DWORD [12+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [8+esp]
+ shrd ecx,ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [84+esp]
+ xor eax,edi
+ shrd ecx,ecx,2
+ add eax,edx
+ add edx,DWORD [24+esp]
+ add eax,ecx
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [28+esp]
+ xor edx,ecx
+ mov edi,DWORD [esp]
+ xor esi,edi
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [24+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ mov ecx,eax
+ add edx,edi
+ mov edi,DWORD [12+esp]
+ mov esi,eax
+ shrd ecx,ecx,9
+ mov DWORD [8+esp],eax
+ xor ecx,eax
+ xor eax,edi
+ add edx,DWORD [4+esp]
+ shrd ecx,ecx,11
+ and ebx,eax
+ xor ecx,esi
+ add edx,DWORD [88+esp]
+ xor ebx,edi
+ shrd ecx,ecx,2
+ add ebx,edx
+ add edx,DWORD [20+esp]
+ add ebx,ecx
+ mov ecx,edx
+ shrd edx,edx,14
+ mov esi,DWORD [24+esp]
+ xor edx,ecx
+ mov edi,DWORD [28+esp]
+ xor esi,edi
+ shrd edx,edx,5
+ and esi,ecx
+ mov DWORD [20+esp],ecx
+ xor edx,ecx
+ xor edi,esi
+ shrd edx,edx,6
+ mov ecx,ebx
+ add edx,edi
+ mov edi,DWORD [8+esp]
+ mov esi,ebx
+ shrd ecx,ecx,9
+ mov DWORD [4+esp],ebx
+ xor ecx,ebx
+ xor ebx,edi
+ add edx,DWORD [esp]
+ shrd ecx,ecx,11
+ and eax,ebx
+ xor ecx,esi
+ add edx,DWORD [92+esp]
+ xor eax,edi
+ shrd ecx,ecx,2
+ add eax,edx
+ add edx,DWORD [16+esp]
+ add eax,ecx
+ mov esi,DWORD [96+esp]
+ xor ebx,edi
+ mov ecx,DWORD [12+esp]
+ add eax,DWORD [esi]
+ add ebx,DWORD [4+esi]
+ add edi,DWORD [8+esi]
+ add ecx,DWORD [12+esi]
+ mov DWORD [esi],eax
+ mov DWORD [4+esi],ebx
+ mov DWORD [8+esi],edi
+ mov DWORD [12+esi],ecx
+ mov DWORD [4+esp],ebx
+ xor ebx,edi
+ mov DWORD [8+esp],edi
+ mov DWORD [12+esp],ecx
+ mov edi,DWORD [20+esp]
+ mov ecx,DWORD [24+esp]
+ add edx,DWORD [16+esi]
+ add edi,DWORD [20+esi]
+ add ecx,DWORD [24+esi]
+ mov DWORD [16+esi],edx
+ mov DWORD [20+esi],edi
+ mov DWORD [20+esp],edi
+ mov edi,DWORD [28+esp]
+ mov DWORD [24+esi],ecx
+ add edi,DWORD [28+esi]
+ mov DWORD [24+esp],ecx
+ mov DWORD [28+esi],edi
+ mov DWORD [28+esp],edi
+ mov edi,DWORD [100+esp]
+ vmovdqa xmm7,[64+ebp]
+ sub ebp,192
+ cmp edi,DWORD [104+esp]
+ jb NEAR L$011grand_avx
+ mov esp,DWORD [108+esp]
+ vzeroall
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/sha256-armv4-linux.S b/gen/bcm/sha256-armv4-linux.S
new file mode 100644
index 0000000..fca0681
--- /dev/null
+++ b/gen/bcm/sha256-armv4-linux.S
@@ -0,0 +1,2839 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+@
+@ Licensed under the OpenSSL license (the "License"). You may not use
+@ this file except in compliance with the License. You can obtain a copy
+@ in the file LICENSE in the source distribution or at
+@ https://www.openssl.org/source/license.html
+
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA256 block procedure for ARMv4. May 2007.
+
+@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
+@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+@ byte [on single-issue Xscale PXA250 core].
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
+@ Cortex A8 core and ~20 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+@ September 2013.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process one
+@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+@ code (meaning that latter performs sub-optimally, nothing was done
+@ about it).
+
+@ May 2014.
+@
+@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+#else
+# define __ARM_ARCH __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
+@ instructions are manually-encoded. (See unsha256.)
+.arch armv7-a
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+.type K256,%object
+.align 5
+K256:
+.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size K256,.-K256
+.word 0 @ terminator
+.align 5
+
+.globl sha256_block_data_order_nohw
+.hidden sha256_block_data_order_nohw
+.type sha256_block_data_order_nohw,%function
+sha256_block_data_order_nohw:
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+ stmdb sp!,{r0,r1,r2,r4-r11,lr}
+ ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+ adr r14,K256
+ sub sp,sp,#16*4 @ alloca(X[16])
+.Loop:
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ magic
+ eor r12,r12,r12
+#if __ARM_ARCH>=7
+ @ ldr r2,[r1],#4 @ 0
+# if 0==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 0
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 0==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#0*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 0==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 0<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#2*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#15*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+ @ ldr r2,[r1],#4 @ 1
+# if 1==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 1
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 1==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#1*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 1==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 1<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#3*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#0*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+ @ ldr r2,[r1],#4 @ 2
+# if 2==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 2
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 2==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#2*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 2==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 2<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#4*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#1*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+ @ ldr r2,[r1],#4 @ 3
+# if 3==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 3
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 3==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#3*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 3==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 3<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#5*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#2*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+ @ ldr r2,[r1],#4 @ 4
+# if 4==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 4
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 4==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#4*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 4==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 4<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#6*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#3*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+ @ ldr r2,[r1],#4 @ 5
+# if 5==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 5==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#5*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 5==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 5<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#7*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#4*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+ @ ldr r2,[r1],#4 @ 6
+# if 6==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 6
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 6==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#6*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 6==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 6<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#8*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#5*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+ @ ldr r2,[r1],#4 @ 7
+# if 7==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 7==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#7*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 7==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 7<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#9*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#6*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+ @ ldr r2,[r1],#4 @ 8
+# if 8==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 8
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 8==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#8*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 8==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 8<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#10*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#7*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+ @ ldr r2,[r1],#4 @ 9
+# if 9==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 9
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 9==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#9*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 9==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 9<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#11*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#8*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+ @ ldr r2,[r1],#4 @ 10
+# if 10==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 10
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 10==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#10*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 10==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 10<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#12*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#9*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+ @ ldr r2,[r1],#4 @ 11
+# if 11==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 11
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 11==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#11*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 11==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 11<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#13*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#10*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+ @ ldr r2,[r1],#4 @ 12
+# if 12==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 12
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 12==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#12*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 12==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 12<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#14*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#11*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+ @ ldr r2,[r1],#4 @ 13
+# if 13==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 13
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 13==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#13*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 13==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 13<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#15*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#12*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+ @ ldr r2,[r1],#4 @ 14
+# if 14==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 14
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 14==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#14*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 14==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 14<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#0*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#13*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+ @ ldr r2,[r1],#4 @ 15
+# if 15==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 15
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 15==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#15*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 15==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 15<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#1*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#14*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+.Lrounds_16_xx:
+ @ ldr r2,[sp,#1*4] @ 16
+ @ ldr r1,[sp,#14*4]
+ mov r0,r2,ror#7
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#0*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#9*4]
+
+ add r12,r12,r0
+ eor r0,r8,r8,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#0*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 16==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 16<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#2*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#15*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#2*4] @ 17
+ @ ldr r1,[sp,#15*4]
+ mov r0,r2,ror#7
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#1*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#10*4]
+
+ add r3,r3,r0
+ eor r0,r7,r7,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#1*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 17==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 17<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#3*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#0*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#3*4] @ 18
+ @ ldr r1,[sp,#0*4]
+ mov r0,r2,ror#7
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#2*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#11*4]
+
+ add r12,r12,r0
+ eor r0,r6,r6,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#2*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 18==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 18<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#4*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#1*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#4*4] @ 19
+ @ ldr r1,[sp,#1*4]
+ mov r0,r2,ror#7
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#3*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#12*4]
+
+ add r3,r3,r0
+ eor r0,r5,r5,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#3*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 19==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 19<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#5*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#2*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#5*4] @ 20
+ @ ldr r1,[sp,#2*4]
+ mov r0,r2,ror#7
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#4*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#13*4]
+
+ add r12,r12,r0
+ eor r0,r4,r4,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#4*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 20==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 20<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#6*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#3*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#6*4] @ 21
+ @ ldr r1,[sp,#3*4]
+ mov r0,r2,ror#7
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#5*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#14*4]
+
+ add r3,r3,r0
+ eor r0,r11,r11,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#5*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 21==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 21<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#7*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#4*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#7*4] @ 22
+ @ ldr r1,[sp,#4*4]
+ mov r0,r2,ror#7
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#6*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#15*4]
+
+ add r12,r12,r0
+ eor r0,r10,r10,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#6*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 22==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 22<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#8*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#5*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#8*4] @ 23
+ @ ldr r1,[sp,#5*4]
+ mov r0,r2,ror#7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#7*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#0*4]
+
+ add r3,r3,r0
+ eor r0,r9,r9,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#7*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 23==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 23<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#9*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#6*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#9*4] @ 24
+ @ ldr r1,[sp,#6*4]
+ mov r0,r2,ror#7
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#8*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#1*4]
+
+ add r12,r12,r0
+ eor r0,r8,r8,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#8*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 24==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 24<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#10*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#7*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#10*4] @ 25
+ @ ldr r1,[sp,#7*4]
+ mov r0,r2,ror#7
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#9*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#2*4]
+
+ add r3,r3,r0
+ eor r0,r7,r7,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#9*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 25==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 25<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#11*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#8*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#11*4] @ 26
+ @ ldr r1,[sp,#8*4]
+ mov r0,r2,ror#7
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#10*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#3*4]
+
+ add r12,r12,r0
+ eor r0,r6,r6,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#10*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 26==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 26<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#12*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#9*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#12*4] @ 27
+ @ ldr r1,[sp,#9*4]
+ mov r0,r2,ror#7
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#11*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#4*4]
+
+ add r3,r3,r0
+ eor r0,r5,r5,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#11*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 27==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 27<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#13*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#10*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#13*4] @ 28
+ @ ldr r1,[sp,#10*4]
+ mov r0,r2,ror#7
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#12*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#5*4]
+
+ add r12,r12,r0
+ eor r0,r4,r4,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#12*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 28==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 28<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#14*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#11*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#14*4] @ 29
+ @ ldr r1,[sp,#11*4]
+ mov r0,r2,ror#7
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#13*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#6*4]
+
+ add r3,r3,r0
+ eor r0,r11,r11,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#13*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 29==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 29<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#15*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#12*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#15*4] @ 30
+ @ ldr r1,[sp,#12*4]
+ mov r0,r2,ror#7
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#14*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#7*4]
+
+ add r12,r12,r0
+ eor r0,r10,r10,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#14*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 30==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 30<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#0*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#13*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#0*4] @ 31
+ @ ldr r1,[sp,#13*4]
+ mov r0,r2,ror#7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#15*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#8*4]
+
+ add r3,r3,r0
+ eor r0,r9,r9,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#15*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 31==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 31<15
+# if __ARM_ARCH>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#1*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#14*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq r3,[sp,#16*4] @ pull ctx
+ bne .Lrounds_16_xx
+
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldr r0,[r3,#0]
+ ldr r2,[r3,#4]
+ ldr r12,[r3,#8]
+ add r4,r4,r0
+ ldr r0,[r3,#12]
+ add r5,r5,r2
+ ldr r2,[r3,#16]
+ add r6,r6,r12
+ ldr r12,[r3,#20]
+ add r7,r7,r0
+ ldr r0,[r3,#24]
+ add r8,r8,r2
+ ldr r2,[r3,#28]
+ add r9,r9,r12
+ ldr r1,[sp,#17*4] @ pull inp
+ ldr r12,[sp,#18*4] @ pull inp+len
+ add r10,r10,r0
+ add r11,r11,r2
+ stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11}
+ cmp r1,r12
+ sub r14,r14,#256 @ rewind Ktbl
+ bne .Loop
+
+ add sp,sp,#19*4 @ destroy frame
+#if __ARM_ARCH>=5
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+#else
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.LK256_shortcut_neon:
+@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
+#if defined(__thumb2__)
+.word K256-(.LK256_add_neon+4)
+#else
+.word K256-(.LK256_add_neon+8)
+#endif
+
+.globl sha256_block_data_order_neon
+.hidden sha256_block_data_order_neon
+.type sha256_block_data_order_neon,%function
+.align 5
+.skip 16
+sha256_block_data_order_neon:
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+
+ sub r11,sp,#16*4+16
+
+ @ K256 is just at the boundary of being easily referenced by an ADR from
+ @ this function. In Arm mode, when building with __ARM_ARCH=6, it does
+ @ not fit. By moving code around, we could make it fit, but this is too
+ @ fragile. For simplicity, just load the offset from
+ @ .LK256_shortcut_neon.
+ @
+ @ TODO(davidben): adrl would avoid a load, but clang-assembler does not
+ @ support it. We might be able to emulate it with a macro, but Android's
+ @ did not work when I tried it.
+ @ https://android.googlesource.com/platform/ndk/+/refs/heads/master/docs/ClangMigration.md#arm
+ ldr r14,.LK256_shortcut_neon
+.LK256_add_neon:
+ add r14,pc,r14
+
+ bic r11,r11,#15 @ align for 128-bit stores
+ mov r12,sp
+ mov sp,r11 @ alloca
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+
+ vld1.8 {q0},[r1]!
+ vld1.8 {q1},[r1]!
+ vld1.8 {q2},[r1]!
+ vld1.8 {q3},[r1]!
+ vld1.32 {q8},[r14,:128]!
+ vld1.32 {q9},[r14,:128]!
+ vld1.32 {q10},[r14,:128]!
+ vld1.32 {q11},[r14,:128]!
+ vrev32.8 q0,q0 @ yes, even on
+ str r0,[sp,#64]
+ vrev32.8 q1,q1 @ big-endian
+ str r1,[sp,#68]
+ mov r1,sp
+ vrev32.8 q2,q2
+ str r2,[sp,#72]
+ vrev32.8 q3,q3
+ str r12,[sp,#76] @ save original sp
+ vadd.i32 q8,q8,q0
+ vadd.i32 q9,q9,q1
+ vst1.32 {q8},[r1,:128]!
+ vadd.i32 q10,q10,q2
+ vst1.32 {q9},[r1,:128]!
+ vadd.i32 q11,q11,q3
+ vst1.32 {q10},[r1,:128]!
+ vst1.32 {q11},[r1,:128]!
+
+ ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+ sub r1,r1,#64
+ ldr r2,[sp,#0]
+ eor r12,r12,r12
+ eor r3,r5,r6
+ b .L_00_48
+
+.align 4
+.L_00_48:
+ vext.8 q8,q0,q1,#4
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ vext.8 q9,q2,q3,#4
+ add r4,r4,r12
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vadd.i32 q0,q0,q9
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#4]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ veor q9,q9,q10
+ add r10,r10,r2
+ vsli.32 q11,q8,#14
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ vshr.u32 d24,d7,#17
+ add r11,r11,r3
+ and r2,r2,r7
+ veor q9,q9,q11
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ vsli.32 d24,d7,#15
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ vshr.u32 d25,d7,#10
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ vadd.i32 q0,q0,q9
+ add r10,r10,r2
+ ldr r2,[sp,#8]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r6,r6,r10
+ vshr.u32 d24,d7,#19
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ vsli.32 d24,d7,#13
+ add r9,r9,r2
+ eor r2,r7,r8
+ veor d25,d25,d24
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ vadd.i32 d0,d0,d25
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ vshr.u32 d24,d0,#17
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ vsli.32 d24,d0,#15
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ vshr.u32 d25,d0,#10
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#12]
+ and r3,r3,r12
+ vshr.u32 d24,d0,#19
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ vld1.32 {q8},[r14,:128]!
+ add r8,r8,r2
+ vsli.32 d24,d0,#13
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ veor d25,d25,d24
+ add r9,r9,r3
+ and r2,r2,r5
+ vadd.i32 d1,d1,d25
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ vadd.i32 q8,q8,q0
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#16]
+ and r12,r12,r3
+ add r4,r4,r8
+ vst1.32 {q8},[r1,:128]!
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vext.8 q8,q1,q2,#4
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ vext.8 q9,q3,q0,#4
+ add r8,r8,r12
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vadd.i32 q1,q1,q9
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#20]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ veor q9,q9,q10
+ add r6,r6,r2
+ vsli.32 q11,q8,#14
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ vshr.u32 d24,d1,#17
+ add r7,r7,r3
+ and r2,r2,r11
+ veor q9,q9,q11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ vsli.32 d24,d1,#15
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ vshr.u32 d25,d1,#10
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ vadd.i32 q1,q1,q9
+ add r6,r6,r2
+ ldr r2,[sp,#24]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r10,r10,r6
+ vshr.u32 d24,d1,#19
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ vsli.32 d24,d1,#13
+ add r5,r5,r2
+ eor r2,r11,r4
+ veor d25,d25,d24
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ vadd.i32 d2,d2,d25
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ vshr.u32 d24,d2,#17
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ vsli.32 d24,d2,#15
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ vshr.u32 d25,d2,#10
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#28]
+ and r3,r3,r12
+ vshr.u32 d24,d2,#19
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ vld1.32 {q8},[r14,:128]!
+ add r4,r4,r2
+ vsli.32 d24,d2,#13
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ veor d25,d25,d24
+ add r5,r5,r3
+ and r2,r2,r9
+ vadd.i32 d3,d3,d25
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ vadd.i32 q8,q8,q1
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#32]
+ and r12,r12,r3
+ add r8,r8,r4
+ vst1.32 {q8},[r1,:128]!
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vext.8 q8,q2,q3,#4
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ vext.8 q9,q0,q1,#4
+ add r4,r4,r12
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vadd.i32 q2,q2,q9
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#36]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ veor q9,q9,q10
+ add r10,r10,r2
+ vsli.32 q11,q8,#14
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ vshr.u32 d24,d3,#17
+ add r11,r11,r3
+ and r2,r2,r7
+ veor q9,q9,q11
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ vsli.32 d24,d3,#15
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ vshr.u32 d25,d3,#10
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ vadd.i32 q2,q2,q9
+ add r10,r10,r2
+ ldr r2,[sp,#40]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r6,r6,r10
+ vshr.u32 d24,d3,#19
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ vsli.32 d24,d3,#13
+ add r9,r9,r2
+ eor r2,r7,r8
+ veor d25,d25,d24
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ vadd.i32 d4,d4,d25
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ vshr.u32 d24,d4,#17
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ vsli.32 d24,d4,#15
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ vshr.u32 d25,d4,#10
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#44]
+ and r3,r3,r12
+ vshr.u32 d24,d4,#19
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ vld1.32 {q8},[r14,:128]!
+ add r8,r8,r2
+ vsli.32 d24,d4,#13
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ veor d25,d25,d24
+ add r9,r9,r3
+ and r2,r2,r5
+ vadd.i32 d5,d5,d25
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ vadd.i32 q8,q8,q2
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#48]
+ and r12,r12,r3
+ add r4,r4,r8
+ vst1.32 {q8},[r1,:128]!
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vext.8 q8,q3,q0,#4
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ vext.8 q9,q1,q2,#4
+ add r8,r8,r12
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vadd.i32 q3,q3,q9
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#52]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ veor q9,q9,q10
+ add r6,r6,r2
+ vsli.32 q11,q8,#14
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ vshr.u32 d24,d5,#17
+ add r7,r7,r3
+ and r2,r2,r11
+ veor q9,q9,q11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ vsli.32 d24,d5,#15
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ vshr.u32 d25,d5,#10
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ vadd.i32 q3,q3,q9
+ add r6,r6,r2
+ ldr r2,[sp,#56]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r10,r10,r6
+ vshr.u32 d24,d5,#19
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ vsli.32 d24,d5,#13
+ add r5,r5,r2
+ eor r2,r11,r4
+ veor d25,d25,d24
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ vadd.i32 d6,d6,d25
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ vshr.u32 d24,d6,#17
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ vsli.32 d24,d6,#15
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ vshr.u32 d25,d6,#10
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#60]
+ and r3,r3,r12
+ vshr.u32 d24,d6,#19
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ vld1.32 {q8},[r14,:128]!
+ add r4,r4,r2
+ vsli.32 d24,d6,#13
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ veor d25,d25,d24
+ add r5,r5,r3
+ and r2,r2,r9
+ vadd.i32 d7,d7,d25
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ vadd.i32 q8,q8,q3
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[r14]
+ and r12,r12,r3
+ add r8,r8,r4
+ vst1.32 {q8},[r1,:128]!
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ teq r2,#0 @ check for K256 terminator
+ ldr r2,[sp,#0]
+ sub r1,r1,#64
+ bne .L_00_48
+
+ ldr r1,[sp,#68]
+ ldr r0,[sp,#72]
+ sub r14,r14,#256 @ rewind r14
+ teq r1,r0
+ it eq
+ subeq r1,r1,#64 @ avoid SEGV
+ vld1.8 {q0},[r1]! @ load next input block
+ vld1.8 {q1},[r1]!
+ vld1.8 {q2},[r1]!
+ vld1.8 {q3},[r1]!
+ it ne
+ strne r1,[sp,#68]
+ mov r1,sp
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vrev32.8 q0,q0
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vadd.i32 q8,q8,q0
+ ldr r2,[sp,#4]
+ and r3,r3,r12
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ add r10,r10,r2
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3
+ and r2,r2,r7
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ add r10,r10,r2
+ ldr r2,[sp,#8]
+ and r12,r12,r3
+ add r6,r6,r10
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ add r9,r9,r2
+ eor r2,r7,r8
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ ldr r2,[sp,#12]
+ and r3,r3,r12
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ add r8,r8,r2
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3
+ and r2,r2,r5
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#16]
+ and r12,r12,r3
+ add r4,r4,r8
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vst1.32 {q8},[r1,:128]!
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vrev32.8 q1,q1
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vadd.i32 q8,q8,q1
+ ldr r2,[sp,#20]
+ and r3,r3,r12
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ add r6,r6,r2
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3
+ and r2,r2,r11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ add r6,r6,r2
+ ldr r2,[sp,#24]
+ and r12,r12,r3
+ add r10,r10,r6
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ add r5,r5,r2
+ eor r2,r11,r4
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ ldr r2,[sp,#28]
+ and r3,r3,r12
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ add r4,r4,r2
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3
+ and r2,r2,r9
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#32]
+ and r12,r12,r3
+ add r8,r8,r4
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vst1.32 {q8},[r1,:128]!
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vrev32.8 q2,q2
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vadd.i32 q8,q8,q2
+ ldr r2,[sp,#36]
+ and r3,r3,r12
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ add r10,r10,r2
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3
+ and r2,r2,r7
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ add r10,r10,r2
+ ldr r2,[sp,#40]
+ and r12,r12,r3
+ add r6,r6,r10
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ add r9,r9,r2
+ eor r2,r7,r8
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ ldr r2,[sp,#44]
+ and r3,r3,r12
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ add r8,r8,r2
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3
+ and r2,r2,r5
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#48]
+ and r12,r12,r3
+ add r4,r4,r8
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vst1.32 {q8},[r1,:128]!
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vrev32.8 q3,q3
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vadd.i32 q8,q8,q3
+ ldr r2,[sp,#52]
+ and r3,r3,r12
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ add r6,r6,r2
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3
+ and r2,r2,r11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ add r6,r6,r2
+ ldr r2,[sp,#56]
+ and r12,r12,r3
+ add r10,r10,r6
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ add r5,r5,r2
+ eor r2,r11,r4
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ ldr r2,[sp,#60]
+ and r3,r3,r12
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ add r4,r4,r2
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3
+ and r2,r2,r9
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#64]
+ and r12,r12,r3
+ add r8,r8,r4
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vst1.32 {q8},[r1,:128]!
+ ldr r0,[r2,#0]
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldr r12,[r2,#4]
+ ldr r3,[r2,#8]
+ ldr r1,[r2,#12]
+ add r4,r4,r0 @ accumulate
+ ldr r0,[r2,#16]
+ add r5,r5,r12
+ ldr r12,[r2,#20]
+ add r6,r6,r3
+ ldr r3,[r2,#24]
+ add r7,r7,r1
+ ldr r1,[r2,#28]
+ add r8,r8,r0
+ str r4,[r2],#4
+ add r9,r9,r12
+ str r5,[r2],#4
+ add r10,r10,r3
+ str r6,[r2],#4
+ add r11,r11,r1
+ str r7,[r2],#4
+ stmia r2,{r8,r9,r10,r11}
+
+ ittte ne
+ movne r1,sp
+ ldrne r2,[sp,#0]
+ eorne r12,r12,r12
+ ldreq sp,[sp,#76] @ restore original sp
+ itt ne
+ eorne r3,r5,r6
+ bne .L_00_48
+
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# if defined(__thumb2__)
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
+.LK256_shortcut_hw:
+@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
+#if defined(__thumb2__)
+.word K256-(.LK256_add_hw+4)
+#else
+.word K256-(.LK256_add_hw+8)
+#endif
+
+.globl sha256_block_data_order_hw
+.hidden sha256_block_data_order_hw
+.type sha256_block_data_order_hw,%function
+.align 5
+sha256_block_data_order_hw:
+ @ K256 is too far to reference from one ADR command in Thumb mode. In
+ @ Arm mode, we could make it fit by aligning the ADR offset to a 64-byte
+ @ boundary. For simplicity, just load the offset from .LK256_shortcut_hw.
+ ldr r3,.LK256_shortcut_hw
+.LK256_add_hw:
+ add r3,pc,r3
+
+ vld1.32 {q0,q1},[r0]
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+ b .Loop_v8
+
+.align 4
+.Loop_v8:
+ vld1.8 {q8,q9},[r1]!
+ vld1.8 {q10,q11},[r1]!
+ vld1.32 {q12},[r3]!
+ vrev32.8 q8,q8
+ vrev32.8 q9,q9
+ vrev32.8 q10,q10
+ vrev32.8 q11,q11
+ vmov q14,q0 @ offload
+ vmov q15,q1
+ teq r1,r2
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+
+ vld1.32 {q13},[r3]
+ vadd.i32 q12,q12,q10
+ sub r3,r3,#256-16 @ rewind
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+
+ vadd.i32 q13,q13,q11
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+
+ vadd.i32 q0,q0,q14
+ vadd.i32 q1,q1,q15
+ it ne
+ bne .Loop_v8
+
+ vst1.32 {q0,q1},[r0]
+
+ bx lr @ bx lr
+.size sha256_block_data_order_hw,.-sha256_block_data_order_hw
+#endif
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/bcm/sha256-armv8-apple.S b/gen/bcm/sha256-armv8-apple.S
new file mode 100644
index 0000000..a78236b
--- /dev/null
+++ b/gen/bcm/sha256-armv8-apple.S
@@ -0,0 +1,1193 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+// SHA256-hw SHA256(*) SHA512
+// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
+// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
+// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
+// Denver 2.01 10.5 (+26%) 6.70 (+8%)
+// X-Gene 20.0 (+100%) 12.8 (+300%(***))
+// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
+// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
+//
+// (*) Software SHA256 results are of lesser relevance, presented
+// mostly for informational purposes.
+// (**) The result is a trade-off: it's possible to improve it by
+// 10% (or by 1 cycle per round), but at the cost of 20% loss
+// on Cortex-A53 (or by 4 cycles per round).
+// (***) Super-impressive coefficients over gcc-generated code are
+// indication of some compiler "pathology", most notably code
+// generated with -mgeneral-regs-only is significantly faster
+// and the gap is only 40-90%.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+#endif
+
+.text
+
+.globl _sha256_block_data_order_nohw
+.private_extern _sha256_block_data_order_nohw
+
+.align 6
+_sha256_block_data_order_nohw:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#4*4
+
+ ldp w20,w21,[x0] // load context
+ ldp w22,w23,[x0,#2*4]
+ ldp w24,w25,[x0,#4*4]
+ add x2,x1,x2,lsl#6 // end of input
+ ldp w26,w27,[x0,#6*4]
+ adrp x30,LK256@PAGE
+ add x30,x30,LK256@PAGEOFF
+ stp x0,x2,[x29,#96]
+
+Loop:
+ ldp w3,w4,[x1],#2*4
+ ldr w19,[x30],#4 // *K++
+ eor w28,w21,w22 // magic seed
+ str x1,[x29,#112]
+#ifndef __AARCH64EB__
+ rev w3,w3 // 0
+#endif
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ eor w6,w24,w24,ror#14
+ and w17,w25,w24
+ bic w19,w26,w24
+ add w27,w27,w3 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w6,ror#11 // Sigma1(e)
+ ror w6,w20,#2
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ eor w17,w20,w20,ror#9
+ add w27,w27,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w23,w23,w27 // d+=h
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w6,w17,ror#13 // Sigma0(a)
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w27,w27,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w4,w4 // 1
+#endif
+ ldp w5,w6,[x1],#2*4
+ add w27,w27,w17 // h+=Sigma0(a)
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ eor w7,w23,w23,ror#14
+ and w17,w24,w23
+ bic w28,w25,w23
+ add w26,w26,w4 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w7,ror#11 // Sigma1(e)
+ ror w7,w27,#2
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ eor w17,w27,w27,ror#9
+ add w26,w26,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w22,w22,w26 // d+=h
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w7,w17,ror#13 // Sigma0(a)
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w26,w26,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w5,w5 // 2
+#endif
+ add w26,w26,w17 // h+=Sigma0(a)
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ eor w8,w22,w22,ror#14
+ and w17,w23,w22
+ bic w19,w24,w22
+ add w25,w25,w5 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w8,ror#11 // Sigma1(e)
+ ror w8,w26,#2
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ eor w17,w26,w26,ror#9
+ add w25,w25,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w21,w21,w25 // d+=h
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w8,w17,ror#13 // Sigma0(a)
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w25,w25,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w6,w6 // 3
+#endif
+ ldp w7,w8,[x1],#2*4
+ add w25,w25,w17 // h+=Sigma0(a)
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ eor w9,w21,w21,ror#14
+ and w17,w22,w21
+ bic w28,w23,w21
+ add w24,w24,w6 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w9,ror#11 // Sigma1(e)
+ ror w9,w25,#2
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ eor w17,w25,w25,ror#9
+ add w24,w24,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w20,w20,w24 // d+=h
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w9,w17,ror#13 // Sigma0(a)
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w24,w24,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w7,w7 // 4
+#endif
+ add w24,w24,w17 // h+=Sigma0(a)
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ eor w10,w20,w20,ror#14
+ and w17,w21,w20
+ bic w19,w22,w20
+ add w23,w23,w7 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w10,ror#11 // Sigma1(e)
+ ror w10,w24,#2
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ eor w17,w24,w24,ror#9
+ add w23,w23,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w27,w27,w23 // d+=h
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w10,w17,ror#13 // Sigma0(a)
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w23,w23,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w8,w8 // 5
+#endif
+ ldp w9,w10,[x1],#2*4
+ add w23,w23,w17 // h+=Sigma0(a)
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ eor w11,w27,w27,ror#14
+ and w17,w20,w27
+ bic w28,w21,w27
+ add w22,w22,w8 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w11,ror#11 // Sigma1(e)
+ ror w11,w23,#2
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ eor w17,w23,w23,ror#9
+ add w22,w22,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w26,w26,w22 // d+=h
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w11,w17,ror#13 // Sigma0(a)
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w22,w22,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w9,w9 // 6
+#endif
+ add w22,w22,w17 // h+=Sigma0(a)
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ eor w12,w26,w26,ror#14
+ and w17,w27,w26
+ bic w19,w20,w26
+ add w21,w21,w9 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w12,ror#11 // Sigma1(e)
+ ror w12,w22,#2
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ eor w17,w22,w22,ror#9
+ add w21,w21,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w25,w25,w21 // d+=h
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w12,w17,ror#13 // Sigma0(a)
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w21,w21,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w10,w10 // 7
+#endif
+ ldp w11,w12,[x1],#2*4
+ add w21,w21,w17 // h+=Sigma0(a)
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ eor w13,w25,w25,ror#14
+ and w17,w26,w25
+ bic w28,w27,w25
+ add w20,w20,w10 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w13,ror#11 // Sigma1(e)
+ ror w13,w21,#2
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ eor w17,w21,w21,ror#9
+ add w20,w20,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w24,w24,w20 // d+=h
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w13,w17,ror#13 // Sigma0(a)
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w20,w20,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w11,w11 // 8
+#endif
+ add w20,w20,w17 // h+=Sigma0(a)
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ eor w14,w24,w24,ror#14
+ and w17,w25,w24
+ bic w19,w26,w24
+ add w27,w27,w11 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w14,ror#11 // Sigma1(e)
+ ror w14,w20,#2
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ eor w17,w20,w20,ror#9
+ add w27,w27,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w23,w23,w27 // d+=h
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w14,w17,ror#13 // Sigma0(a)
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w27,w27,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w12,w12 // 9
+#endif
+ ldp w13,w14,[x1],#2*4
+ add w27,w27,w17 // h+=Sigma0(a)
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ eor w15,w23,w23,ror#14
+ and w17,w24,w23
+ bic w28,w25,w23
+ add w26,w26,w12 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w15,ror#11 // Sigma1(e)
+ ror w15,w27,#2
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ eor w17,w27,w27,ror#9
+ add w26,w26,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w22,w22,w26 // d+=h
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w15,w17,ror#13 // Sigma0(a)
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w26,w26,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w13,w13 // 10
+#endif
+ add w26,w26,w17 // h+=Sigma0(a)
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ eor w0,w22,w22,ror#14
+ and w17,w23,w22
+ bic w19,w24,w22
+ add w25,w25,w13 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w0,ror#11 // Sigma1(e)
+ ror w0,w26,#2
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ eor w17,w26,w26,ror#9
+ add w25,w25,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w21,w21,w25 // d+=h
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w0,w17,ror#13 // Sigma0(a)
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w25,w25,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w14,w14 // 11
+#endif
+ ldp w15,w0,[x1],#2*4
+ add w25,w25,w17 // h+=Sigma0(a)
+ str w6,[sp,#12]
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ eor w6,w21,w21,ror#14
+ and w17,w22,w21
+ bic w28,w23,w21
+ add w24,w24,w14 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w6,ror#11 // Sigma1(e)
+ ror w6,w25,#2
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ eor w17,w25,w25,ror#9
+ add w24,w24,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w20,w20,w24 // d+=h
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w6,w17,ror#13 // Sigma0(a)
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w24,w24,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w15,w15 // 12
+#endif
+ add w24,w24,w17 // h+=Sigma0(a)
+ str w7,[sp,#0]
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ eor w7,w20,w20,ror#14
+ and w17,w21,w20
+ bic w19,w22,w20
+ add w23,w23,w15 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w7,ror#11 // Sigma1(e)
+ ror w7,w24,#2
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ eor w17,w24,w24,ror#9
+ add w23,w23,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w27,w27,w23 // d+=h
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w7,w17,ror#13 // Sigma0(a)
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w23,w23,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w0,w0 // 13
+#endif
+ ldp w1,w2,[x1]
+ add w23,w23,w17 // h+=Sigma0(a)
+ str w8,[sp,#4]
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ eor w8,w27,w27,ror#14
+ and w17,w20,w27
+ bic w28,w21,w27
+ add w22,w22,w0 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w8,ror#11 // Sigma1(e)
+ ror w8,w23,#2
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ eor w17,w23,w23,ror#9
+ add w22,w22,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w26,w26,w22 // d+=h
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w8,w17,ror#13 // Sigma0(a)
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w22,w22,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w1,w1 // 14
+#endif
+ ldr w6,[sp,#12]
+ add w22,w22,w17 // h+=Sigma0(a)
+ str w9,[sp,#8]
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ eor w9,w26,w26,ror#14
+ and w17,w27,w26
+ bic w19,w20,w26
+ add w21,w21,w1 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w9,ror#11 // Sigma1(e)
+ ror w9,w22,#2
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ eor w17,w22,w22,ror#9
+ add w21,w21,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w25,w25,w21 // d+=h
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w9,w17,ror#13 // Sigma0(a)
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w21,w21,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w2,w2 // 15
+#endif
+ ldr w7,[sp,#0]
+ add w21,w21,w17 // h+=Sigma0(a)
+ str w10,[sp,#12]
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ ror w9,w4,#7
+ and w17,w26,w25
+ ror w8,w1,#17
+ bic w28,w27,w25
+ ror w10,w21,#2
+ add w20,w20,w2 // h+=X[i]
+ eor w16,w16,w25,ror#11
+ eor w9,w9,w4,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w25,ror#25 // Sigma1(e)
+ eor w10,w10,w21,ror#13
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w8,w8,w1,ror#19
+ eor w9,w9,w4,lsr#3 // sigma0(X[i+1])
+ add w20,w20,w16 // h+=Sigma1(e)
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w10,w21,ror#22 // Sigma0(a)
+ eor w8,w8,w1,lsr#10 // sigma1(X[i+14])
+ add w3,w3,w12
+ add w24,w24,w20 // d+=h
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w3,w3,w9
+ add w20,w20,w17 // h+=Sigma0(a)
+ add w3,w3,w8
+Loop_16_xx:
+ ldr w8,[sp,#4]
+ str w11,[sp,#0]
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ ror w10,w5,#7
+ and w17,w25,w24
+ ror w9,w2,#17
+ bic w19,w26,w24
+ ror w11,w20,#2
+ add w27,w27,w3 // h+=X[i]
+ eor w16,w16,w24,ror#11
+ eor w10,w10,w5,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w24,ror#25 // Sigma1(e)
+ eor w11,w11,w20,ror#13
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w9,w9,w2,ror#19
+ eor w10,w10,w5,lsr#3 // sigma0(X[i+1])
+ add w27,w27,w16 // h+=Sigma1(e)
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w11,w20,ror#22 // Sigma0(a)
+ eor w9,w9,w2,lsr#10 // sigma1(X[i+14])
+ add w4,w4,w13
+ add w23,w23,w27 // d+=h
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w4,w4,w10
+ add w27,w27,w17 // h+=Sigma0(a)
+ add w4,w4,w9
+ ldr w9,[sp,#8]
+ str w12,[sp,#4]
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ ror w11,w6,#7
+ and w17,w24,w23
+ ror w10,w3,#17
+ bic w28,w25,w23
+ ror w12,w27,#2
+ add w26,w26,w4 // h+=X[i]
+ eor w16,w16,w23,ror#11
+ eor w11,w11,w6,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w23,ror#25 // Sigma1(e)
+ eor w12,w12,w27,ror#13
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w10,w10,w3,ror#19
+ eor w11,w11,w6,lsr#3 // sigma0(X[i+1])
+ add w26,w26,w16 // h+=Sigma1(e)
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w12,w27,ror#22 // Sigma0(a)
+ eor w10,w10,w3,lsr#10 // sigma1(X[i+14])
+ add w5,w5,w14
+ add w22,w22,w26 // d+=h
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w5,w5,w11
+ add w26,w26,w17 // h+=Sigma0(a)
+ add w5,w5,w10
+ ldr w10,[sp,#12]
+ str w13,[sp,#8]
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ ror w12,w7,#7
+ and w17,w23,w22
+ ror w11,w4,#17
+ bic w19,w24,w22
+ ror w13,w26,#2
+ add w25,w25,w5 // h+=X[i]
+ eor w16,w16,w22,ror#11
+ eor w12,w12,w7,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w22,ror#25 // Sigma1(e)
+ eor w13,w13,w26,ror#13
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w11,w11,w4,ror#19
+ eor w12,w12,w7,lsr#3 // sigma0(X[i+1])
+ add w25,w25,w16 // h+=Sigma1(e)
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w13,w26,ror#22 // Sigma0(a)
+ eor w11,w11,w4,lsr#10 // sigma1(X[i+14])
+ add w6,w6,w15
+ add w21,w21,w25 // d+=h
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w6,w6,w12
+ add w25,w25,w17 // h+=Sigma0(a)
+ add w6,w6,w11
+ ldr w11,[sp,#0]
+ str w14,[sp,#12]
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ ror w13,w8,#7
+ and w17,w22,w21
+ ror w12,w5,#17
+ bic w28,w23,w21
+ ror w14,w25,#2
+ add w24,w24,w6 // h+=X[i]
+ eor w16,w16,w21,ror#11
+ eor w13,w13,w8,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w21,ror#25 // Sigma1(e)
+ eor w14,w14,w25,ror#13
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w12,w12,w5,ror#19
+ eor w13,w13,w8,lsr#3 // sigma0(X[i+1])
+ add w24,w24,w16 // h+=Sigma1(e)
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w14,w25,ror#22 // Sigma0(a)
+ eor w12,w12,w5,lsr#10 // sigma1(X[i+14])
+ add w7,w7,w0
+ add w20,w20,w24 // d+=h
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w7,w7,w13
+ add w24,w24,w17 // h+=Sigma0(a)
+ add w7,w7,w12
+ ldr w12,[sp,#4]
+ str w15,[sp,#0]
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ ror w14,w9,#7
+ and w17,w21,w20
+ ror w13,w6,#17
+ bic w19,w22,w20
+ ror w15,w24,#2
+ add w23,w23,w7 // h+=X[i]
+ eor w16,w16,w20,ror#11
+ eor w14,w14,w9,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w20,ror#25 // Sigma1(e)
+ eor w15,w15,w24,ror#13
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w13,w13,w6,ror#19
+ eor w14,w14,w9,lsr#3 // sigma0(X[i+1])
+ add w23,w23,w16 // h+=Sigma1(e)
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w15,w24,ror#22 // Sigma0(a)
+ eor w13,w13,w6,lsr#10 // sigma1(X[i+14])
+ add w8,w8,w1
+ add w27,w27,w23 // d+=h
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w8,w8,w14
+ add w23,w23,w17 // h+=Sigma0(a)
+ add w8,w8,w13
+ ldr w13,[sp,#8]
+ str w0,[sp,#4]
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ ror w15,w10,#7
+ and w17,w20,w27
+ ror w14,w7,#17
+ bic w28,w21,w27
+ ror w0,w23,#2
+ add w22,w22,w8 // h+=X[i]
+ eor w16,w16,w27,ror#11
+ eor w15,w15,w10,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w27,ror#25 // Sigma1(e)
+ eor w0,w0,w23,ror#13
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w14,w14,w7,ror#19
+ eor w15,w15,w10,lsr#3 // sigma0(X[i+1])
+ add w22,w22,w16 // h+=Sigma1(e)
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w0,w23,ror#22 // Sigma0(a)
+ eor w14,w14,w7,lsr#10 // sigma1(X[i+14])
+ add w9,w9,w2
+ add w26,w26,w22 // d+=h
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w9,w9,w15
+ add w22,w22,w17 // h+=Sigma0(a)
+ add w9,w9,w14
+ ldr w14,[sp,#12]
+ str w1,[sp,#8]
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ ror w0,w11,#7
+ and w17,w27,w26
+ ror w15,w8,#17
+ bic w19,w20,w26
+ ror w1,w22,#2
+ add w21,w21,w9 // h+=X[i]
+ eor w16,w16,w26,ror#11
+ eor w0,w0,w11,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w26,ror#25 // Sigma1(e)
+ eor w1,w1,w22,ror#13
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w15,w15,w8,ror#19
+ eor w0,w0,w11,lsr#3 // sigma0(X[i+1])
+ add w21,w21,w16 // h+=Sigma1(e)
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w1,w22,ror#22 // Sigma0(a)
+ eor w15,w15,w8,lsr#10 // sigma1(X[i+14])
+ add w10,w10,w3
+ add w25,w25,w21 // d+=h
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w10,w10,w0
+ add w21,w21,w17 // h+=Sigma0(a)
+ add w10,w10,w15
+ ldr w15,[sp,#0]
+ str w2,[sp,#12]
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ ror w1,w12,#7
+ and w17,w26,w25
+ ror w0,w9,#17
+ bic w28,w27,w25
+ ror w2,w21,#2
+ add w20,w20,w10 // h+=X[i]
+ eor w16,w16,w25,ror#11
+ eor w1,w1,w12,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w25,ror#25 // Sigma1(e)
+ eor w2,w2,w21,ror#13
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w0,w0,w9,ror#19
+ eor w1,w1,w12,lsr#3 // sigma0(X[i+1])
+ add w20,w20,w16 // h+=Sigma1(e)
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w2,w21,ror#22 // Sigma0(a)
+ eor w0,w0,w9,lsr#10 // sigma1(X[i+14])
+ add w11,w11,w4
+ add w24,w24,w20 // d+=h
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w11,w11,w1
+ add w20,w20,w17 // h+=Sigma0(a)
+ add w11,w11,w0
+ ldr w0,[sp,#4]
+ str w3,[sp,#0]
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ ror w2,w13,#7
+ and w17,w25,w24
+ ror w1,w10,#17
+ bic w19,w26,w24
+ ror w3,w20,#2
+ add w27,w27,w11 // h+=X[i]
+ eor w16,w16,w24,ror#11
+ eor w2,w2,w13,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w24,ror#25 // Sigma1(e)
+ eor w3,w3,w20,ror#13
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w1,w1,w10,ror#19
+ eor w2,w2,w13,lsr#3 // sigma0(X[i+1])
+ add w27,w27,w16 // h+=Sigma1(e)
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w3,w20,ror#22 // Sigma0(a)
+ eor w1,w1,w10,lsr#10 // sigma1(X[i+14])
+ add w12,w12,w5
+ add w23,w23,w27 // d+=h
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w12,w12,w2
+ add w27,w27,w17 // h+=Sigma0(a)
+ add w12,w12,w1
+ ldr w1,[sp,#8]
+ str w4,[sp,#4]
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ ror w3,w14,#7
+ and w17,w24,w23
+ ror w2,w11,#17
+ bic w28,w25,w23
+ ror w4,w27,#2
+ add w26,w26,w12 // h+=X[i]
+ eor w16,w16,w23,ror#11
+ eor w3,w3,w14,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w23,ror#25 // Sigma1(e)
+ eor w4,w4,w27,ror#13
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w2,w2,w11,ror#19
+ eor w3,w3,w14,lsr#3 // sigma0(X[i+1])
+ add w26,w26,w16 // h+=Sigma1(e)
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w4,w27,ror#22 // Sigma0(a)
+ eor w2,w2,w11,lsr#10 // sigma1(X[i+14])
+ add w13,w13,w6
+ add w22,w22,w26 // d+=h
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w13,w13,w3
+ add w26,w26,w17 // h+=Sigma0(a)
+ add w13,w13,w2
+ ldr w2,[sp,#12]
+ str w5,[sp,#8]
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ ror w4,w15,#7
+ and w17,w23,w22
+ ror w3,w12,#17
+ bic w19,w24,w22
+ ror w5,w26,#2
+ add w25,w25,w13 // h+=X[i]
+ eor w16,w16,w22,ror#11
+ eor w4,w4,w15,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w22,ror#25 // Sigma1(e)
+ eor w5,w5,w26,ror#13
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w3,w3,w12,ror#19
+ eor w4,w4,w15,lsr#3 // sigma0(X[i+1])
+ add w25,w25,w16 // h+=Sigma1(e)
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w5,w26,ror#22 // Sigma0(a)
+ eor w3,w3,w12,lsr#10 // sigma1(X[i+14])
+ add w14,w14,w7
+ add w21,w21,w25 // d+=h
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w14,w14,w4
+ add w25,w25,w17 // h+=Sigma0(a)
+ add w14,w14,w3
+ ldr w3,[sp,#0]
+ str w6,[sp,#12]
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ ror w5,w0,#7
+ and w17,w22,w21
+ ror w4,w13,#17
+ bic w28,w23,w21
+ ror w6,w25,#2
+ add w24,w24,w14 // h+=X[i]
+ eor w16,w16,w21,ror#11
+ eor w5,w5,w0,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w21,ror#25 // Sigma1(e)
+ eor w6,w6,w25,ror#13
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w4,w4,w13,ror#19
+ eor w5,w5,w0,lsr#3 // sigma0(X[i+1])
+ add w24,w24,w16 // h+=Sigma1(e)
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w6,w25,ror#22 // Sigma0(a)
+ eor w4,w4,w13,lsr#10 // sigma1(X[i+14])
+ add w15,w15,w8
+ add w20,w20,w24 // d+=h
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w15,w15,w5
+ add w24,w24,w17 // h+=Sigma0(a)
+ add w15,w15,w4
+ ldr w4,[sp,#4]
+ str w7,[sp,#0]
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ ror w6,w1,#7
+ and w17,w21,w20
+ ror w5,w14,#17
+ bic w19,w22,w20
+ ror w7,w24,#2
+ add w23,w23,w15 // h+=X[i]
+ eor w16,w16,w20,ror#11
+ eor w6,w6,w1,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w20,ror#25 // Sigma1(e)
+ eor w7,w7,w24,ror#13
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w5,w5,w14,ror#19
+ eor w6,w6,w1,lsr#3 // sigma0(X[i+1])
+ add w23,w23,w16 // h+=Sigma1(e)
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w7,w24,ror#22 // Sigma0(a)
+ eor w5,w5,w14,lsr#10 // sigma1(X[i+14])
+ add w0,w0,w9
+ add w27,w27,w23 // d+=h
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w0,w0,w6
+ add w23,w23,w17 // h+=Sigma0(a)
+ add w0,w0,w5
+ ldr w5,[sp,#8]
+ str w8,[sp,#4]
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ ror w7,w2,#7
+ and w17,w20,w27
+ ror w6,w15,#17
+ bic w28,w21,w27
+ ror w8,w23,#2
+ add w22,w22,w0 // h+=X[i]
+ eor w16,w16,w27,ror#11
+ eor w7,w7,w2,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w27,ror#25 // Sigma1(e)
+ eor w8,w8,w23,ror#13
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w6,w6,w15,ror#19
+ eor w7,w7,w2,lsr#3 // sigma0(X[i+1])
+ add w22,w22,w16 // h+=Sigma1(e)
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w8,w23,ror#22 // Sigma0(a)
+ eor w6,w6,w15,lsr#10 // sigma1(X[i+14])
+ add w1,w1,w10
+ add w26,w26,w22 // d+=h
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w1,w1,w7
+ add w22,w22,w17 // h+=Sigma0(a)
+ add w1,w1,w6
+ ldr w6,[sp,#12]
+ str w9,[sp,#8]
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ ror w8,w3,#7
+ and w17,w27,w26
+ ror w7,w0,#17
+ bic w19,w20,w26
+ ror w9,w22,#2
+ add w21,w21,w1 // h+=X[i]
+ eor w16,w16,w26,ror#11
+ eor w8,w8,w3,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w26,ror#25 // Sigma1(e)
+ eor w9,w9,w22,ror#13
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w7,w7,w0,ror#19
+ eor w8,w8,w3,lsr#3 // sigma0(X[i+1])
+ add w21,w21,w16 // h+=Sigma1(e)
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w9,w22,ror#22 // Sigma0(a)
+ eor w7,w7,w0,lsr#10 // sigma1(X[i+14])
+ add w2,w2,w11
+ add w25,w25,w21 // d+=h
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w2,w2,w8
+ add w21,w21,w17 // h+=Sigma0(a)
+ add w2,w2,w7
+ ldr w7,[sp,#0]
+ str w10,[sp,#12]
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ ror w9,w4,#7
+ and w17,w26,w25
+ ror w8,w1,#17
+ bic w28,w27,w25
+ ror w10,w21,#2
+ add w20,w20,w2 // h+=X[i]
+ eor w16,w16,w25,ror#11
+ eor w9,w9,w4,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w25,ror#25 // Sigma1(e)
+ eor w10,w10,w21,ror#13
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w8,w8,w1,ror#19
+ eor w9,w9,w4,lsr#3 // sigma0(X[i+1])
+ add w20,w20,w16 // h+=Sigma1(e)
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w10,w21,ror#22 // Sigma0(a)
+ eor w8,w8,w1,lsr#10 // sigma1(X[i+14])
+ add w3,w3,w12
+ add w24,w24,w20 // d+=h
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w3,w3,w9
+ add w20,w20,w17 // h+=Sigma0(a)
+ add w3,w3,w8
+ cbnz w19,Loop_16_xx
+
+ ldp x0,x2,[x29,#96]
+ ldr x1,[x29,#112]
+ sub x30,x30,#260 // rewind
+
+ ldp w3,w4,[x0]
+ ldp w5,w6,[x0,#2*4]
+ add x1,x1,#14*4 // advance input pointer
+ ldp w7,w8,[x0,#4*4]
+ add w20,w20,w3
+ ldp w9,w10,[x0,#6*4]
+ add w21,w21,w4
+ add w22,w22,w5
+ add w23,w23,w6
+ stp w20,w21,[x0]
+ add w24,w24,w7
+ add w25,w25,w8
+ stp w22,w23,[x0,#2*4]
+ add w26,w26,w9
+ add w27,w27,w10
+ cmp x1,x2
+ stp w24,w25,[x0,#4*4]
+ stp w26,w27,[x0,#6*4]
+ b.ne Loop
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#4*4
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.section __TEXT,__const
+.align 6
+
+LK256:
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long 0 //terminator
+
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+.text
+#ifndef __KERNEL__
+.globl _sha256_block_data_order_hw
+.private_extern _sha256_block_data_order_hw
+
+.align 6
+_sha256_block_data_order_hw:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v0.4s,v1.4s},[x0]
+ adrp x3,LK256@PAGE
+ add x3,x3,LK256@PAGEOFF
+
+Loop_hw:
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+ sub x2,x2,#1
+ ld1 {v16.4s},[x3],#16
+ rev32 v4.16b,v4.16b
+ rev32 v5.16b,v5.16b
+ rev32 v6.16b,v6.16b
+ rev32 v7.16b,v7.16b
+ orr v18.16b,v0.16b,v0.16b // offload
+ orr v19.16b,v1.16b,v1.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s
+.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s
+.long 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s
+.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s
+.long 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s
+.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s
+.long 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+
+ ld1 {v17.4s},[x3]
+ add v16.4s,v16.4s,v6.4s
+ sub x3,x3,#64*4-16 // rewind
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+
+ add v17.4s,v17.4s,v7.4s
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+
+ add v0.4s,v0.4s,v18.4s
+ add v1.4s,v1.4s,v19.4s
+
+ cbnz x2,Loop_hw
+
+ st1 {v0.4s,v1.4s},[x0]
+
+ ldr x29,[sp],#16
+ ret
+
+#endif
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/bcm/sha256-armv8-linux.S b/gen/bcm/sha256-armv8-linux.S
new file mode 100644
index 0000000..4420108
--- /dev/null
+++ b/gen/bcm/sha256-armv8-linux.S
@@ -0,0 +1,1193 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+// SHA256-hw SHA256(*) SHA512
+// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
+// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
+// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
+// Denver 2.01 10.5 (+26%) 6.70 (+8%)
+// X-Gene 20.0 (+100%) 12.8 (+300%(***))
+// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
+// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
+//
+// (*) Software SHA256 results are of lesser relevance, presented
+// mostly for informational purposes.
+// (**) The result is a trade-off: it's possible to improve it by
+// 10% (or by 1 cycle per round), but at the cost of 20% loss
+// on Cortex-A53 (or by 4 cycles per round).
+// (***) Super-impressive coefficients over gcc-generated code are
+// indication of some compiler "pathology", most notably code
+// generated with -mgeneral-regs-only is significantly faster
+// and the gap is only 40-90%.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+#endif
+
+.text
+
+.globl sha256_block_data_order_nohw
+.hidden sha256_block_data_order_nohw
+.type sha256_block_data_order_nohw,%function
+.align 6
+sha256_block_data_order_nohw:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#4*4
+
+ ldp w20,w21,[x0] // load context
+ ldp w22,w23,[x0,#2*4]
+ ldp w24,w25,[x0,#4*4]
+ add x2,x1,x2,lsl#6 // end of input
+ ldp w26,w27,[x0,#6*4]
+ adrp x30,.LK256
+ add x30,x30,:lo12:.LK256
+ stp x0,x2,[x29,#96]
+
+.Loop:
+ ldp w3,w4,[x1],#2*4
+ ldr w19,[x30],#4 // *K++
+ eor w28,w21,w22 // magic seed
+ str x1,[x29,#112]
+#ifndef __AARCH64EB__
+ rev w3,w3 // 0
+#endif
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ eor w6,w24,w24,ror#14
+ and w17,w25,w24
+ bic w19,w26,w24
+ add w27,w27,w3 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w6,ror#11 // Sigma1(e)
+ ror w6,w20,#2
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ eor w17,w20,w20,ror#9
+ add w27,w27,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w23,w23,w27 // d+=h
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w6,w17,ror#13 // Sigma0(a)
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w27,w27,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w4,w4 // 1
+#endif
+ ldp w5,w6,[x1],#2*4
+ add w27,w27,w17 // h+=Sigma0(a)
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ eor w7,w23,w23,ror#14
+ and w17,w24,w23
+ bic w28,w25,w23
+ add w26,w26,w4 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w7,ror#11 // Sigma1(e)
+ ror w7,w27,#2
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ eor w17,w27,w27,ror#9
+ add w26,w26,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w22,w22,w26 // d+=h
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w7,w17,ror#13 // Sigma0(a)
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w26,w26,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w5,w5 // 2
+#endif
+ add w26,w26,w17 // h+=Sigma0(a)
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ eor w8,w22,w22,ror#14
+ and w17,w23,w22
+ bic w19,w24,w22
+ add w25,w25,w5 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w8,ror#11 // Sigma1(e)
+ ror w8,w26,#2
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ eor w17,w26,w26,ror#9
+ add w25,w25,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w21,w21,w25 // d+=h
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w8,w17,ror#13 // Sigma0(a)
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w25,w25,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w6,w6 // 3
+#endif
+ ldp w7,w8,[x1],#2*4
+ add w25,w25,w17 // h+=Sigma0(a)
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ eor w9,w21,w21,ror#14
+ and w17,w22,w21
+ bic w28,w23,w21
+ add w24,w24,w6 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w9,ror#11 // Sigma1(e)
+ ror w9,w25,#2
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ eor w17,w25,w25,ror#9
+ add w24,w24,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w20,w20,w24 // d+=h
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w9,w17,ror#13 // Sigma0(a)
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w24,w24,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w7,w7 // 4
+#endif
+ add w24,w24,w17 // h+=Sigma0(a)
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ eor w10,w20,w20,ror#14
+ and w17,w21,w20
+ bic w19,w22,w20
+ add w23,w23,w7 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w10,ror#11 // Sigma1(e)
+ ror w10,w24,#2
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ eor w17,w24,w24,ror#9
+ add w23,w23,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w27,w27,w23 // d+=h
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w10,w17,ror#13 // Sigma0(a)
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w23,w23,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w8,w8 // 5
+#endif
+ ldp w9,w10,[x1],#2*4
+ add w23,w23,w17 // h+=Sigma0(a)
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ eor w11,w27,w27,ror#14
+ and w17,w20,w27
+ bic w28,w21,w27
+ add w22,w22,w8 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w11,ror#11 // Sigma1(e)
+ ror w11,w23,#2
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ eor w17,w23,w23,ror#9
+ add w22,w22,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w26,w26,w22 // d+=h
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w11,w17,ror#13 // Sigma0(a)
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w22,w22,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w9,w9 // 6
+#endif
+ add w22,w22,w17 // h+=Sigma0(a)
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ eor w12,w26,w26,ror#14
+ and w17,w27,w26
+ bic w19,w20,w26
+ add w21,w21,w9 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w12,ror#11 // Sigma1(e)
+ ror w12,w22,#2
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ eor w17,w22,w22,ror#9
+ add w21,w21,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w25,w25,w21 // d+=h
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w12,w17,ror#13 // Sigma0(a)
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w21,w21,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w10,w10 // 7
+#endif
+ ldp w11,w12,[x1],#2*4
+ add w21,w21,w17 // h+=Sigma0(a)
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ eor w13,w25,w25,ror#14
+ and w17,w26,w25
+ bic w28,w27,w25
+ add w20,w20,w10 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w13,ror#11 // Sigma1(e)
+ ror w13,w21,#2
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ eor w17,w21,w21,ror#9
+ add w20,w20,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w24,w24,w20 // d+=h
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w13,w17,ror#13 // Sigma0(a)
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w20,w20,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w11,w11 // 8
+#endif
+ add w20,w20,w17 // h+=Sigma0(a)
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ eor w14,w24,w24,ror#14
+ and w17,w25,w24
+ bic w19,w26,w24
+ add w27,w27,w11 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w14,ror#11 // Sigma1(e)
+ ror w14,w20,#2
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ eor w17,w20,w20,ror#9
+ add w27,w27,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w23,w23,w27 // d+=h
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w14,w17,ror#13 // Sigma0(a)
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w27,w27,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w12,w12 // 9
+#endif
+ ldp w13,w14,[x1],#2*4
+ add w27,w27,w17 // h+=Sigma0(a)
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ eor w15,w23,w23,ror#14
+ and w17,w24,w23
+ bic w28,w25,w23
+ add w26,w26,w12 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w15,ror#11 // Sigma1(e)
+ ror w15,w27,#2
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ eor w17,w27,w27,ror#9
+ add w26,w26,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w22,w22,w26 // d+=h
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w15,w17,ror#13 // Sigma0(a)
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w26,w26,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w13,w13 // 10
+#endif
+ add w26,w26,w17 // h+=Sigma0(a)
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ eor w0,w22,w22,ror#14
+ and w17,w23,w22
+ bic w19,w24,w22
+ add w25,w25,w13 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w0,ror#11 // Sigma1(e)
+ ror w0,w26,#2
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ eor w17,w26,w26,ror#9
+ add w25,w25,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w21,w21,w25 // d+=h
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w0,w17,ror#13 // Sigma0(a)
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w25,w25,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w14,w14 // 11
+#endif
+ ldp w15,w0,[x1],#2*4
+ add w25,w25,w17 // h+=Sigma0(a)
+ str w6,[sp,#12]
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ eor w6,w21,w21,ror#14
+ and w17,w22,w21
+ bic w28,w23,w21
+ add w24,w24,w14 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w6,ror#11 // Sigma1(e)
+ ror w6,w25,#2
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ eor w17,w25,w25,ror#9
+ add w24,w24,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w20,w20,w24 // d+=h
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w6,w17,ror#13 // Sigma0(a)
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w24,w24,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w15,w15 // 12
+#endif
+ add w24,w24,w17 // h+=Sigma0(a)
+ str w7,[sp,#0]
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ eor w7,w20,w20,ror#14
+ and w17,w21,w20
+ bic w19,w22,w20
+ add w23,w23,w15 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w7,ror#11 // Sigma1(e)
+ ror w7,w24,#2
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ eor w17,w24,w24,ror#9
+ add w23,w23,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w27,w27,w23 // d+=h
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w7,w17,ror#13 // Sigma0(a)
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w23,w23,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w0,w0 // 13
+#endif
+ ldp w1,w2,[x1]
+ add w23,w23,w17 // h+=Sigma0(a)
+ str w8,[sp,#4]
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ eor w8,w27,w27,ror#14
+ and w17,w20,w27
+ bic w28,w21,w27
+ add w22,w22,w0 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w8,ror#11 // Sigma1(e)
+ ror w8,w23,#2
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ eor w17,w23,w23,ror#9
+ add w22,w22,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w26,w26,w22 // d+=h
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w8,w17,ror#13 // Sigma0(a)
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w22,w22,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w1,w1 // 14
+#endif
+ ldr w6,[sp,#12]
+ add w22,w22,w17 // h+=Sigma0(a)
+ str w9,[sp,#8]
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ eor w9,w26,w26,ror#14
+ and w17,w27,w26
+ bic w19,w20,w26
+ add w21,w21,w1 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w9,ror#11 // Sigma1(e)
+ ror w9,w22,#2
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ eor w17,w22,w22,ror#9
+ add w21,w21,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w25,w25,w21 // d+=h
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w9,w17,ror#13 // Sigma0(a)
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w21,w21,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w2,w2 // 15
+#endif
+ ldr w7,[sp,#0]
+ add w21,w21,w17 // h+=Sigma0(a)
+ str w10,[sp,#12]
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ ror w9,w4,#7
+ and w17,w26,w25
+ ror w8,w1,#17
+ bic w28,w27,w25
+ ror w10,w21,#2
+ add w20,w20,w2 // h+=X[i]
+ eor w16,w16,w25,ror#11
+ eor w9,w9,w4,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w25,ror#25 // Sigma1(e)
+ eor w10,w10,w21,ror#13
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w8,w8,w1,ror#19
+ eor w9,w9,w4,lsr#3 // sigma0(X[i+1])
+ add w20,w20,w16 // h+=Sigma1(e)
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w10,w21,ror#22 // Sigma0(a)
+ eor w8,w8,w1,lsr#10 // sigma1(X[i+14])
+ add w3,w3,w12
+ add w24,w24,w20 // d+=h
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w3,w3,w9
+ add w20,w20,w17 // h+=Sigma0(a)
+ add w3,w3,w8
+.Loop_16_xx:
+ ldr w8,[sp,#4]
+ str w11,[sp,#0]
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ ror w10,w5,#7
+ and w17,w25,w24
+ ror w9,w2,#17
+ bic w19,w26,w24
+ ror w11,w20,#2
+ add w27,w27,w3 // h+=X[i]
+ eor w16,w16,w24,ror#11
+ eor w10,w10,w5,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w24,ror#25 // Sigma1(e)
+ eor w11,w11,w20,ror#13
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w9,w9,w2,ror#19
+ eor w10,w10,w5,lsr#3 // sigma0(X[i+1])
+ add w27,w27,w16 // h+=Sigma1(e)
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w11,w20,ror#22 // Sigma0(a)
+ eor w9,w9,w2,lsr#10 // sigma1(X[i+14])
+ add w4,w4,w13
+ add w23,w23,w27 // d+=h
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w4,w4,w10
+ add w27,w27,w17 // h+=Sigma0(a)
+ add w4,w4,w9
+ ldr w9,[sp,#8]
+ str w12,[sp,#4]
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ ror w11,w6,#7
+ and w17,w24,w23
+ ror w10,w3,#17
+ bic w28,w25,w23
+ ror w12,w27,#2
+ add w26,w26,w4 // h+=X[i]
+ eor w16,w16,w23,ror#11
+ eor w11,w11,w6,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w23,ror#25 // Sigma1(e)
+ eor w12,w12,w27,ror#13
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w10,w10,w3,ror#19
+ eor w11,w11,w6,lsr#3 // sigma0(X[i+1])
+ add w26,w26,w16 // h+=Sigma1(e)
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w12,w27,ror#22 // Sigma0(a)
+ eor w10,w10,w3,lsr#10 // sigma1(X[i+14])
+ add w5,w5,w14
+ add w22,w22,w26 // d+=h
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w5,w5,w11
+ add w26,w26,w17 // h+=Sigma0(a)
+ add w5,w5,w10
+ ldr w10,[sp,#12]
+ str w13,[sp,#8]
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ ror w12,w7,#7
+ and w17,w23,w22
+ ror w11,w4,#17
+ bic w19,w24,w22
+ ror w13,w26,#2
+ add w25,w25,w5 // h+=X[i]
+ eor w16,w16,w22,ror#11
+ eor w12,w12,w7,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w22,ror#25 // Sigma1(e)
+ eor w13,w13,w26,ror#13
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w11,w11,w4,ror#19
+ eor w12,w12,w7,lsr#3 // sigma0(X[i+1])
+ add w25,w25,w16 // h+=Sigma1(e)
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w13,w26,ror#22 // Sigma0(a)
+ eor w11,w11,w4,lsr#10 // sigma1(X[i+14])
+ add w6,w6,w15
+ add w21,w21,w25 // d+=h
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w6,w6,w12
+ add w25,w25,w17 // h+=Sigma0(a)
+ add w6,w6,w11
+ ldr w11,[sp,#0]
+ str w14,[sp,#12]
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ ror w13,w8,#7
+ and w17,w22,w21
+ ror w12,w5,#17
+ bic w28,w23,w21
+ ror w14,w25,#2
+ add w24,w24,w6 // h+=X[i]
+ eor w16,w16,w21,ror#11
+ eor w13,w13,w8,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w21,ror#25 // Sigma1(e)
+ eor w14,w14,w25,ror#13
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w12,w12,w5,ror#19
+ eor w13,w13,w8,lsr#3 // sigma0(X[i+1])
+ add w24,w24,w16 // h+=Sigma1(e)
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w14,w25,ror#22 // Sigma0(a)
+ eor w12,w12,w5,lsr#10 // sigma1(X[i+14])
+ add w7,w7,w0
+ add w20,w20,w24 // d+=h
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w7,w7,w13
+ add w24,w24,w17 // h+=Sigma0(a)
+ add w7,w7,w12
+ ldr w12,[sp,#4]
+ str w15,[sp,#0]
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ ror w14,w9,#7
+ and w17,w21,w20
+ ror w13,w6,#17
+ bic w19,w22,w20
+ ror w15,w24,#2
+ add w23,w23,w7 // h+=X[i]
+ eor w16,w16,w20,ror#11
+ eor w14,w14,w9,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w20,ror#25 // Sigma1(e)
+ eor w15,w15,w24,ror#13
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w13,w13,w6,ror#19
+ eor w14,w14,w9,lsr#3 // sigma0(X[i+1])
+ add w23,w23,w16 // h+=Sigma1(e)
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w15,w24,ror#22 // Sigma0(a)
+ eor w13,w13,w6,lsr#10 // sigma1(X[i+14])
+ add w8,w8,w1
+ add w27,w27,w23 // d+=h
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w8,w8,w14
+ add w23,w23,w17 // h+=Sigma0(a)
+ add w8,w8,w13
+ ldr w13,[sp,#8]
+ str w0,[sp,#4]
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ ror w15,w10,#7
+ and w17,w20,w27
+ ror w14,w7,#17
+ bic w28,w21,w27
+ ror w0,w23,#2
+ add w22,w22,w8 // h+=X[i]
+ eor w16,w16,w27,ror#11
+ eor w15,w15,w10,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w27,ror#25 // Sigma1(e)
+ eor w0,w0,w23,ror#13
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w14,w14,w7,ror#19
+ eor w15,w15,w10,lsr#3 // sigma0(X[i+1])
+ add w22,w22,w16 // h+=Sigma1(e)
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w0,w23,ror#22 // Sigma0(a)
+ eor w14,w14,w7,lsr#10 // sigma1(X[i+14])
+ add w9,w9,w2
+ add w26,w26,w22 // d+=h
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w9,w9,w15
+ add w22,w22,w17 // h+=Sigma0(a)
+ add w9,w9,w14
+ ldr w14,[sp,#12]
+ str w1,[sp,#8]
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ ror w0,w11,#7
+ and w17,w27,w26
+ ror w15,w8,#17
+ bic w19,w20,w26
+ ror w1,w22,#2
+ add w21,w21,w9 // h+=X[i]
+ eor w16,w16,w26,ror#11
+ eor w0,w0,w11,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w26,ror#25 // Sigma1(e)
+ eor w1,w1,w22,ror#13
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w15,w15,w8,ror#19
+ eor w0,w0,w11,lsr#3 // sigma0(X[i+1])
+ add w21,w21,w16 // h+=Sigma1(e)
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w1,w22,ror#22 // Sigma0(a)
+ eor w15,w15,w8,lsr#10 // sigma1(X[i+14])
+ add w10,w10,w3
+ add w25,w25,w21 // d+=h
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w10,w10,w0
+ add w21,w21,w17 // h+=Sigma0(a)
+ add w10,w10,w15
+ ldr w15,[sp,#0]
+ str w2,[sp,#12]
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ ror w1,w12,#7
+ and w17,w26,w25
+ ror w0,w9,#17
+ bic w28,w27,w25
+ ror w2,w21,#2
+ add w20,w20,w10 // h+=X[i]
+ eor w16,w16,w25,ror#11
+ eor w1,w1,w12,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w25,ror#25 // Sigma1(e)
+ eor w2,w2,w21,ror#13
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w0,w0,w9,ror#19
+ eor w1,w1,w12,lsr#3 // sigma0(X[i+1])
+ add w20,w20,w16 // h+=Sigma1(e)
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w2,w21,ror#22 // Sigma0(a)
+ eor w0,w0,w9,lsr#10 // sigma1(X[i+14])
+ add w11,w11,w4
+ add w24,w24,w20 // d+=h
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w11,w11,w1
+ add w20,w20,w17 // h+=Sigma0(a)
+ add w11,w11,w0
+ ldr w0,[sp,#4]
+ str w3,[sp,#0]
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ ror w2,w13,#7
+ and w17,w25,w24
+ ror w1,w10,#17
+ bic w19,w26,w24
+ ror w3,w20,#2
+ add w27,w27,w11 // h+=X[i]
+ eor w16,w16,w24,ror#11
+ eor w2,w2,w13,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w24,ror#25 // Sigma1(e)
+ eor w3,w3,w20,ror#13
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w1,w1,w10,ror#19
+ eor w2,w2,w13,lsr#3 // sigma0(X[i+1])
+ add w27,w27,w16 // h+=Sigma1(e)
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w3,w20,ror#22 // Sigma0(a)
+ eor w1,w1,w10,lsr#10 // sigma1(X[i+14])
+ add w12,w12,w5
+ add w23,w23,w27 // d+=h
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w12,w12,w2
+ add w27,w27,w17 // h+=Sigma0(a)
+ add w12,w12,w1
+ ldr w1,[sp,#8]
+ str w4,[sp,#4]
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ ror w3,w14,#7
+ and w17,w24,w23
+ ror w2,w11,#17
+ bic w28,w25,w23
+ ror w4,w27,#2
+ add w26,w26,w12 // h+=X[i]
+ eor w16,w16,w23,ror#11
+ eor w3,w3,w14,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w23,ror#25 // Sigma1(e)
+ eor w4,w4,w27,ror#13
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w2,w2,w11,ror#19
+ eor w3,w3,w14,lsr#3 // sigma0(X[i+1])
+ add w26,w26,w16 // h+=Sigma1(e)
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w4,w27,ror#22 // Sigma0(a)
+ eor w2,w2,w11,lsr#10 // sigma1(X[i+14])
+ add w13,w13,w6
+ add w22,w22,w26 // d+=h
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w13,w13,w3
+ add w26,w26,w17 // h+=Sigma0(a)
+ add w13,w13,w2
+ ldr w2,[sp,#12]
+ str w5,[sp,#8]
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ ror w4,w15,#7
+ and w17,w23,w22
+ ror w3,w12,#17
+ bic w19,w24,w22
+ ror w5,w26,#2
+ add w25,w25,w13 // h+=X[i]
+ eor w16,w16,w22,ror#11
+ eor w4,w4,w15,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w22,ror#25 // Sigma1(e)
+ eor w5,w5,w26,ror#13
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w3,w3,w12,ror#19
+ eor w4,w4,w15,lsr#3 // sigma0(X[i+1])
+ add w25,w25,w16 // h+=Sigma1(e)
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w5,w26,ror#22 // Sigma0(a)
+ eor w3,w3,w12,lsr#10 // sigma1(X[i+14])
+ add w14,w14,w7
+ add w21,w21,w25 // d+=h
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w14,w14,w4
+ add w25,w25,w17 // h+=Sigma0(a)
+ add w14,w14,w3
+ ldr w3,[sp,#0]
+ str w6,[sp,#12]
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ ror w5,w0,#7
+ and w17,w22,w21
+ ror w4,w13,#17
+ bic w28,w23,w21
+ ror w6,w25,#2
+ add w24,w24,w14 // h+=X[i]
+ eor w16,w16,w21,ror#11
+ eor w5,w5,w0,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w21,ror#25 // Sigma1(e)
+ eor w6,w6,w25,ror#13
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w4,w4,w13,ror#19
+ eor w5,w5,w0,lsr#3 // sigma0(X[i+1])
+ add w24,w24,w16 // h+=Sigma1(e)
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w6,w25,ror#22 // Sigma0(a)
+ eor w4,w4,w13,lsr#10 // sigma1(X[i+14])
+ add w15,w15,w8
+ add w20,w20,w24 // d+=h
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w15,w15,w5
+ add w24,w24,w17 // h+=Sigma0(a)
+ add w15,w15,w4
+ ldr w4,[sp,#4]
+ str w7,[sp,#0]
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ ror w6,w1,#7
+ and w17,w21,w20
+ ror w5,w14,#17
+ bic w19,w22,w20
+ ror w7,w24,#2
+ add w23,w23,w15 // h+=X[i]
+ eor w16,w16,w20,ror#11
+ eor w6,w6,w1,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w20,ror#25 // Sigma1(e)
+ eor w7,w7,w24,ror#13
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w5,w5,w14,ror#19
+ eor w6,w6,w1,lsr#3 // sigma0(X[i+1])
+ add w23,w23,w16 // h+=Sigma1(e)
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w7,w24,ror#22 // Sigma0(a)
+ eor w5,w5,w14,lsr#10 // sigma1(X[i+14])
+ add w0,w0,w9
+ add w27,w27,w23 // d+=h
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w0,w0,w6
+ add w23,w23,w17 // h+=Sigma0(a)
+ add w0,w0,w5
+ ldr w5,[sp,#8]
+ str w8,[sp,#4]
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ ror w7,w2,#7
+ and w17,w20,w27
+ ror w6,w15,#17
+ bic w28,w21,w27
+ ror w8,w23,#2
+ add w22,w22,w0 // h+=X[i]
+ eor w16,w16,w27,ror#11
+ eor w7,w7,w2,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w27,ror#25 // Sigma1(e)
+ eor w8,w8,w23,ror#13
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w6,w6,w15,ror#19
+ eor w7,w7,w2,lsr#3 // sigma0(X[i+1])
+ add w22,w22,w16 // h+=Sigma1(e)
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w8,w23,ror#22 // Sigma0(a)
+ eor w6,w6,w15,lsr#10 // sigma1(X[i+14])
+ add w1,w1,w10
+ add w26,w26,w22 // d+=h
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w1,w1,w7
+ add w22,w22,w17 // h+=Sigma0(a)
+ add w1,w1,w6
+ ldr w6,[sp,#12]
+ str w9,[sp,#8]
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ ror w8,w3,#7
+ and w17,w27,w26
+ ror w7,w0,#17
+ bic w19,w20,w26
+ ror w9,w22,#2
+ add w21,w21,w1 // h+=X[i]
+ eor w16,w16,w26,ror#11
+ eor w8,w8,w3,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w26,ror#25 // Sigma1(e)
+ eor w9,w9,w22,ror#13
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w7,w7,w0,ror#19
+ eor w8,w8,w3,lsr#3 // sigma0(X[i+1])
+ add w21,w21,w16 // h+=Sigma1(e)
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w9,w22,ror#22 // Sigma0(a)
+ eor w7,w7,w0,lsr#10 // sigma1(X[i+14])
+ add w2,w2,w11
+ add w25,w25,w21 // d+=h
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w2,w2,w8
+ add w21,w21,w17 // h+=Sigma0(a)
+ add w2,w2,w7
+ ldr w7,[sp,#0]
+ str w10,[sp,#12]
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ ror w9,w4,#7
+ and w17,w26,w25
+ ror w8,w1,#17
+ bic w28,w27,w25
+ ror w10,w21,#2
+ add w20,w20,w2 // h+=X[i]
+ eor w16,w16,w25,ror#11
+ eor w9,w9,w4,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w25,ror#25 // Sigma1(e)
+ eor w10,w10,w21,ror#13
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w8,w8,w1,ror#19
+ eor w9,w9,w4,lsr#3 // sigma0(X[i+1])
+ add w20,w20,w16 // h+=Sigma1(e)
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w10,w21,ror#22 // Sigma0(a)
+ eor w8,w8,w1,lsr#10 // sigma1(X[i+14])
+ add w3,w3,w12
+ add w24,w24,w20 // d+=h
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w3,w3,w9
+ add w20,w20,w17 // h+=Sigma0(a)
+ add w3,w3,w8
+ cbnz w19,.Loop_16_xx
+
+ ldp x0,x2,[x29,#96]
+ ldr x1,[x29,#112]
+ sub x30,x30,#260 // rewind
+
+ ldp w3,w4,[x0]
+ ldp w5,w6,[x0,#2*4]
+ add x1,x1,#14*4 // advance input pointer
+ ldp w7,w8,[x0,#4*4]
+ add w20,w20,w3
+ ldp w9,w10,[x0,#6*4]
+ add w21,w21,w4
+ add w22,w22,w5
+ add w23,w23,w6
+ stp w20,w21,[x0]
+ add w24,w24,w7
+ add w25,w25,w8
+ stp w22,w23,[x0,#2*4]
+ add w26,w26,w9
+ add w27,w27,w10
+ cmp x1,x2
+ stp w24,w25,[x0,#4*4]
+ stp w26,w27,[x0,#6*4]
+ b.ne .Loop
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#4*4
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw
+
+.section .rodata
+.align 6
+.type .LK256,%object
+.LK256:
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long 0 //terminator
+.size .LK256,.-.LK256
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+.text
+#ifndef __KERNEL__
+.globl sha256_block_data_order_hw
+.hidden sha256_block_data_order_hw
+.type sha256_block_data_order_hw,%function
+.align 6
+sha256_block_data_order_hw:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v0.4s,v1.4s},[x0]
+ adrp x3,.LK256
+ add x3,x3,:lo12:.LK256
+
+.Loop_hw:
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+ sub x2,x2,#1
+ ld1 {v16.4s},[x3],#16
+ rev32 v4.16b,v4.16b
+ rev32 v5.16b,v5.16b
+ rev32 v6.16b,v6.16b
+ rev32 v7.16b,v7.16b
+ orr v18.16b,v0.16b,v0.16b // offload
+ orr v19.16b,v1.16b,v1.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s
+.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s
+.inst 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s
+.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s
+.inst 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s
+.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s
+.inst 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+
+ ld1 {v17.4s},[x3]
+ add v16.4s,v16.4s,v6.4s
+ sub x3,x3,#64*4-16 // rewind
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+
+ add v17.4s,v17.4s,v7.4s
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+
+ add v0.4s,v0.4s,v18.4s
+ add v1.4s,v1.4s,v19.4s
+
+ cbnz x2,.Loop_hw
+
+ st1 {v0.4s,v1.4s},[x0]
+
+ ldr x29,[sp],#16
+ ret
+.size sha256_block_data_order_hw,.-sha256_block_data_order_hw
+#endif
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/sha256-armv8-win.S b/gen/bcm/sha256-armv8-win.S
new file mode 100644
index 0000000..89d3944
--- /dev/null
+++ b/gen/bcm/sha256-armv8-win.S
@@ -0,0 +1,1197 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+// SHA256-hw SHA256(*) SHA512
+// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
+// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
+// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
+// Denver 2.01 10.5 (+26%) 6.70 (+8%)
+// X-Gene 20.0 (+100%) 12.8 (+300%(***))
+// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
+// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
+//
+// (*) Software SHA256 results are of lesser relevance, presented
+// mostly for informational purposes.
+// (**) The result is a trade-off: it's possible to improve it by
+// 10% (or by 1 cycle per round), but at the cost of 20% loss
+// on Cortex-A53 (or by 4 cycles per round).
+// (***) Super-impressive coefficients over gcc-generated code are
+// indication of some compiler "pathology", most notably code
+// generated with -mgeneral-regs-only is significantly faster
+// and the gap is only 40-90%.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+#endif
+
+.text
+
+.globl sha256_block_data_order_nohw
+
+.def sha256_block_data_order_nohw
+ .type 32
+.endef
+.align 6
+sha256_block_data_order_nohw:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#4*4
+
+ ldp w20,w21,[x0] // load context
+ ldp w22,w23,[x0,#2*4]
+ ldp w24,w25,[x0,#4*4]
+ add x2,x1,x2,lsl#6 // end of input
+ ldp w26,w27,[x0,#6*4]
+ adrp x30,LK256
+ add x30,x30,:lo12:LK256
+ stp x0,x2,[x29,#96]
+
+Loop:
+ ldp w3,w4,[x1],#2*4
+ ldr w19,[x30],#4 // *K++
+ eor w28,w21,w22 // magic seed
+ str x1,[x29,#112]
+#ifndef __AARCH64EB__
+ rev w3,w3 // 0
+#endif
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ eor w6,w24,w24,ror#14
+ and w17,w25,w24
+ bic w19,w26,w24
+ add w27,w27,w3 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w6,ror#11 // Sigma1(e)
+ ror w6,w20,#2
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ eor w17,w20,w20,ror#9
+ add w27,w27,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w23,w23,w27 // d+=h
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w6,w17,ror#13 // Sigma0(a)
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w27,w27,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w4,w4 // 1
+#endif
+ ldp w5,w6,[x1],#2*4
+ add w27,w27,w17 // h+=Sigma0(a)
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ eor w7,w23,w23,ror#14
+ and w17,w24,w23
+ bic w28,w25,w23
+ add w26,w26,w4 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w7,ror#11 // Sigma1(e)
+ ror w7,w27,#2
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ eor w17,w27,w27,ror#9
+ add w26,w26,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w22,w22,w26 // d+=h
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w7,w17,ror#13 // Sigma0(a)
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w26,w26,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w5,w5 // 2
+#endif
+ add w26,w26,w17 // h+=Sigma0(a)
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ eor w8,w22,w22,ror#14
+ and w17,w23,w22
+ bic w19,w24,w22
+ add w25,w25,w5 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w8,ror#11 // Sigma1(e)
+ ror w8,w26,#2
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ eor w17,w26,w26,ror#9
+ add w25,w25,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w21,w21,w25 // d+=h
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w8,w17,ror#13 // Sigma0(a)
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w25,w25,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w6,w6 // 3
+#endif
+ ldp w7,w8,[x1],#2*4
+ add w25,w25,w17 // h+=Sigma0(a)
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ eor w9,w21,w21,ror#14
+ and w17,w22,w21
+ bic w28,w23,w21
+ add w24,w24,w6 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w9,ror#11 // Sigma1(e)
+ ror w9,w25,#2
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ eor w17,w25,w25,ror#9
+ add w24,w24,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w20,w20,w24 // d+=h
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w9,w17,ror#13 // Sigma0(a)
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w24,w24,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w7,w7 // 4
+#endif
+ add w24,w24,w17 // h+=Sigma0(a)
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ eor w10,w20,w20,ror#14
+ and w17,w21,w20
+ bic w19,w22,w20
+ add w23,w23,w7 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w10,ror#11 // Sigma1(e)
+ ror w10,w24,#2
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ eor w17,w24,w24,ror#9
+ add w23,w23,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w27,w27,w23 // d+=h
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w10,w17,ror#13 // Sigma0(a)
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w23,w23,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w8,w8 // 5
+#endif
+ ldp w9,w10,[x1],#2*4
+ add w23,w23,w17 // h+=Sigma0(a)
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ eor w11,w27,w27,ror#14
+ and w17,w20,w27
+ bic w28,w21,w27
+ add w22,w22,w8 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w11,ror#11 // Sigma1(e)
+ ror w11,w23,#2
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ eor w17,w23,w23,ror#9
+ add w22,w22,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w26,w26,w22 // d+=h
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w11,w17,ror#13 // Sigma0(a)
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w22,w22,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w9,w9 // 6
+#endif
+ add w22,w22,w17 // h+=Sigma0(a)
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ eor w12,w26,w26,ror#14
+ and w17,w27,w26
+ bic w19,w20,w26
+ add w21,w21,w9 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w12,ror#11 // Sigma1(e)
+ ror w12,w22,#2
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ eor w17,w22,w22,ror#9
+ add w21,w21,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w25,w25,w21 // d+=h
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w12,w17,ror#13 // Sigma0(a)
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w21,w21,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w10,w10 // 7
+#endif
+ ldp w11,w12,[x1],#2*4
+ add w21,w21,w17 // h+=Sigma0(a)
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ eor w13,w25,w25,ror#14
+ and w17,w26,w25
+ bic w28,w27,w25
+ add w20,w20,w10 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w13,ror#11 // Sigma1(e)
+ ror w13,w21,#2
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ eor w17,w21,w21,ror#9
+ add w20,w20,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w24,w24,w20 // d+=h
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w13,w17,ror#13 // Sigma0(a)
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w20,w20,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w11,w11 // 8
+#endif
+ add w20,w20,w17 // h+=Sigma0(a)
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ eor w14,w24,w24,ror#14
+ and w17,w25,w24
+ bic w19,w26,w24
+ add w27,w27,w11 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w14,ror#11 // Sigma1(e)
+ ror w14,w20,#2
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ eor w17,w20,w20,ror#9
+ add w27,w27,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w23,w23,w27 // d+=h
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w14,w17,ror#13 // Sigma0(a)
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w27,w27,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w12,w12 // 9
+#endif
+ ldp w13,w14,[x1],#2*4
+ add w27,w27,w17 // h+=Sigma0(a)
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ eor w15,w23,w23,ror#14
+ and w17,w24,w23
+ bic w28,w25,w23
+ add w26,w26,w12 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w15,ror#11 // Sigma1(e)
+ ror w15,w27,#2
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ eor w17,w27,w27,ror#9
+ add w26,w26,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w22,w22,w26 // d+=h
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w15,w17,ror#13 // Sigma0(a)
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w26,w26,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w13,w13 // 10
+#endif
+ add w26,w26,w17 // h+=Sigma0(a)
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ eor w0,w22,w22,ror#14
+ and w17,w23,w22
+ bic w19,w24,w22
+ add w25,w25,w13 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w0,ror#11 // Sigma1(e)
+ ror w0,w26,#2
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ eor w17,w26,w26,ror#9
+ add w25,w25,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w21,w21,w25 // d+=h
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w0,w17,ror#13 // Sigma0(a)
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w25,w25,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w14,w14 // 11
+#endif
+ ldp w15,w0,[x1],#2*4
+ add w25,w25,w17 // h+=Sigma0(a)
+ str w6,[sp,#12]
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ eor w6,w21,w21,ror#14
+ and w17,w22,w21
+ bic w28,w23,w21
+ add w24,w24,w14 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w6,ror#11 // Sigma1(e)
+ ror w6,w25,#2
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ eor w17,w25,w25,ror#9
+ add w24,w24,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w20,w20,w24 // d+=h
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w6,w17,ror#13 // Sigma0(a)
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w24,w24,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w15,w15 // 12
+#endif
+ add w24,w24,w17 // h+=Sigma0(a)
+ str w7,[sp,#0]
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ eor w7,w20,w20,ror#14
+ and w17,w21,w20
+ bic w19,w22,w20
+ add w23,w23,w15 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w7,ror#11 // Sigma1(e)
+ ror w7,w24,#2
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ eor w17,w24,w24,ror#9
+ add w23,w23,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w27,w27,w23 // d+=h
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w7,w17,ror#13 // Sigma0(a)
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w23,w23,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w0,w0 // 13
+#endif
+ ldp w1,w2,[x1]
+ add w23,w23,w17 // h+=Sigma0(a)
+ str w8,[sp,#4]
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ eor w8,w27,w27,ror#14
+ and w17,w20,w27
+ bic w28,w21,w27
+ add w22,w22,w0 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w8,ror#11 // Sigma1(e)
+ ror w8,w23,#2
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ eor w17,w23,w23,ror#9
+ add w22,w22,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w26,w26,w22 // d+=h
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w8,w17,ror#13 // Sigma0(a)
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w22,w22,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w1,w1 // 14
+#endif
+ ldr w6,[sp,#12]
+ add w22,w22,w17 // h+=Sigma0(a)
+ str w9,[sp,#8]
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ eor w9,w26,w26,ror#14
+ and w17,w27,w26
+ bic w19,w20,w26
+ add w21,w21,w1 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w9,ror#11 // Sigma1(e)
+ ror w9,w22,#2
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ eor w17,w22,w22,ror#9
+ add w21,w21,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w25,w25,w21 // d+=h
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w9,w17,ror#13 // Sigma0(a)
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w21,w21,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w2,w2 // 15
+#endif
+ ldr w7,[sp,#0]
+ add w21,w21,w17 // h+=Sigma0(a)
+ str w10,[sp,#12]
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ ror w9,w4,#7
+ and w17,w26,w25
+ ror w8,w1,#17
+ bic w28,w27,w25
+ ror w10,w21,#2
+ add w20,w20,w2 // h+=X[i]
+ eor w16,w16,w25,ror#11
+ eor w9,w9,w4,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w25,ror#25 // Sigma1(e)
+ eor w10,w10,w21,ror#13
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w8,w8,w1,ror#19
+ eor w9,w9,w4,lsr#3 // sigma0(X[i+1])
+ add w20,w20,w16 // h+=Sigma1(e)
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w10,w21,ror#22 // Sigma0(a)
+ eor w8,w8,w1,lsr#10 // sigma1(X[i+14])
+ add w3,w3,w12
+ add w24,w24,w20 // d+=h
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w3,w3,w9
+ add w20,w20,w17 // h+=Sigma0(a)
+ add w3,w3,w8
+Loop_16_xx:
+ ldr w8,[sp,#4]
+ str w11,[sp,#0]
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ ror w10,w5,#7
+ and w17,w25,w24
+ ror w9,w2,#17
+ bic w19,w26,w24
+ ror w11,w20,#2
+ add w27,w27,w3 // h+=X[i]
+ eor w16,w16,w24,ror#11
+ eor w10,w10,w5,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w24,ror#25 // Sigma1(e)
+ eor w11,w11,w20,ror#13
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w9,w9,w2,ror#19
+ eor w10,w10,w5,lsr#3 // sigma0(X[i+1])
+ add w27,w27,w16 // h+=Sigma1(e)
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w11,w20,ror#22 // Sigma0(a)
+ eor w9,w9,w2,lsr#10 // sigma1(X[i+14])
+ add w4,w4,w13
+ add w23,w23,w27 // d+=h
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w4,w4,w10
+ add w27,w27,w17 // h+=Sigma0(a)
+ add w4,w4,w9
+ ldr w9,[sp,#8]
+ str w12,[sp,#4]
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ ror w11,w6,#7
+ and w17,w24,w23
+ ror w10,w3,#17
+ bic w28,w25,w23
+ ror w12,w27,#2
+ add w26,w26,w4 // h+=X[i]
+ eor w16,w16,w23,ror#11
+ eor w11,w11,w6,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w23,ror#25 // Sigma1(e)
+ eor w12,w12,w27,ror#13
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w10,w10,w3,ror#19
+ eor w11,w11,w6,lsr#3 // sigma0(X[i+1])
+ add w26,w26,w16 // h+=Sigma1(e)
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w12,w27,ror#22 // Sigma0(a)
+ eor w10,w10,w3,lsr#10 // sigma1(X[i+14])
+ add w5,w5,w14
+ add w22,w22,w26 // d+=h
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w5,w5,w11
+ add w26,w26,w17 // h+=Sigma0(a)
+ add w5,w5,w10
+ ldr w10,[sp,#12]
+ str w13,[sp,#8]
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ ror w12,w7,#7
+ and w17,w23,w22
+ ror w11,w4,#17
+ bic w19,w24,w22
+ ror w13,w26,#2
+ add w25,w25,w5 // h+=X[i]
+ eor w16,w16,w22,ror#11
+ eor w12,w12,w7,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w22,ror#25 // Sigma1(e)
+ eor w13,w13,w26,ror#13
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w11,w11,w4,ror#19
+ eor w12,w12,w7,lsr#3 // sigma0(X[i+1])
+ add w25,w25,w16 // h+=Sigma1(e)
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w13,w26,ror#22 // Sigma0(a)
+ eor w11,w11,w4,lsr#10 // sigma1(X[i+14])
+ add w6,w6,w15
+ add w21,w21,w25 // d+=h
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w6,w6,w12
+ add w25,w25,w17 // h+=Sigma0(a)
+ add w6,w6,w11
+ ldr w11,[sp,#0]
+ str w14,[sp,#12]
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ ror w13,w8,#7
+ and w17,w22,w21
+ ror w12,w5,#17
+ bic w28,w23,w21
+ ror w14,w25,#2
+ add w24,w24,w6 // h+=X[i]
+ eor w16,w16,w21,ror#11
+ eor w13,w13,w8,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w21,ror#25 // Sigma1(e)
+ eor w14,w14,w25,ror#13
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w12,w12,w5,ror#19
+ eor w13,w13,w8,lsr#3 // sigma0(X[i+1])
+ add w24,w24,w16 // h+=Sigma1(e)
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w14,w25,ror#22 // Sigma0(a)
+ eor w12,w12,w5,lsr#10 // sigma1(X[i+14])
+ add w7,w7,w0
+ add w20,w20,w24 // d+=h
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w7,w7,w13
+ add w24,w24,w17 // h+=Sigma0(a)
+ add w7,w7,w12
+ ldr w12,[sp,#4]
+ str w15,[sp,#0]
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ ror w14,w9,#7
+ and w17,w21,w20
+ ror w13,w6,#17
+ bic w19,w22,w20
+ ror w15,w24,#2
+ add w23,w23,w7 // h+=X[i]
+ eor w16,w16,w20,ror#11
+ eor w14,w14,w9,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w20,ror#25 // Sigma1(e)
+ eor w15,w15,w24,ror#13
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w13,w13,w6,ror#19
+ eor w14,w14,w9,lsr#3 // sigma0(X[i+1])
+ add w23,w23,w16 // h+=Sigma1(e)
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w15,w24,ror#22 // Sigma0(a)
+ eor w13,w13,w6,lsr#10 // sigma1(X[i+14])
+ add w8,w8,w1
+ add w27,w27,w23 // d+=h
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w8,w8,w14
+ add w23,w23,w17 // h+=Sigma0(a)
+ add w8,w8,w13
+ ldr w13,[sp,#8]
+ str w0,[sp,#4]
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ ror w15,w10,#7
+ and w17,w20,w27
+ ror w14,w7,#17
+ bic w28,w21,w27
+ ror w0,w23,#2
+ add w22,w22,w8 // h+=X[i]
+ eor w16,w16,w27,ror#11
+ eor w15,w15,w10,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w27,ror#25 // Sigma1(e)
+ eor w0,w0,w23,ror#13
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w14,w14,w7,ror#19
+ eor w15,w15,w10,lsr#3 // sigma0(X[i+1])
+ add w22,w22,w16 // h+=Sigma1(e)
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w0,w23,ror#22 // Sigma0(a)
+ eor w14,w14,w7,lsr#10 // sigma1(X[i+14])
+ add w9,w9,w2
+ add w26,w26,w22 // d+=h
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w9,w9,w15
+ add w22,w22,w17 // h+=Sigma0(a)
+ add w9,w9,w14
+ ldr w14,[sp,#12]
+ str w1,[sp,#8]
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ ror w0,w11,#7
+ and w17,w27,w26
+ ror w15,w8,#17
+ bic w19,w20,w26
+ ror w1,w22,#2
+ add w21,w21,w9 // h+=X[i]
+ eor w16,w16,w26,ror#11
+ eor w0,w0,w11,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w26,ror#25 // Sigma1(e)
+ eor w1,w1,w22,ror#13
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w15,w15,w8,ror#19
+ eor w0,w0,w11,lsr#3 // sigma0(X[i+1])
+ add w21,w21,w16 // h+=Sigma1(e)
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w1,w22,ror#22 // Sigma0(a)
+ eor w15,w15,w8,lsr#10 // sigma1(X[i+14])
+ add w10,w10,w3
+ add w25,w25,w21 // d+=h
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w10,w10,w0
+ add w21,w21,w17 // h+=Sigma0(a)
+ add w10,w10,w15
+ ldr w15,[sp,#0]
+ str w2,[sp,#12]
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ ror w1,w12,#7
+ and w17,w26,w25
+ ror w0,w9,#17
+ bic w28,w27,w25
+ ror w2,w21,#2
+ add w20,w20,w10 // h+=X[i]
+ eor w16,w16,w25,ror#11
+ eor w1,w1,w12,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w25,ror#25 // Sigma1(e)
+ eor w2,w2,w21,ror#13
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w0,w0,w9,ror#19
+ eor w1,w1,w12,lsr#3 // sigma0(X[i+1])
+ add w20,w20,w16 // h+=Sigma1(e)
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w2,w21,ror#22 // Sigma0(a)
+ eor w0,w0,w9,lsr#10 // sigma1(X[i+14])
+ add w11,w11,w4
+ add w24,w24,w20 // d+=h
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w11,w11,w1
+ add w20,w20,w17 // h+=Sigma0(a)
+ add w11,w11,w0
+ ldr w0,[sp,#4]
+ str w3,[sp,#0]
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ ror w2,w13,#7
+ and w17,w25,w24
+ ror w1,w10,#17
+ bic w19,w26,w24
+ ror w3,w20,#2
+ add w27,w27,w11 // h+=X[i]
+ eor w16,w16,w24,ror#11
+ eor w2,w2,w13,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w24,ror#25 // Sigma1(e)
+ eor w3,w3,w20,ror#13
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w1,w1,w10,ror#19
+ eor w2,w2,w13,lsr#3 // sigma0(X[i+1])
+ add w27,w27,w16 // h+=Sigma1(e)
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w3,w20,ror#22 // Sigma0(a)
+ eor w1,w1,w10,lsr#10 // sigma1(X[i+14])
+ add w12,w12,w5
+ add w23,w23,w27 // d+=h
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w12,w12,w2
+ add w27,w27,w17 // h+=Sigma0(a)
+ add w12,w12,w1
+ ldr w1,[sp,#8]
+ str w4,[sp,#4]
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ ror w3,w14,#7
+ and w17,w24,w23
+ ror w2,w11,#17
+ bic w28,w25,w23
+ ror w4,w27,#2
+ add w26,w26,w12 // h+=X[i]
+ eor w16,w16,w23,ror#11
+ eor w3,w3,w14,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w23,ror#25 // Sigma1(e)
+ eor w4,w4,w27,ror#13
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w2,w2,w11,ror#19
+ eor w3,w3,w14,lsr#3 // sigma0(X[i+1])
+ add w26,w26,w16 // h+=Sigma1(e)
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w4,w27,ror#22 // Sigma0(a)
+ eor w2,w2,w11,lsr#10 // sigma1(X[i+14])
+ add w13,w13,w6
+ add w22,w22,w26 // d+=h
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w13,w13,w3
+ add w26,w26,w17 // h+=Sigma0(a)
+ add w13,w13,w2
+ ldr w2,[sp,#12]
+ str w5,[sp,#8]
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ ror w4,w15,#7
+ and w17,w23,w22
+ ror w3,w12,#17
+ bic w19,w24,w22
+ ror w5,w26,#2
+ add w25,w25,w13 // h+=X[i]
+ eor w16,w16,w22,ror#11
+ eor w4,w4,w15,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w22,ror#25 // Sigma1(e)
+ eor w5,w5,w26,ror#13
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w3,w3,w12,ror#19
+ eor w4,w4,w15,lsr#3 // sigma0(X[i+1])
+ add w25,w25,w16 // h+=Sigma1(e)
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w5,w26,ror#22 // Sigma0(a)
+ eor w3,w3,w12,lsr#10 // sigma1(X[i+14])
+ add w14,w14,w7
+ add w21,w21,w25 // d+=h
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w14,w14,w4
+ add w25,w25,w17 // h+=Sigma0(a)
+ add w14,w14,w3
+ ldr w3,[sp,#0]
+ str w6,[sp,#12]
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ ror w5,w0,#7
+ and w17,w22,w21
+ ror w4,w13,#17
+ bic w28,w23,w21
+ ror w6,w25,#2
+ add w24,w24,w14 // h+=X[i]
+ eor w16,w16,w21,ror#11
+ eor w5,w5,w0,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w21,ror#25 // Sigma1(e)
+ eor w6,w6,w25,ror#13
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w4,w4,w13,ror#19
+ eor w5,w5,w0,lsr#3 // sigma0(X[i+1])
+ add w24,w24,w16 // h+=Sigma1(e)
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w6,w25,ror#22 // Sigma0(a)
+ eor w4,w4,w13,lsr#10 // sigma1(X[i+14])
+ add w15,w15,w8
+ add w20,w20,w24 // d+=h
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w15,w15,w5
+ add w24,w24,w17 // h+=Sigma0(a)
+ add w15,w15,w4
+ ldr w4,[sp,#4]
+ str w7,[sp,#0]
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ ror w6,w1,#7
+ and w17,w21,w20
+ ror w5,w14,#17
+ bic w19,w22,w20
+ ror w7,w24,#2
+ add w23,w23,w15 // h+=X[i]
+ eor w16,w16,w20,ror#11
+ eor w6,w6,w1,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w20,ror#25 // Sigma1(e)
+ eor w7,w7,w24,ror#13
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w5,w5,w14,ror#19
+ eor w6,w6,w1,lsr#3 // sigma0(X[i+1])
+ add w23,w23,w16 // h+=Sigma1(e)
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w7,w24,ror#22 // Sigma0(a)
+ eor w5,w5,w14,lsr#10 // sigma1(X[i+14])
+ add w0,w0,w9
+ add w27,w27,w23 // d+=h
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w0,w0,w6
+ add w23,w23,w17 // h+=Sigma0(a)
+ add w0,w0,w5
+ ldr w5,[sp,#8]
+ str w8,[sp,#4]
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ ror w7,w2,#7
+ and w17,w20,w27
+ ror w6,w15,#17
+ bic w28,w21,w27
+ ror w8,w23,#2
+ add w22,w22,w0 // h+=X[i]
+ eor w16,w16,w27,ror#11
+ eor w7,w7,w2,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w27,ror#25 // Sigma1(e)
+ eor w8,w8,w23,ror#13
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w6,w6,w15,ror#19
+ eor w7,w7,w2,lsr#3 // sigma0(X[i+1])
+ add w22,w22,w16 // h+=Sigma1(e)
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w8,w23,ror#22 // Sigma0(a)
+ eor w6,w6,w15,lsr#10 // sigma1(X[i+14])
+ add w1,w1,w10
+ add w26,w26,w22 // d+=h
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w1,w1,w7
+ add w22,w22,w17 // h+=Sigma0(a)
+ add w1,w1,w6
+ ldr w6,[sp,#12]
+ str w9,[sp,#8]
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ ror w8,w3,#7
+ and w17,w27,w26
+ ror w7,w0,#17
+ bic w19,w20,w26
+ ror w9,w22,#2
+ add w21,w21,w1 // h+=X[i]
+ eor w16,w16,w26,ror#11
+ eor w8,w8,w3,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w26,ror#25 // Sigma1(e)
+ eor w9,w9,w22,ror#13
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w7,w7,w0,ror#19
+ eor w8,w8,w3,lsr#3 // sigma0(X[i+1])
+ add w21,w21,w16 // h+=Sigma1(e)
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w9,w22,ror#22 // Sigma0(a)
+ eor w7,w7,w0,lsr#10 // sigma1(X[i+14])
+ add w2,w2,w11
+ add w25,w25,w21 // d+=h
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w2,w2,w8
+ add w21,w21,w17 // h+=Sigma0(a)
+ add w2,w2,w7
+ ldr w7,[sp,#0]
+ str w10,[sp,#12]
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ ror w9,w4,#7
+ and w17,w26,w25
+ ror w8,w1,#17
+ bic w28,w27,w25
+ ror w10,w21,#2
+ add w20,w20,w2 // h+=X[i]
+ eor w16,w16,w25,ror#11
+ eor w9,w9,w4,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w25,ror#25 // Sigma1(e)
+ eor w10,w10,w21,ror#13
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w8,w8,w1,ror#19
+ eor w9,w9,w4,lsr#3 // sigma0(X[i+1])
+ add w20,w20,w16 // h+=Sigma1(e)
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w10,w21,ror#22 // Sigma0(a)
+ eor w8,w8,w1,lsr#10 // sigma1(X[i+14])
+ add w3,w3,w12
+ add w24,w24,w20 // d+=h
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w3,w3,w9
+ add w20,w20,w17 // h+=Sigma0(a)
+ add w3,w3,w8
+ cbnz w19,Loop_16_xx
+
+ ldp x0,x2,[x29,#96]
+ ldr x1,[x29,#112]
+ sub x30,x30,#260 // rewind
+
+ ldp w3,w4,[x0]
+ ldp w5,w6,[x0,#2*4]
+ add x1,x1,#14*4 // advance input pointer
+ ldp w7,w8,[x0,#4*4]
+ add w20,w20,w3
+ ldp w9,w10,[x0,#6*4]
+ add w21,w21,w4
+ add w22,w22,w5
+ add w23,w23,w6
+ stp w20,w21,[x0]
+ add w24,w24,w7
+ add w25,w25,w8
+ stp w22,w23,[x0,#2*4]
+ add w26,w26,w9
+ add w27,w27,w10
+ cmp x1,x2
+ stp w24,w25,[x0,#4*4]
+ stp w26,w27,[x0,#6*4]
+ b.ne Loop
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#4*4
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.section .rodata
+.align 6
+
+LK256:
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long 0 //terminator
+
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+.text
+#ifndef __KERNEL__
+.globl sha256_block_data_order_hw
+
+.def sha256_block_data_order_hw
+ .type 32
+.endef
+.align 6
+sha256_block_data_order_hw:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v0.4s,v1.4s},[x0]
+ adrp x3,LK256
+ add x3,x3,:lo12:LK256
+
+Loop_hw:
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+ sub x2,x2,#1
+ ld1 {v16.4s},[x3],#16
+ rev32 v4.16b,v4.16b
+ rev32 v5.16b,v5.16b
+ rev32 v6.16b,v6.16b
+ rev32 v7.16b,v7.16b
+ orr v18.16b,v0.16b,v0.16b // offload
+ orr v19.16b,v1.16b,v1.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s
+.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s
+.long 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s
+.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s
+.long 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s
+.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s
+.long 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+
+ ld1 {v17.4s},[x3]
+ add v16.4s,v16.4s,v6.4s
+ sub x3,x3,#64*4-16 // rewind
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+
+ add v17.4s,v17.4s,v7.4s
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+
+ add v0.4s,v0.4s,v18.4s
+ add v1.4s,v1.4s,v19.4s
+
+ cbnz x2,Loop_hw
+
+ st1 {v0.4s,v1.4s},[x0]
+
+ ldr x29,[sp],#16
+ ret
+
+#endif
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/sha256-x86_64-apple.S b/gen/bcm/sha256-x86_64-apple.S
new file mode 100644
index 0000000..b33f807
--- /dev/null
+++ b/gen/bcm/sha256-x86_64-apple.S
@@ -0,0 +1,4170 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text
+
+.globl _sha256_block_data_order_nohw
+.private_extern _sha256_block_data_order_nohw
+
+.p2align 4
+_sha256_block_data_order_nohw:
+
+_CET_ENDBR
+ movq %rsp,%rax
+
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ shlq $4,%rdx
+ subq $64+32,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %rax,88(%rsp)
+
+L$prologue:
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+ jmp L$loop
+
+.p2align 4
+L$loop:
+ movl %ebx,%edi
+ leaq K256(%rip),%rbp
+ xorl %ecx,%edi
+ movl 0(%rsi),%r12d
+ movl %r8d,%r13d
+ movl %eax,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+
+ movl %r12d,0(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r11d
+ movl 4(%rsi),%r12d
+ movl %edx,%r13d
+ movl %r11d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r8d,%edi
+
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+
+ movl %r12d,4(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r10d
+ movl 8(%rsi),%r12d
+ movl %ecx,%r13d
+ movl %r10d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %edx,%r15d
+
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+
+ movl %r12d,8(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r9d
+ movl 12(%rsi),%r12d
+ movl %ebx,%r13d
+ movl %r9d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ecx,%edi
+
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+
+ movl %r12d,12(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+
+ leaq 20(%rbp),%rbp
+ addl %r14d,%r8d
+ movl 16(%rsi),%r12d
+ movl %eax,%r13d
+ movl %r8d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+
+ movl %r12d,16(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%edx
+ movl 20(%rsi),%r12d
+ movl %r11d,%r13d
+ movl %edx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %eax,%edi
+
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+
+ movl %r12d,20(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ecx
+ movl 24(%rsi),%r12d
+ movl %r10d,%r13d
+ movl %ecx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+
+ movl %r12d,24(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ebx
+ movl 28(%rsi),%r12d
+ movl %r9d,%r13d
+ movl %ebx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r10d,%edi
+
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+
+ movl %r12d,28(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+
+ leaq 20(%rbp),%rbp
+ addl %r14d,%eax
+ movl 32(%rsi),%r12d
+ movl %r8d,%r13d
+ movl %eax,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+
+ movl %r12d,32(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r11d
+ movl 36(%rsi),%r12d
+ movl %edx,%r13d
+ movl %r11d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r8d,%edi
+
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+
+ movl %r12d,36(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r10d
+ movl 40(%rsi),%r12d
+ movl %ecx,%r13d
+ movl %r10d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %edx,%r15d
+
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+
+ movl %r12d,40(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r9d
+ movl 44(%rsi),%r12d
+ movl %ebx,%r13d
+ movl %r9d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ecx,%edi
+
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+
+ movl %r12d,44(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+
+ leaq 20(%rbp),%rbp
+ addl %r14d,%r8d
+ movl 48(%rsi),%r12d
+ movl %eax,%r13d
+ movl %r8d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+
+ movl %r12d,48(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%edx
+ movl 52(%rsi),%r12d
+ movl %r11d,%r13d
+ movl %edx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %eax,%edi
+
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+
+ movl %r12d,52(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ecx
+ movl 56(%rsi),%r12d
+ movl %r10d,%r13d
+ movl %ecx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+
+ movl %r12d,56(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ebx
+ movl 60(%rsi),%r12d
+ movl %r9d,%r13d
+ movl %ebx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r10d,%edi
+
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+
+ movl %r12d,60(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+
+ leaq 20(%rbp),%rbp
+ jmp L$rounds_16_xx
+.p2align 4
+L$rounds_16_xx:
+ movl 4(%rsp),%r13d
+ movl 56(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%eax
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 36(%rsp),%r12d
+
+ addl 0(%rsp),%r12d
+ movl %r8d,%r13d
+ addl %r15d,%r12d
+ movl %eax,%r14d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+
+ movl %r12d,0(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+
+ leaq 4(%rbp),%rbp
+ movl 8(%rsp),%r13d
+ movl 60(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r11d
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 40(%rsp),%r12d
+
+ addl 4(%rsp),%r12d
+ movl %edx,%r13d
+ addl %edi,%r12d
+ movl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r8d,%edi
+
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+
+ movl %r12d,4(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+
+ leaq 4(%rbp),%rbp
+ movl 12(%rsp),%r13d
+ movl 0(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r10d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 44(%rsp),%r12d
+
+ addl 8(%rsp),%r12d
+ movl %ecx,%r13d
+ addl %r15d,%r12d
+ movl %r10d,%r14d
+ rorl $14,%r13d
+ movl %edx,%r15d
+
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+
+ movl %r12d,8(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+
+ leaq 4(%rbp),%rbp
+ movl 16(%rsp),%r13d
+ movl 4(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r9d
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 48(%rsp),%r12d
+
+ addl 12(%rsp),%r12d
+ movl %ebx,%r13d
+ addl %edi,%r12d
+ movl %r9d,%r14d
+ rorl $14,%r13d
+ movl %ecx,%edi
+
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+
+ movl %r12d,12(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+
+ leaq 20(%rbp),%rbp
+ movl 20(%rsp),%r13d
+ movl 8(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r8d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 52(%rsp),%r12d
+
+ addl 16(%rsp),%r12d
+ movl %eax,%r13d
+ addl %r15d,%r12d
+ movl %r8d,%r14d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+
+ movl %r12d,16(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+
+ leaq 4(%rbp),%rbp
+ movl 24(%rsp),%r13d
+ movl 12(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%edx
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 56(%rsp),%r12d
+
+ addl 20(%rsp),%r12d
+ movl %r11d,%r13d
+ addl %edi,%r12d
+ movl %edx,%r14d
+ rorl $14,%r13d
+ movl %eax,%edi
+
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+
+ movl %r12d,20(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+
+ leaq 4(%rbp),%rbp
+ movl 28(%rsp),%r13d
+ movl 16(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ecx
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 60(%rsp),%r12d
+
+ addl 24(%rsp),%r12d
+ movl %r10d,%r13d
+ addl %r15d,%r12d
+ movl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+
+ movl %r12d,24(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+
+ leaq 4(%rbp),%rbp
+ movl 32(%rsp),%r13d
+ movl 20(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ebx
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 0(%rsp),%r12d
+
+ addl 28(%rsp),%r12d
+ movl %r9d,%r13d
+ addl %edi,%r12d
+ movl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r10d,%edi
+
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+
+ movl %r12d,28(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+
+ leaq 20(%rbp),%rbp
+ movl 36(%rsp),%r13d
+ movl 24(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%eax
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 4(%rsp),%r12d
+
+ addl 32(%rsp),%r12d
+ movl %r8d,%r13d
+ addl %r15d,%r12d
+ movl %eax,%r14d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+
+ movl %r12d,32(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+
+ leaq 4(%rbp),%rbp
+ movl 40(%rsp),%r13d
+ movl 28(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r11d
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 8(%rsp),%r12d
+
+ addl 36(%rsp),%r12d
+ movl %edx,%r13d
+ addl %edi,%r12d
+ movl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r8d,%edi
+
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+
+ movl %r12d,36(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+
+ leaq 4(%rbp),%rbp
+ movl 44(%rsp),%r13d
+ movl 32(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r10d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 12(%rsp),%r12d
+
+ addl 40(%rsp),%r12d
+ movl %ecx,%r13d
+ addl %r15d,%r12d
+ movl %r10d,%r14d
+ rorl $14,%r13d
+ movl %edx,%r15d
+
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+
+ movl %r12d,40(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+
+ leaq 4(%rbp),%rbp
+ movl 48(%rsp),%r13d
+ movl 36(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r9d
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 16(%rsp),%r12d
+
+ addl 44(%rsp),%r12d
+ movl %ebx,%r13d
+ addl %edi,%r12d
+ movl %r9d,%r14d
+ rorl $14,%r13d
+ movl %ecx,%edi
+
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+
+ movl %r12d,44(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+
+ leaq 20(%rbp),%rbp
+ movl 52(%rsp),%r13d
+ movl 40(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r8d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 20(%rsp),%r12d
+
+ addl 48(%rsp),%r12d
+ movl %eax,%r13d
+ addl %r15d,%r12d
+ movl %r8d,%r14d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+
+ movl %r12d,48(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+
+ leaq 4(%rbp),%rbp
+ movl 56(%rsp),%r13d
+ movl 44(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%edx
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 24(%rsp),%r12d
+
+ addl 52(%rsp),%r12d
+ movl %r11d,%r13d
+ addl %edi,%r12d
+ movl %edx,%r14d
+ rorl $14,%r13d
+ movl %eax,%edi
+
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+
+ movl %r12d,52(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+
+ leaq 4(%rbp),%rbp
+ movl 60(%rsp),%r13d
+ movl 48(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ecx
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 28(%rsp),%r12d
+
+ addl 56(%rsp),%r12d
+ movl %r10d,%r13d
+ addl %r15d,%r12d
+ movl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+
+ movl %r12d,56(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+
+ leaq 4(%rbp),%rbp
+ movl 0(%rsp),%r13d
+ movl 52(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ebx
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 32(%rsp),%r12d
+
+ addl 60(%rsp),%r12d
+ movl %r9d,%r13d
+ addl %edi,%r12d
+ movl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r10d,%edi
+
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+
+ movl %r12d,60(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+
+ leaq 20(%rbp),%rbp
+ cmpb $0,3(%rbp)
+ jnz L$rounds_16_xx
+
+ movq 64+0(%rsp),%rdi
+ addl %r14d,%eax
+ leaq 64(%rsi),%rsi
+
+ addl 0(%rdi),%eax
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb L$loop
+
+ movq 88(%rsp),%rsi
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
+L$epilogue:
+ ret
+
+
+.section __DATA,__const
+.p2align 6
+
+K256:
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.text
+.globl _sha256_block_data_order_hw
+.private_extern _sha256_block_data_order_hw
+
+.p2align 6
+_sha256_block_data_order_hw:
+
+_CET_ENDBR
+ leaq K256+128(%rip),%rcx
+ movdqu (%rdi),%xmm1
+ movdqu 16(%rdi),%xmm2
+ movdqa 512-128(%rcx),%xmm7
+
+ pshufd $0x1b,%xmm1,%xmm0
+ pshufd $0xb1,%xmm1,%xmm1
+ pshufd $0x1b,%xmm2,%xmm2
+ movdqa %xmm7,%xmm8
+.byte 102,15,58,15,202,8
+ punpcklqdq %xmm0,%xmm2
+ jmp L$oop_shaext
+
+.p2align 4
+L$oop_shaext:
+ movdqu (%rsi),%xmm3
+ movdqu 16(%rsi),%xmm4
+ movdqu 32(%rsi),%xmm5
+.byte 102,15,56,0,223
+ movdqu 48(%rsi),%xmm6
+
+ movdqa 0-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 102,15,56,0,231
+ movdqa %xmm2,%xmm10
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ nop
+ movdqa %xmm1,%xmm9
+.byte 15,56,203,202
+
+ movdqa 32-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 102,15,56,0,239
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ leaq 64(%rsi),%rsi
+.byte 15,56,204,220
+.byte 15,56,203,202
+
+ movdqa 64-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 102,15,56,0,247
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+
+ movdqa 96-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 128-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 160-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 192-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 224-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 256-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 288-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 320-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 352-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 384-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 416-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+.byte 15,56,203,202
+ paddd %xmm7,%xmm6
+
+ movdqa 448-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+.byte 15,56,205,245
+ movdqa %xmm8,%xmm7
+.byte 15,56,203,202
+
+ movdqa 480-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+ nop
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ decq %rdx
+ nop
+.byte 15,56,203,202
+
+ paddd %xmm10,%xmm2
+ paddd %xmm9,%xmm1
+ jnz L$oop_shaext
+
+ pshufd $0xb1,%xmm2,%xmm2
+ pshufd $0x1b,%xmm1,%xmm7
+ pshufd $0xb1,%xmm1,%xmm1
+ punpckhqdq %xmm2,%xmm1
+.byte 102,15,58,15,215,8
+
+ movdqu %xmm1,(%rdi)
+ movdqu %xmm2,16(%rdi)
+ ret
+
+
+.globl _sha256_block_data_order_ssse3
+.private_extern _sha256_block_data_order_ssse3
+
+.p2align 6
+_sha256_block_data_order_ssse3:
+
+_CET_ENDBR
+ movq %rsp,%rax
+
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ shlq $4,%rdx
+ subq $96,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %rax,88(%rsp)
+
+L$prologue_ssse3:
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+
+
+ jmp L$loop_ssse3
+.p2align 4
+L$loop_ssse3:
+ movdqa K256+512(%rip),%xmm7
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+.byte 102,15,56,0,199
+ movdqu 48(%rsi),%xmm3
+ leaq K256(%rip),%rbp
+.byte 102,15,56,0,207
+ movdqa 0(%rbp),%xmm4
+ movdqa 32(%rbp),%xmm5
+.byte 102,15,56,0,215
+ paddd %xmm0,%xmm4
+ movdqa 64(%rbp),%xmm6
+.byte 102,15,56,0,223
+ movdqa 96(%rbp),%xmm7
+ paddd %xmm1,%xmm5
+ paddd %xmm2,%xmm6
+ paddd %xmm3,%xmm7
+ movdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ movdqa %xmm5,16(%rsp)
+ movl %ebx,%edi
+ movdqa %xmm6,32(%rsp)
+ xorl %ecx,%edi
+ movdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp L$ssse3_00_47
+
+.p2align 4
+L$ssse3_00_47:
+ subq $-128,%rbp
+ rorl $14,%r13d
+ movdqa %xmm1,%xmm4
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ movdqa %xmm3,%xmm7
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+.byte 102,15,58,15,224,4
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+.byte 102,15,58,15,250,4
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ paddd %xmm7,%xmm0
+ rorl $2,%r14d
+ addl %r11d,%edx
+ psrld $7,%xmm6
+ addl %edi,%r11d
+ movl %edx,%r13d
+ pshufd $250,%xmm3,%xmm7
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %r11d,%r14d
+ pxor %xmm5,%xmm4
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ pslld $11,%xmm5
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ pxor %xmm6,%xmm4
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ paddd %xmm4,%xmm0
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ psrlq $17,%xmm6
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ psrldq $8,%xmm7
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ paddd %xmm7,%xmm0
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ pshufd $80,%xmm0,%xmm7
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ movdqa %xmm7,%xmm6
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ psrld $10,%xmm7
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ psrlq $2,%xmm6
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ pxor %xmm6,%xmm7
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ movdqa 0(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ paddd %xmm7,%xmm0
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ paddd %xmm0,%xmm6
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ movdqa %xmm6,0(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm2,%xmm4
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ movdqa %xmm0,%xmm7
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+.byte 102,15,58,15,225,4
+ andl %eax,%r12d
+ xorl %eax,%r13d
+.byte 102,15,58,15,251,4
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ paddd %xmm7,%xmm1
+ rorl $2,%r14d
+ addl %edx,%r11d
+ psrld $7,%xmm6
+ addl %edi,%edx
+ movl %r11d,%r13d
+ pshufd $250,%xmm0,%xmm7
+ addl %edx,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%edx
+ movl %eax,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %edx,%r14d
+ pxor %xmm5,%xmm4
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ pslld $11,%xmm5
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ pxor %xmm6,%xmm4
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ paddd %xmm4,%xmm1
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ psrlq $17,%xmm6
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ psrldq $8,%xmm7
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ paddd %xmm7,%xmm1
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ pshufd $80,%xmm1,%xmm7
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ movdqa %xmm7,%xmm6
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ psrld $10,%xmm7
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ psrlq $2,%xmm6
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ pxor %xmm6,%xmm7
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ movdqa 32(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ paddd %xmm7,%xmm1
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ paddd %xmm1,%xmm6
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movdqa %xmm6,16(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm3,%xmm4
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ movdqa %xmm1,%xmm7
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+.byte 102,15,58,15,226,4
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+.byte 102,15,58,15,248,4
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ paddd %xmm7,%xmm2
+ rorl $2,%r14d
+ addl %r11d,%edx
+ psrld $7,%xmm6
+ addl %edi,%r11d
+ movl %edx,%r13d
+ pshufd $250,%xmm1,%xmm7
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %r11d,%r14d
+ pxor %xmm5,%xmm4
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ pslld $11,%xmm5
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ pxor %xmm6,%xmm4
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ paddd %xmm4,%xmm2
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ psrlq $17,%xmm6
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ psrldq $8,%xmm7
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ paddd %xmm7,%xmm2
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ pshufd $80,%xmm2,%xmm7
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ movdqa %xmm7,%xmm6
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ psrld $10,%xmm7
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ psrlq $2,%xmm6
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ pxor %xmm6,%xmm7
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ movdqa 64(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ paddd %xmm7,%xmm2
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ paddd %xmm2,%xmm6
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ movdqa %xmm6,32(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm0,%xmm4
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ movdqa %xmm2,%xmm7
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+.byte 102,15,58,15,227,4
+ andl %eax,%r12d
+ xorl %eax,%r13d
+.byte 102,15,58,15,249,4
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ paddd %xmm7,%xmm3
+ rorl $2,%r14d
+ addl %edx,%r11d
+ psrld $7,%xmm6
+ addl %edi,%edx
+ movl %r11d,%r13d
+ pshufd $250,%xmm2,%xmm7
+ addl %edx,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%edx
+ movl %eax,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %edx,%r14d
+ pxor %xmm5,%xmm4
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ pslld $11,%xmm5
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ pxor %xmm6,%xmm4
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ paddd %xmm4,%xmm3
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ psrlq $17,%xmm6
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ psrldq $8,%xmm7
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ paddd %xmm7,%xmm3
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ pshufd $80,%xmm3,%xmm7
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ movdqa %xmm7,%xmm6
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ psrld $10,%xmm7
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ psrlq $2,%xmm6
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ pxor %xmm6,%xmm7
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ movdqa 96(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ paddd %xmm7,%xmm3
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ paddd %xmm3,%xmm6
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movdqa %xmm6,48(%rsp)
+ cmpb $0,131(%rbp)
+ jne L$ssse3_00_47
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ rorl $2,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ rorl $2,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ rorl $2,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ rorl $2,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%rdi
+ movl %r14d,%eax
+
+ addl 0(%rdi),%eax
+ leaq 64(%rsi),%rsi
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb L$loop_ssse3
+
+ movq 88(%rsp),%rsi
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
+L$epilogue_ssse3:
+ ret
+
+
+.globl _sha256_block_data_order_avx
+.private_extern _sha256_block_data_order_avx
+
+.p2align 6
+_sha256_block_data_order_avx:
+
+_CET_ENDBR
+ movq %rsp,%rax
+
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ shlq $4,%rdx
+ subq $96,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %rax,88(%rsp)
+
+L$prologue_avx:
+
+ vzeroupper
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+ vmovdqa K256+512+32(%rip),%xmm8
+ vmovdqa K256+512+64(%rip),%xmm9
+ jmp L$loop_avx
+.p2align 4
+L$loop_avx:
+ vmovdqa K256+512(%rip),%xmm7
+ vmovdqu 0(%rsi),%xmm0
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm7,%xmm0,%xmm0
+ leaq K256(%rip),%rbp
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd 0(%rbp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 32(%rbp),%xmm1,%xmm5
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ vpaddd 96(%rbp),%xmm3,%xmm7
+ vmovdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ vmovdqa %xmm5,16(%rsp)
+ movl %ebx,%edi
+ vmovdqa %xmm6,32(%rsp)
+ xorl %ecx,%edi
+ vmovdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp L$avx_00_47
+
+.p2align 4
+L$avx_00_47:
+ subq $-128,%rbp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ vpshufd $250,%xmm3,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm0,%xmm0
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpaddd %xmm6,%xmm0,%xmm0
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ vpshufd $80,%xmm0,%xmm7
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ vpaddd %xmm6,%xmm0,%xmm0
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpaddd 0(%rbp),%xmm0,%xmm6
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,0(%rsp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ vpshufd $250,%xmm0,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm1,%xmm1
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpaddd %xmm6,%xmm1,%xmm1
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ vpshufd $80,%xmm1,%xmm7
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ vpaddd %xmm6,%xmm1,%xmm1
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpaddd 32(%rbp),%xmm1,%xmm6
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,16(%rsp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ vpshufd $250,%xmm1,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm2,%xmm2
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpaddd %xmm6,%xmm2,%xmm2
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ vpshufd $80,%xmm2,%xmm7
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ vpaddd %xmm6,%xmm2,%xmm2
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,32(%rsp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ vpshufd $250,%xmm2,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm3,%xmm3
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpaddd %xmm6,%xmm3,%xmm3
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ vpshufd $80,%xmm3,%xmm7
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ vpaddd %xmm6,%xmm3,%xmm3
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpaddd 96(%rbp),%xmm3,%xmm6
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,48(%rsp)
+ cmpb $0,131(%rbp)
+ jne L$avx_00_47
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%rdi
+ movl %r14d,%eax
+
+ addl 0(%rdi),%eax
+ leaq 64(%rsi),%rsi
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb L$loop_avx
+
+ movq 88(%rsp),%rsi
+
+ vzeroupper
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
+L$epilogue_avx:
+ ret
+
+
+#endif
diff --git a/gen/bcm/sha256-x86_64-linux.S b/gen/bcm/sha256-x86_64-linux.S
new file mode 100644
index 0000000..8476b03
--- /dev/null
+++ b/gen/bcm/sha256-x86_64-linux.S
@@ -0,0 +1,4170 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text
+
+.globl sha256_block_data_order_nohw
+.hidden sha256_block_data_order_nohw
+.type sha256_block_data_order_nohw,@function
+.align 16
+sha256_block_data_order_nohw:
+.cfi_startproc
+_CET_ENDBR
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $64+32,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %rax,88(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue:
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+ jmp .Lloop
+
+.align 16
+.Lloop:
+ movl %ebx,%edi
+ leaq K256(%rip),%rbp
+ xorl %ecx,%edi
+ movl 0(%rsi),%r12d
+ movl %r8d,%r13d
+ movl %eax,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+
+ movl %r12d,0(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r11d
+ movl 4(%rsi),%r12d
+ movl %edx,%r13d
+ movl %r11d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r8d,%edi
+
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+
+ movl %r12d,4(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r10d
+ movl 8(%rsi),%r12d
+ movl %ecx,%r13d
+ movl %r10d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %edx,%r15d
+
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+
+ movl %r12d,8(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r9d
+ movl 12(%rsi),%r12d
+ movl %ebx,%r13d
+ movl %r9d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ecx,%edi
+
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+
+ movl %r12d,12(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+
+ leaq 20(%rbp),%rbp
+ addl %r14d,%r8d
+ movl 16(%rsi),%r12d
+ movl %eax,%r13d
+ movl %r8d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+
+ movl %r12d,16(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%edx
+ movl 20(%rsi),%r12d
+ movl %r11d,%r13d
+ movl %edx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %eax,%edi
+
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+
+ movl %r12d,20(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ecx
+ movl 24(%rsi),%r12d
+ movl %r10d,%r13d
+ movl %ecx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+
+ movl %r12d,24(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ebx
+ movl 28(%rsi),%r12d
+ movl %r9d,%r13d
+ movl %ebx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r10d,%edi
+
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+
+ movl %r12d,28(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+
+ leaq 20(%rbp),%rbp
+ addl %r14d,%eax
+ movl 32(%rsi),%r12d
+ movl %r8d,%r13d
+ movl %eax,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+
+ movl %r12d,32(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r11d
+ movl 36(%rsi),%r12d
+ movl %edx,%r13d
+ movl %r11d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r8d,%edi
+
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+
+ movl %r12d,36(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r10d
+ movl 40(%rsi),%r12d
+ movl %ecx,%r13d
+ movl %r10d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %edx,%r15d
+
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+
+ movl %r12d,40(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r9d
+ movl 44(%rsi),%r12d
+ movl %ebx,%r13d
+ movl %r9d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ecx,%edi
+
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+
+ movl %r12d,44(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+
+ leaq 20(%rbp),%rbp
+ addl %r14d,%r8d
+ movl 48(%rsi),%r12d
+ movl %eax,%r13d
+ movl %r8d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+
+ movl %r12d,48(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%edx
+ movl 52(%rsi),%r12d
+ movl %r11d,%r13d
+ movl %edx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %eax,%edi
+
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+
+ movl %r12d,52(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ecx
+ movl 56(%rsi),%r12d
+ movl %r10d,%r13d
+ movl %ecx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+
+ movl %r12d,56(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ebx
+ movl 60(%rsi),%r12d
+ movl %r9d,%r13d
+ movl %ebx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r10d,%edi
+
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+
+ movl %r12d,60(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+
+ leaq 20(%rbp),%rbp
+ jmp .Lrounds_16_xx
+.align 16
+.Lrounds_16_xx:
+ movl 4(%rsp),%r13d
+ movl 56(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%eax
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 36(%rsp),%r12d
+
+ addl 0(%rsp),%r12d
+ movl %r8d,%r13d
+ addl %r15d,%r12d
+ movl %eax,%r14d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+
+ movl %r12d,0(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+
+ leaq 4(%rbp),%rbp
+ movl 8(%rsp),%r13d
+ movl 60(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r11d
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 40(%rsp),%r12d
+
+ addl 4(%rsp),%r12d
+ movl %edx,%r13d
+ addl %edi,%r12d
+ movl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r8d,%edi
+
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+
+ movl %r12d,4(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+
+ leaq 4(%rbp),%rbp
+ movl 12(%rsp),%r13d
+ movl 0(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r10d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 44(%rsp),%r12d
+
+ addl 8(%rsp),%r12d
+ movl %ecx,%r13d
+ addl %r15d,%r12d
+ movl %r10d,%r14d
+ rorl $14,%r13d
+ movl %edx,%r15d
+
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+
+ movl %r12d,8(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+
+ leaq 4(%rbp),%rbp
+ movl 16(%rsp),%r13d
+ movl 4(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r9d
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 48(%rsp),%r12d
+
+ addl 12(%rsp),%r12d
+ movl %ebx,%r13d
+ addl %edi,%r12d
+ movl %r9d,%r14d
+ rorl $14,%r13d
+ movl %ecx,%edi
+
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+
+ movl %r12d,12(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+
+ leaq 20(%rbp),%rbp
+ movl 20(%rsp),%r13d
+ movl 8(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r8d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 52(%rsp),%r12d
+
+ addl 16(%rsp),%r12d
+ movl %eax,%r13d
+ addl %r15d,%r12d
+ movl %r8d,%r14d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+
+ movl %r12d,16(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+
+ leaq 4(%rbp),%rbp
+ movl 24(%rsp),%r13d
+ movl 12(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%edx
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 56(%rsp),%r12d
+
+ addl 20(%rsp),%r12d
+ movl %r11d,%r13d
+ addl %edi,%r12d
+ movl %edx,%r14d
+ rorl $14,%r13d
+ movl %eax,%edi
+
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+
+ movl %r12d,20(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+
+ leaq 4(%rbp),%rbp
+ movl 28(%rsp),%r13d
+ movl 16(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ecx
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 60(%rsp),%r12d
+
+ addl 24(%rsp),%r12d
+ movl %r10d,%r13d
+ addl %r15d,%r12d
+ movl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+
+ movl %r12d,24(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+
+ leaq 4(%rbp),%rbp
+ movl 32(%rsp),%r13d
+ movl 20(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ebx
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 0(%rsp),%r12d
+
+ addl 28(%rsp),%r12d
+ movl %r9d,%r13d
+ addl %edi,%r12d
+ movl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r10d,%edi
+
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+
+ movl %r12d,28(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+
+ leaq 20(%rbp),%rbp
+ movl 36(%rsp),%r13d
+ movl 24(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%eax
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 4(%rsp),%r12d
+
+ addl 32(%rsp),%r12d
+ movl %r8d,%r13d
+ addl %r15d,%r12d
+ movl %eax,%r14d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+
+ movl %r12d,32(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+
+ leaq 4(%rbp),%rbp
+ movl 40(%rsp),%r13d
+ movl 28(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r11d
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 8(%rsp),%r12d
+
+ addl 36(%rsp),%r12d
+ movl %edx,%r13d
+ addl %edi,%r12d
+ movl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r8d,%edi
+
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+
+ movl %r12d,36(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+
+ leaq 4(%rbp),%rbp
+ movl 44(%rsp),%r13d
+ movl 32(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r10d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 12(%rsp),%r12d
+
+ addl 40(%rsp),%r12d
+ movl %ecx,%r13d
+ addl %r15d,%r12d
+ movl %r10d,%r14d
+ rorl $14,%r13d
+ movl %edx,%r15d
+
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+
+ movl %r12d,40(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+
+ leaq 4(%rbp),%rbp
+ movl 48(%rsp),%r13d
+ movl 36(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r9d
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 16(%rsp),%r12d
+
+ addl 44(%rsp),%r12d
+ movl %ebx,%r13d
+ addl %edi,%r12d
+ movl %r9d,%r14d
+ rorl $14,%r13d
+ movl %ecx,%edi
+
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+
+ movl %r12d,44(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+
+ leaq 20(%rbp),%rbp
+ movl 52(%rsp),%r13d
+ movl 40(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r8d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 20(%rsp),%r12d
+
+ addl 48(%rsp),%r12d
+ movl %eax,%r13d
+ addl %r15d,%r12d
+ movl %r8d,%r14d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+
+ movl %r12d,48(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+
+ leaq 4(%rbp),%rbp
+ movl 56(%rsp),%r13d
+ movl 44(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%edx
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 24(%rsp),%r12d
+
+ addl 52(%rsp),%r12d
+ movl %r11d,%r13d
+ addl %edi,%r12d
+ movl %edx,%r14d
+ rorl $14,%r13d
+ movl %eax,%edi
+
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+
+ movl %r12d,52(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+
+ leaq 4(%rbp),%rbp
+ movl 60(%rsp),%r13d
+ movl 48(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ecx
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 28(%rsp),%r12d
+
+ addl 56(%rsp),%r12d
+ movl %r10d,%r13d
+ addl %r15d,%r12d
+ movl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+
+ movl %r12d,56(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+
+ leaq 4(%rbp),%rbp
+ movl 0(%rsp),%r13d
+ movl 52(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ebx
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 32(%rsp),%r12d
+
+ addl 60(%rsp),%r12d
+ movl %r9d,%r13d
+ addl %edi,%r12d
+ movl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r10d,%edi
+
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+
+ movl %r12d,60(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+
+ leaq 20(%rbp),%rbp
+ cmpb $0,3(%rbp)
+ jnz .Lrounds_16_xx
+
+ movq 64+0(%rsp),%rdi
+ addl %r14d,%eax
+ leaq 64(%rsi),%rsi
+
+ addl 0(%rdi),%eax
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb .Lloop
+
+ movq 88(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue:
+ ret
+.cfi_endproc
+.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw
+.section .rodata
+.align 64
+.type K256,@object
+K256:
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.text
+.globl sha256_block_data_order_hw
+.hidden sha256_block_data_order_hw
+.type sha256_block_data_order_hw,@function
+.align 64
+sha256_block_data_order_hw:
+.cfi_startproc
+_CET_ENDBR
+ leaq K256+128(%rip),%rcx
+ movdqu (%rdi),%xmm1
+ movdqu 16(%rdi),%xmm2
+ movdqa 512-128(%rcx),%xmm7
+
+ pshufd $0x1b,%xmm1,%xmm0
+ pshufd $0xb1,%xmm1,%xmm1
+ pshufd $0x1b,%xmm2,%xmm2
+ movdqa %xmm7,%xmm8
+.byte 102,15,58,15,202,8
+ punpcklqdq %xmm0,%xmm2
+ jmp .Loop_shaext
+
+.align 16
+.Loop_shaext:
+ movdqu (%rsi),%xmm3
+ movdqu 16(%rsi),%xmm4
+ movdqu 32(%rsi),%xmm5
+.byte 102,15,56,0,223
+ movdqu 48(%rsi),%xmm6
+
+ movdqa 0-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 102,15,56,0,231
+ movdqa %xmm2,%xmm10
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ nop
+ movdqa %xmm1,%xmm9
+.byte 15,56,203,202
+
+ movdqa 32-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 102,15,56,0,239
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ leaq 64(%rsi),%rsi
+.byte 15,56,204,220
+.byte 15,56,203,202
+
+ movdqa 64-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 102,15,56,0,247
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+
+ movdqa 96-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 128-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 160-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 192-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 224-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 256-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 288-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 320-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 352-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 384-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 416-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+.byte 15,56,203,202
+ paddd %xmm7,%xmm6
+
+ movdqa 448-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+.byte 15,56,205,245
+ movdqa %xmm8,%xmm7
+.byte 15,56,203,202
+
+ movdqa 480-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+ nop
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ decq %rdx
+ nop
+.byte 15,56,203,202
+
+ paddd %xmm10,%xmm2
+ paddd %xmm9,%xmm1
+ jnz .Loop_shaext
+
+ pshufd $0xb1,%xmm2,%xmm2
+ pshufd $0x1b,%xmm1,%xmm7
+ pshufd $0xb1,%xmm1,%xmm1
+ punpckhqdq %xmm2,%xmm1
+.byte 102,15,58,15,215,8
+
+ movdqu %xmm1,(%rdi)
+ movdqu %xmm2,16(%rdi)
+ ret
+.cfi_endproc
+.size sha256_block_data_order_hw,.-sha256_block_data_order_hw
+.globl sha256_block_data_order_ssse3
+.hidden sha256_block_data_order_ssse3
+.type sha256_block_data_order_ssse3,@function
+.align 64
+sha256_block_data_order_ssse3:
+.cfi_startproc
+_CET_ENDBR
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $96,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %rax,88(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue_ssse3:
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+
+
+ jmp .Lloop_ssse3
+.align 16
+.Lloop_ssse3:
+ movdqa K256+512(%rip),%xmm7
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+.byte 102,15,56,0,199
+ movdqu 48(%rsi),%xmm3
+ leaq K256(%rip),%rbp
+.byte 102,15,56,0,207
+ movdqa 0(%rbp),%xmm4
+ movdqa 32(%rbp),%xmm5
+.byte 102,15,56,0,215
+ paddd %xmm0,%xmm4
+ movdqa 64(%rbp),%xmm6
+.byte 102,15,56,0,223
+ movdqa 96(%rbp),%xmm7
+ paddd %xmm1,%xmm5
+ paddd %xmm2,%xmm6
+ paddd %xmm3,%xmm7
+ movdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ movdqa %xmm5,16(%rsp)
+ movl %ebx,%edi
+ movdqa %xmm6,32(%rsp)
+ xorl %ecx,%edi
+ movdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp .Lssse3_00_47
+
+.align 16
+.Lssse3_00_47:
+ subq $-128,%rbp
+ rorl $14,%r13d
+ movdqa %xmm1,%xmm4
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ movdqa %xmm3,%xmm7
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+.byte 102,15,58,15,224,4
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+.byte 102,15,58,15,250,4
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ paddd %xmm7,%xmm0
+ rorl $2,%r14d
+ addl %r11d,%edx
+ psrld $7,%xmm6
+ addl %edi,%r11d
+ movl %edx,%r13d
+ pshufd $250,%xmm3,%xmm7
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %r11d,%r14d
+ pxor %xmm5,%xmm4
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ pslld $11,%xmm5
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ pxor %xmm6,%xmm4
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ paddd %xmm4,%xmm0
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ psrlq $17,%xmm6
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ psrldq $8,%xmm7
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ paddd %xmm7,%xmm0
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ pshufd $80,%xmm0,%xmm7
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ movdqa %xmm7,%xmm6
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ psrld $10,%xmm7
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ psrlq $2,%xmm6
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ pxor %xmm6,%xmm7
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ movdqa 0(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ paddd %xmm7,%xmm0
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ paddd %xmm0,%xmm6
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ movdqa %xmm6,0(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm2,%xmm4
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ movdqa %xmm0,%xmm7
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+.byte 102,15,58,15,225,4
+ andl %eax,%r12d
+ xorl %eax,%r13d
+.byte 102,15,58,15,251,4
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ paddd %xmm7,%xmm1
+ rorl $2,%r14d
+ addl %edx,%r11d
+ psrld $7,%xmm6
+ addl %edi,%edx
+ movl %r11d,%r13d
+ pshufd $250,%xmm0,%xmm7
+ addl %edx,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%edx
+ movl %eax,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %edx,%r14d
+ pxor %xmm5,%xmm4
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ pslld $11,%xmm5
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ pxor %xmm6,%xmm4
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ paddd %xmm4,%xmm1
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ psrlq $17,%xmm6
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ psrldq $8,%xmm7
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ paddd %xmm7,%xmm1
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ pshufd $80,%xmm1,%xmm7
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ movdqa %xmm7,%xmm6
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ psrld $10,%xmm7
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ psrlq $2,%xmm6
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ pxor %xmm6,%xmm7
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ movdqa 32(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ paddd %xmm7,%xmm1
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ paddd %xmm1,%xmm6
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movdqa %xmm6,16(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm3,%xmm4
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ movdqa %xmm1,%xmm7
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+.byte 102,15,58,15,226,4
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+.byte 102,15,58,15,248,4
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ paddd %xmm7,%xmm2
+ rorl $2,%r14d
+ addl %r11d,%edx
+ psrld $7,%xmm6
+ addl %edi,%r11d
+ movl %edx,%r13d
+ pshufd $250,%xmm1,%xmm7
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %r11d,%r14d
+ pxor %xmm5,%xmm4
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ pslld $11,%xmm5
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ pxor %xmm6,%xmm4
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ paddd %xmm4,%xmm2
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ psrlq $17,%xmm6
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ psrldq $8,%xmm7
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ paddd %xmm7,%xmm2
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ pshufd $80,%xmm2,%xmm7
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ movdqa %xmm7,%xmm6
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ psrld $10,%xmm7
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ psrlq $2,%xmm6
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ pxor %xmm6,%xmm7
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ movdqa 64(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ paddd %xmm7,%xmm2
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ paddd %xmm2,%xmm6
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ movdqa %xmm6,32(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm0,%xmm4
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ movdqa %xmm2,%xmm7
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+.byte 102,15,58,15,227,4
+ andl %eax,%r12d
+ xorl %eax,%r13d
+.byte 102,15,58,15,249,4
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ paddd %xmm7,%xmm3
+ rorl $2,%r14d
+ addl %edx,%r11d
+ psrld $7,%xmm6
+ addl %edi,%edx
+ movl %r11d,%r13d
+ pshufd $250,%xmm2,%xmm7
+ addl %edx,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%edx
+ movl %eax,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %edx,%r14d
+ pxor %xmm5,%xmm4
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ pslld $11,%xmm5
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ pxor %xmm6,%xmm4
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ paddd %xmm4,%xmm3
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ psrlq $17,%xmm6
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ psrldq $8,%xmm7
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ paddd %xmm7,%xmm3
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ pshufd $80,%xmm3,%xmm7
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ movdqa %xmm7,%xmm6
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ psrld $10,%xmm7
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ psrlq $2,%xmm6
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ pxor %xmm6,%xmm7
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ movdqa 96(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ paddd %xmm7,%xmm3
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ paddd %xmm3,%xmm6
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movdqa %xmm6,48(%rsp)
+ cmpb $0,131(%rbp)
+ jne .Lssse3_00_47
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ rorl $2,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ rorl $2,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ rorl $2,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ rorl $2,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%rdi
+ movl %r14d,%eax
+
+ addl 0(%rdi),%eax
+ leaq 64(%rsi),%rsi
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb .Lloop_ssse3
+
+ movq 88(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_ssse3:
+ ret
+.cfi_endproc
+.size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3
+.globl sha256_block_data_order_avx
+.hidden sha256_block_data_order_avx
+.type sha256_block_data_order_avx,@function
+.align 64
+sha256_block_data_order_avx:
+.cfi_startproc
+_CET_ENDBR
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $96,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %rax,88(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue_avx:
+
+ vzeroupper
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+ vmovdqa K256+512+32(%rip),%xmm8
+ vmovdqa K256+512+64(%rip),%xmm9
+ jmp .Lloop_avx
+.align 16
+.Lloop_avx:
+ vmovdqa K256+512(%rip),%xmm7
+ vmovdqu 0(%rsi),%xmm0
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm7,%xmm0,%xmm0
+ leaq K256(%rip),%rbp
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd 0(%rbp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 32(%rbp),%xmm1,%xmm5
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ vpaddd 96(%rbp),%xmm3,%xmm7
+ vmovdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ vmovdqa %xmm5,16(%rsp)
+ movl %ebx,%edi
+ vmovdqa %xmm6,32(%rsp)
+ xorl %ecx,%edi
+ vmovdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp .Lavx_00_47
+
+.align 16
+.Lavx_00_47:
+ subq $-128,%rbp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ vpshufd $250,%xmm3,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm0,%xmm0
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpaddd %xmm6,%xmm0,%xmm0
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ vpshufd $80,%xmm0,%xmm7
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ vpaddd %xmm6,%xmm0,%xmm0
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpaddd 0(%rbp),%xmm0,%xmm6
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,0(%rsp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ vpshufd $250,%xmm0,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm1,%xmm1
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpaddd %xmm6,%xmm1,%xmm1
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ vpshufd $80,%xmm1,%xmm7
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ vpaddd %xmm6,%xmm1,%xmm1
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpaddd 32(%rbp),%xmm1,%xmm6
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,16(%rsp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ vpshufd $250,%xmm1,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm2,%xmm2
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpaddd %xmm6,%xmm2,%xmm2
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ vpshufd $80,%xmm2,%xmm7
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ vpaddd %xmm6,%xmm2,%xmm2
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,32(%rsp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ vpshufd $250,%xmm2,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm3,%xmm3
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpaddd %xmm6,%xmm3,%xmm3
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ vpshufd $80,%xmm3,%xmm7
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ vpaddd %xmm6,%xmm3,%xmm3
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpaddd 96(%rbp),%xmm3,%xmm6
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,48(%rsp)
+ cmpb $0,131(%rbp)
+ jne .Lavx_00_47
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%rdi
+ movl %r14d,%eax
+
+ addl 0(%rdi),%eax
+ leaq 64(%rsi),%rsi
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb .Lloop_avx
+
+ movq 88(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ vzeroupper
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ ret
+.cfi_endproc
+.size sha256_block_data_order_avx,.-sha256_block_data_order_avx
+#endif
diff --git a/gen/bcm/sha256-x86_64-win.asm b/gen/bcm/sha256-x86_64-win.asm
new file mode 100644
index 0000000..ada8dba
--- /dev/null
+++ b/gen/bcm/sha256-x86_64-win.asm
@@ -0,0 +1,4415 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .text code align=64
+
+
+global sha256_block_data_order_nohw
+
+ALIGN 16
+sha256_block_data_order_nohw:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha256_block_data_order_nohw:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ mov rax,rsp
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ shl rdx,4
+ sub rsp,16*4+4*8
+ lea rdx,[rdx*4+rsi]
+ and rsp,-64
+ mov QWORD[((64+0))+rsp],rdi
+ mov QWORD[((64+8))+rsp],rsi
+ mov QWORD[((64+16))+rsp],rdx
+ mov QWORD[88+rsp],rax
+
+$L$prologue:
+
+ mov eax,DWORD[rdi]
+ mov ebx,DWORD[4+rdi]
+ mov ecx,DWORD[8+rdi]
+ mov edx,DWORD[12+rdi]
+ mov r8d,DWORD[16+rdi]
+ mov r9d,DWORD[20+rdi]
+ mov r10d,DWORD[24+rdi]
+ mov r11d,DWORD[28+rdi]
+ jmp NEAR $L$loop
+
+ALIGN 16
+$L$loop:
+ mov edi,ebx
+ lea rbp,[K256]
+ xor edi,ecx
+ mov r12d,DWORD[rsi]
+ mov r13d,r8d
+ mov r14d,eax
+ bswap r12d
+ ror r13d,14
+ mov r15d,r9d
+
+ xor r13d,r8d
+ ror r14d,9
+ xor r15d,r10d
+
+ mov DWORD[rsp],r12d
+ xor r14d,eax
+ and r15d,r8d
+
+ ror r13d,5
+ add r12d,r11d
+ xor r15d,r10d
+
+ ror r14d,11
+ xor r13d,r8d
+ add r12d,r15d
+
+ mov r15d,eax
+ add r12d,DWORD[rbp]
+ xor r14d,eax
+
+ xor r15d,ebx
+ ror r13d,6
+ mov r11d,ebx
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor r11d,edi
+ add edx,r12d
+ add r11d,r12d
+
+ lea rbp,[4+rbp]
+ add r11d,r14d
+ mov r12d,DWORD[4+rsi]
+ mov r13d,edx
+ mov r14d,r11d
+ bswap r12d
+ ror r13d,14
+ mov edi,r8d
+
+ xor r13d,edx
+ ror r14d,9
+ xor edi,r9d
+
+ mov DWORD[4+rsp],r12d
+ xor r14d,r11d
+ and edi,edx
+
+ ror r13d,5
+ add r12d,r10d
+ xor edi,r9d
+
+ ror r14d,11
+ xor r13d,edx
+ add r12d,edi
+
+ mov edi,r11d
+ add r12d,DWORD[rbp]
+ xor r14d,r11d
+
+ xor edi,eax
+ ror r13d,6
+ mov r10d,eax
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor r10d,r15d
+ add ecx,r12d
+ add r10d,r12d
+
+ lea rbp,[4+rbp]
+ add r10d,r14d
+ mov r12d,DWORD[8+rsi]
+ mov r13d,ecx
+ mov r14d,r10d
+ bswap r12d
+ ror r13d,14
+ mov r15d,edx
+
+ xor r13d,ecx
+ ror r14d,9
+ xor r15d,r8d
+
+ mov DWORD[8+rsp],r12d
+ xor r14d,r10d
+ and r15d,ecx
+
+ ror r13d,5
+ add r12d,r9d
+ xor r15d,r8d
+
+ ror r14d,11
+ xor r13d,ecx
+ add r12d,r15d
+
+ mov r15d,r10d
+ add r12d,DWORD[rbp]
+ xor r14d,r10d
+
+ xor r15d,r11d
+ ror r13d,6
+ mov r9d,r11d
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor r9d,edi
+ add ebx,r12d
+ add r9d,r12d
+
+ lea rbp,[4+rbp]
+ add r9d,r14d
+ mov r12d,DWORD[12+rsi]
+ mov r13d,ebx
+ mov r14d,r9d
+ bswap r12d
+ ror r13d,14
+ mov edi,ecx
+
+ xor r13d,ebx
+ ror r14d,9
+ xor edi,edx
+
+ mov DWORD[12+rsp],r12d
+ xor r14d,r9d
+ and edi,ebx
+
+ ror r13d,5
+ add r12d,r8d
+ xor edi,edx
+
+ ror r14d,11
+ xor r13d,ebx
+ add r12d,edi
+
+ mov edi,r9d
+ add r12d,DWORD[rbp]
+ xor r14d,r9d
+
+ xor edi,r10d
+ ror r13d,6
+ mov r8d,r10d
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor r8d,r15d
+ add eax,r12d
+ add r8d,r12d
+
+ lea rbp,[20+rbp]
+ add r8d,r14d
+ mov r12d,DWORD[16+rsi]
+ mov r13d,eax
+ mov r14d,r8d
+ bswap r12d
+ ror r13d,14
+ mov r15d,ebx
+
+ xor r13d,eax
+ ror r14d,9
+ xor r15d,ecx
+
+ mov DWORD[16+rsp],r12d
+ xor r14d,r8d
+ and r15d,eax
+
+ ror r13d,5
+ add r12d,edx
+ xor r15d,ecx
+
+ ror r14d,11
+ xor r13d,eax
+ add r12d,r15d
+
+ mov r15d,r8d
+ add r12d,DWORD[rbp]
+ xor r14d,r8d
+
+ xor r15d,r9d
+ ror r13d,6
+ mov edx,r9d
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor edx,edi
+ add r11d,r12d
+ add edx,r12d
+
+ lea rbp,[4+rbp]
+ add edx,r14d
+ mov r12d,DWORD[20+rsi]
+ mov r13d,r11d
+ mov r14d,edx
+ bswap r12d
+ ror r13d,14
+ mov edi,eax
+
+ xor r13d,r11d
+ ror r14d,9
+ xor edi,ebx
+
+ mov DWORD[20+rsp],r12d
+ xor r14d,edx
+ and edi,r11d
+
+ ror r13d,5
+ add r12d,ecx
+ xor edi,ebx
+
+ ror r14d,11
+ xor r13d,r11d
+ add r12d,edi
+
+ mov edi,edx
+ add r12d,DWORD[rbp]
+ xor r14d,edx
+
+ xor edi,r8d
+ ror r13d,6
+ mov ecx,r8d
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor ecx,r15d
+ add r10d,r12d
+ add ecx,r12d
+
+ lea rbp,[4+rbp]
+ add ecx,r14d
+ mov r12d,DWORD[24+rsi]
+ mov r13d,r10d
+ mov r14d,ecx
+ bswap r12d
+ ror r13d,14
+ mov r15d,r11d
+
+ xor r13d,r10d
+ ror r14d,9
+ xor r15d,eax
+
+ mov DWORD[24+rsp],r12d
+ xor r14d,ecx
+ and r15d,r10d
+
+ ror r13d,5
+ add r12d,ebx
+ xor r15d,eax
+
+ ror r14d,11
+ xor r13d,r10d
+ add r12d,r15d
+
+ mov r15d,ecx
+ add r12d,DWORD[rbp]
+ xor r14d,ecx
+
+ xor r15d,edx
+ ror r13d,6
+ mov ebx,edx
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor ebx,edi
+ add r9d,r12d
+ add ebx,r12d
+
+ lea rbp,[4+rbp]
+ add ebx,r14d
+ mov r12d,DWORD[28+rsi]
+ mov r13d,r9d
+ mov r14d,ebx
+ bswap r12d
+ ror r13d,14
+ mov edi,r10d
+
+ xor r13d,r9d
+ ror r14d,9
+ xor edi,r11d
+
+ mov DWORD[28+rsp],r12d
+ xor r14d,ebx
+ and edi,r9d
+
+ ror r13d,5
+ add r12d,eax
+ xor edi,r11d
+
+ ror r14d,11
+ xor r13d,r9d
+ add r12d,edi
+
+ mov edi,ebx
+ add r12d,DWORD[rbp]
+ xor r14d,ebx
+
+ xor edi,ecx
+ ror r13d,6
+ mov eax,ecx
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor eax,r15d
+ add r8d,r12d
+ add eax,r12d
+
+ lea rbp,[20+rbp]
+ add eax,r14d
+ mov r12d,DWORD[32+rsi]
+ mov r13d,r8d
+ mov r14d,eax
+ bswap r12d
+ ror r13d,14
+ mov r15d,r9d
+
+ xor r13d,r8d
+ ror r14d,9
+ xor r15d,r10d
+
+ mov DWORD[32+rsp],r12d
+ xor r14d,eax
+ and r15d,r8d
+
+ ror r13d,5
+ add r12d,r11d
+ xor r15d,r10d
+
+ ror r14d,11
+ xor r13d,r8d
+ add r12d,r15d
+
+ mov r15d,eax
+ add r12d,DWORD[rbp]
+ xor r14d,eax
+
+ xor r15d,ebx
+ ror r13d,6
+ mov r11d,ebx
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor r11d,edi
+ add edx,r12d
+ add r11d,r12d
+
+ lea rbp,[4+rbp]
+ add r11d,r14d
+ mov r12d,DWORD[36+rsi]
+ mov r13d,edx
+ mov r14d,r11d
+ bswap r12d
+ ror r13d,14
+ mov edi,r8d
+
+ xor r13d,edx
+ ror r14d,9
+ xor edi,r9d
+
+ mov DWORD[36+rsp],r12d
+ xor r14d,r11d
+ and edi,edx
+
+ ror r13d,5
+ add r12d,r10d
+ xor edi,r9d
+
+ ror r14d,11
+ xor r13d,edx
+ add r12d,edi
+
+ mov edi,r11d
+ add r12d,DWORD[rbp]
+ xor r14d,r11d
+
+ xor edi,eax
+ ror r13d,6
+ mov r10d,eax
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor r10d,r15d
+ add ecx,r12d
+ add r10d,r12d
+
+ lea rbp,[4+rbp]
+ add r10d,r14d
+ mov r12d,DWORD[40+rsi]
+ mov r13d,ecx
+ mov r14d,r10d
+ bswap r12d
+ ror r13d,14
+ mov r15d,edx
+
+ xor r13d,ecx
+ ror r14d,9
+ xor r15d,r8d
+
+ mov DWORD[40+rsp],r12d
+ xor r14d,r10d
+ and r15d,ecx
+
+ ror r13d,5
+ add r12d,r9d
+ xor r15d,r8d
+
+ ror r14d,11
+ xor r13d,ecx
+ add r12d,r15d
+
+ mov r15d,r10d
+ add r12d,DWORD[rbp]
+ xor r14d,r10d
+
+ xor r15d,r11d
+ ror r13d,6
+ mov r9d,r11d
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor r9d,edi
+ add ebx,r12d
+ add r9d,r12d
+
+ lea rbp,[4+rbp]
+ add r9d,r14d
+ mov r12d,DWORD[44+rsi]
+ mov r13d,ebx
+ mov r14d,r9d
+ bswap r12d
+ ror r13d,14
+ mov edi,ecx
+
+ xor r13d,ebx
+ ror r14d,9
+ xor edi,edx
+
+ mov DWORD[44+rsp],r12d
+ xor r14d,r9d
+ and edi,ebx
+
+ ror r13d,5
+ add r12d,r8d
+ xor edi,edx
+
+ ror r14d,11
+ xor r13d,ebx
+ add r12d,edi
+
+ mov edi,r9d
+ add r12d,DWORD[rbp]
+ xor r14d,r9d
+
+ xor edi,r10d
+ ror r13d,6
+ mov r8d,r10d
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor r8d,r15d
+ add eax,r12d
+ add r8d,r12d
+
+ lea rbp,[20+rbp]
+ add r8d,r14d
+ mov r12d,DWORD[48+rsi]
+ mov r13d,eax
+ mov r14d,r8d
+ bswap r12d
+ ror r13d,14
+ mov r15d,ebx
+
+ xor r13d,eax
+ ror r14d,9
+ xor r15d,ecx
+
+ mov DWORD[48+rsp],r12d
+ xor r14d,r8d
+ and r15d,eax
+
+ ror r13d,5
+ add r12d,edx
+ xor r15d,ecx
+
+ ror r14d,11
+ xor r13d,eax
+ add r12d,r15d
+
+ mov r15d,r8d
+ add r12d,DWORD[rbp]
+ xor r14d,r8d
+
+ xor r15d,r9d
+ ror r13d,6
+ mov edx,r9d
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor edx,edi
+ add r11d,r12d
+ add edx,r12d
+
+ lea rbp,[4+rbp]
+ add edx,r14d
+ mov r12d,DWORD[52+rsi]
+ mov r13d,r11d
+ mov r14d,edx
+ bswap r12d
+ ror r13d,14
+ mov edi,eax
+
+ xor r13d,r11d
+ ror r14d,9
+ xor edi,ebx
+
+ mov DWORD[52+rsp],r12d
+ xor r14d,edx
+ and edi,r11d
+
+ ror r13d,5
+ add r12d,ecx
+ xor edi,ebx
+
+ ror r14d,11
+ xor r13d,r11d
+ add r12d,edi
+
+ mov edi,edx
+ add r12d,DWORD[rbp]
+ xor r14d,edx
+
+ xor edi,r8d
+ ror r13d,6
+ mov ecx,r8d
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor ecx,r15d
+ add r10d,r12d
+ add ecx,r12d
+
+ lea rbp,[4+rbp]
+ add ecx,r14d
+ mov r12d,DWORD[56+rsi]
+ mov r13d,r10d
+ mov r14d,ecx
+ bswap r12d
+ ror r13d,14
+ mov r15d,r11d
+
+ xor r13d,r10d
+ ror r14d,9
+ xor r15d,eax
+
+ mov DWORD[56+rsp],r12d
+ xor r14d,ecx
+ and r15d,r10d
+
+ ror r13d,5
+ add r12d,ebx
+ xor r15d,eax
+
+ ror r14d,11
+ xor r13d,r10d
+ add r12d,r15d
+
+ mov r15d,ecx
+ add r12d,DWORD[rbp]
+ xor r14d,ecx
+
+ xor r15d,edx
+ ror r13d,6
+ mov ebx,edx
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor ebx,edi
+ add r9d,r12d
+ add ebx,r12d
+
+ lea rbp,[4+rbp]
+ add ebx,r14d
+ mov r12d,DWORD[60+rsi]
+ mov r13d,r9d
+ mov r14d,ebx
+ bswap r12d
+ ror r13d,14
+ mov edi,r10d
+
+ xor r13d,r9d
+ ror r14d,9
+ xor edi,r11d
+
+ mov DWORD[60+rsp],r12d
+ xor r14d,ebx
+ and edi,r9d
+
+ ror r13d,5
+ add r12d,eax
+ xor edi,r11d
+
+ ror r14d,11
+ xor r13d,r9d
+ add r12d,edi
+
+ mov edi,ebx
+ add r12d,DWORD[rbp]
+ xor r14d,ebx
+
+ xor edi,ecx
+ ror r13d,6
+ mov eax,ecx
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor eax,r15d
+ add r8d,r12d
+ add eax,r12d
+
+ lea rbp,[20+rbp]
+ jmp NEAR $L$rounds_16_xx
+ALIGN 16
+$L$rounds_16_xx:
+ mov r13d,DWORD[4+rsp]
+ mov r15d,DWORD[56+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add eax,r14d
+ mov r14d,r15d
+ ror r15d,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor r15d,r14d
+ shr r14d,10
+
+ ror r15d,17
+ xor r12d,r13d
+ xor r15d,r14d
+ add r12d,DWORD[36+rsp]
+
+ add r12d,DWORD[rsp]
+ mov r13d,r8d
+ add r12d,r15d
+ mov r14d,eax
+ ror r13d,14
+ mov r15d,r9d
+
+ xor r13d,r8d
+ ror r14d,9
+ xor r15d,r10d
+
+ mov DWORD[rsp],r12d
+ xor r14d,eax
+ and r15d,r8d
+
+ ror r13d,5
+ add r12d,r11d
+ xor r15d,r10d
+
+ ror r14d,11
+ xor r13d,r8d
+ add r12d,r15d
+
+ mov r15d,eax
+ add r12d,DWORD[rbp]
+ xor r14d,eax
+
+ xor r15d,ebx
+ ror r13d,6
+ mov r11d,ebx
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor r11d,edi
+ add edx,r12d
+ add r11d,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[8+rsp]
+ mov edi,DWORD[60+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add r11d,r14d
+ mov r14d,edi
+ ror edi,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor edi,r14d
+ shr r14d,10
+
+ ror edi,17
+ xor r12d,r13d
+ xor edi,r14d
+ add r12d,DWORD[40+rsp]
+
+ add r12d,DWORD[4+rsp]
+ mov r13d,edx
+ add r12d,edi
+ mov r14d,r11d
+ ror r13d,14
+ mov edi,r8d
+
+ xor r13d,edx
+ ror r14d,9
+ xor edi,r9d
+
+ mov DWORD[4+rsp],r12d
+ xor r14d,r11d
+ and edi,edx
+
+ ror r13d,5
+ add r12d,r10d
+ xor edi,r9d
+
+ ror r14d,11
+ xor r13d,edx
+ add r12d,edi
+
+ mov edi,r11d
+ add r12d,DWORD[rbp]
+ xor r14d,r11d
+
+ xor edi,eax
+ ror r13d,6
+ mov r10d,eax
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor r10d,r15d
+ add ecx,r12d
+ add r10d,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[12+rsp]
+ mov r15d,DWORD[rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add r10d,r14d
+ mov r14d,r15d
+ ror r15d,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor r15d,r14d
+ shr r14d,10
+
+ ror r15d,17
+ xor r12d,r13d
+ xor r15d,r14d
+ add r12d,DWORD[44+rsp]
+
+ add r12d,DWORD[8+rsp]
+ mov r13d,ecx
+ add r12d,r15d
+ mov r14d,r10d
+ ror r13d,14
+ mov r15d,edx
+
+ xor r13d,ecx
+ ror r14d,9
+ xor r15d,r8d
+
+ mov DWORD[8+rsp],r12d
+ xor r14d,r10d
+ and r15d,ecx
+
+ ror r13d,5
+ add r12d,r9d
+ xor r15d,r8d
+
+ ror r14d,11
+ xor r13d,ecx
+ add r12d,r15d
+
+ mov r15d,r10d
+ add r12d,DWORD[rbp]
+ xor r14d,r10d
+
+ xor r15d,r11d
+ ror r13d,6
+ mov r9d,r11d
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor r9d,edi
+ add ebx,r12d
+ add r9d,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[16+rsp]
+ mov edi,DWORD[4+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add r9d,r14d
+ mov r14d,edi
+ ror edi,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor edi,r14d
+ shr r14d,10
+
+ ror edi,17
+ xor r12d,r13d
+ xor edi,r14d
+ add r12d,DWORD[48+rsp]
+
+ add r12d,DWORD[12+rsp]
+ mov r13d,ebx
+ add r12d,edi
+ mov r14d,r9d
+ ror r13d,14
+ mov edi,ecx
+
+ xor r13d,ebx
+ ror r14d,9
+ xor edi,edx
+
+ mov DWORD[12+rsp],r12d
+ xor r14d,r9d
+ and edi,ebx
+
+ ror r13d,5
+ add r12d,r8d
+ xor edi,edx
+
+ ror r14d,11
+ xor r13d,ebx
+ add r12d,edi
+
+ mov edi,r9d
+ add r12d,DWORD[rbp]
+ xor r14d,r9d
+
+ xor edi,r10d
+ ror r13d,6
+ mov r8d,r10d
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor r8d,r15d
+ add eax,r12d
+ add r8d,r12d
+
+ lea rbp,[20+rbp]
+ mov r13d,DWORD[20+rsp]
+ mov r15d,DWORD[8+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add r8d,r14d
+ mov r14d,r15d
+ ror r15d,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor r15d,r14d
+ shr r14d,10
+
+ ror r15d,17
+ xor r12d,r13d
+ xor r15d,r14d
+ add r12d,DWORD[52+rsp]
+
+ add r12d,DWORD[16+rsp]
+ mov r13d,eax
+ add r12d,r15d
+ mov r14d,r8d
+ ror r13d,14
+ mov r15d,ebx
+
+ xor r13d,eax
+ ror r14d,9
+ xor r15d,ecx
+
+ mov DWORD[16+rsp],r12d
+ xor r14d,r8d
+ and r15d,eax
+
+ ror r13d,5
+ add r12d,edx
+ xor r15d,ecx
+
+ ror r14d,11
+ xor r13d,eax
+ add r12d,r15d
+
+ mov r15d,r8d
+ add r12d,DWORD[rbp]
+ xor r14d,r8d
+
+ xor r15d,r9d
+ ror r13d,6
+ mov edx,r9d
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor edx,edi
+ add r11d,r12d
+ add edx,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[24+rsp]
+ mov edi,DWORD[12+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add edx,r14d
+ mov r14d,edi
+ ror edi,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor edi,r14d
+ shr r14d,10
+
+ ror edi,17
+ xor r12d,r13d
+ xor edi,r14d
+ add r12d,DWORD[56+rsp]
+
+ add r12d,DWORD[20+rsp]
+ mov r13d,r11d
+ add r12d,edi
+ mov r14d,edx
+ ror r13d,14
+ mov edi,eax
+
+ xor r13d,r11d
+ ror r14d,9
+ xor edi,ebx
+
+ mov DWORD[20+rsp],r12d
+ xor r14d,edx
+ and edi,r11d
+
+ ror r13d,5
+ add r12d,ecx
+ xor edi,ebx
+
+ ror r14d,11
+ xor r13d,r11d
+ add r12d,edi
+
+ mov edi,edx
+ add r12d,DWORD[rbp]
+ xor r14d,edx
+
+ xor edi,r8d
+ ror r13d,6
+ mov ecx,r8d
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor ecx,r15d
+ add r10d,r12d
+ add ecx,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[28+rsp]
+ mov r15d,DWORD[16+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add ecx,r14d
+ mov r14d,r15d
+ ror r15d,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor r15d,r14d
+ shr r14d,10
+
+ ror r15d,17
+ xor r12d,r13d
+ xor r15d,r14d
+ add r12d,DWORD[60+rsp]
+
+ add r12d,DWORD[24+rsp]
+ mov r13d,r10d
+ add r12d,r15d
+ mov r14d,ecx
+ ror r13d,14
+ mov r15d,r11d
+
+ xor r13d,r10d
+ ror r14d,9
+ xor r15d,eax
+
+ mov DWORD[24+rsp],r12d
+ xor r14d,ecx
+ and r15d,r10d
+
+ ror r13d,5
+ add r12d,ebx
+ xor r15d,eax
+
+ ror r14d,11
+ xor r13d,r10d
+ add r12d,r15d
+
+ mov r15d,ecx
+ add r12d,DWORD[rbp]
+ xor r14d,ecx
+
+ xor r15d,edx
+ ror r13d,6
+ mov ebx,edx
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor ebx,edi
+ add r9d,r12d
+ add ebx,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[32+rsp]
+ mov edi,DWORD[20+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add ebx,r14d
+ mov r14d,edi
+ ror edi,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor edi,r14d
+ shr r14d,10
+
+ ror edi,17
+ xor r12d,r13d
+ xor edi,r14d
+ add r12d,DWORD[rsp]
+
+ add r12d,DWORD[28+rsp]
+ mov r13d,r9d
+ add r12d,edi
+ mov r14d,ebx
+ ror r13d,14
+ mov edi,r10d
+
+ xor r13d,r9d
+ ror r14d,9
+ xor edi,r11d
+
+ mov DWORD[28+rsp],r12d
+ xor r14d,ebx
+ and edi,r9d
+
+ ror r13d,5
+ add r12d,eax
+ xor edi,r11d
+
+ ror r14d,11
+ xor r13d,r9d
+ add r12d,edi
+
+ mov edi,ebx
+ add r12d,DWORD[rbp]
+ xor r14d,ebx
+
+ xor edi,ecx
+ ror r13d,6
+ mov eax,ecx
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor eax,r15d
+ add r8d,r12d
+ add eax,r12d
+
+ lea rbp,[20+rbp]
+ mov r13d,DWORD[36+rsp]
+ mov r15d,DWORD[24+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add eax,r14d
+ mov r14d,r15d
+ ror r15d,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor r15d,r14d
+ shr r14d,10
+
+ ror r15d,17
+ xor r12d,r13d
+ xor r15d,r14d
+ add r12d,DWORD[4+rsp]
+
+ add r12d,DWORD[32+rsp]
+ mov r13d,r8d
+ add r12d,r15d
+ mov r14d,eax
+ ror r13d,14
+ mov r15d,r9d
+
+ xor r13d,r8d
+ ror r14d,9
+ xor r15d,r10d
+
+ mov DWORD[32+rsp],r12d
+ xor r14d,eax
+ and r15d,r8d
+
+ ror r13d,5
+ add r12d,r11d
+ xor r15d,r10d
+
+ ror r14d,11
+ xor r13d,r8d
+ add r12d,r15d
+
+ mov r15d,eax
+ add r12d,DWORD[rbp]
+ xor r14d,eax
+
+ xor r15d,ebx
+ ror r13d,6
+ mov r11d,ebx
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor r11d,edi
+ add edx,r12d
+ add r11d,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[40+rsp]
+ mov edi,DWORD[28+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add r11d,r14d
+ mov r14d,edi
+ ror edi,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor edi,r14d
+ shr r14d,10
+
+ ror edi,17
+ xor r12d,r13d
+ xor edi,r14d
+ add r12d,DWORD[8+rsp]
+
+ add r12d,DWORD[36+rsp]
+ mov r13d,edx
+ add r12d,edi
+ mov r14d,r11d
+ ror r13d,14
+ mov edi,r8d
+
+ xor r13d,edx
+ ror r14d,9
+ xor edi,r9d
+
+ mov DWORD[36+rsp],r12d
+ xor r14d,r11d
+ and edi,edx
+
+ ror r13d,5
+ add r12d,r10d
+ xor edi,r9d
+
+ ror r14d,11
+ xor r13d,edx
+ add r12d,edi
+
+ mov edi,r11d
+ add r12d,DWORD[rbp]
+ xor r14d,r11d
+
+ xor edi,eax
+ ror r13d,6
+ mov r10d,eax
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor r10d,r15d
+ add ecx,r12d
+ add r10d,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[44+rsp]
+ mov r15d,DWORD[32+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add r10d,r14d
+ mov r14d,r15d
+ ror r15d,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor r15d,r14d
+ shr r14d,10
+
+ ror r15d,17
+ xor r12d,r13d
+ xor r15d,r14d
+ add r12d,DWORD[12+rsp]
+
+ add r12d,DWORD[40+rsp]
+ mov r13d,ecx
+ add r12d,r15d
+ mov r14d,r10d
+ ror r13d,14
+ mov r15d,edx
+
+ xor r13d,ecx
+ ror r14d,9
+ xor r15d,r8d
+
+ mov DWORD[40+rsp],r12d
+ xor r14d,r10d
+ and r15d,ecx
+
+ ror r13d,5
+ add r12d,r9d
+ xor r15d,r8d
+
+ ror r14d,11
+ xor r13d,ecx
+ add r12d,r15d
+
+ mov r15d,r10d
+ add r12d,DWORD[rbp]
+ xor r14d,r10d
+
+ xor r15d,r11d
+ ror r13d,6
+ mov r9d,r11d
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor r9d,edi
+ add ebx,r12d
+ add r9d,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[48+rsp]
+ mov edi,DWORD[36+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add r9d,r14d
+ mov r14d,edi
+ ror edi,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor edi,r14d
+ shr r14d,10
+
+ ror edi,17
+ xor r12d,r13d
+ xor edi,r14d
+ add r12d,DWORD[16+rsp]
+
+ add r12d,DWORD[44+rsp]
+ mov r13d,ebx
+ add r12d,edi
+ mov r14d,r9d
+ ror r13d,14
+ mov edi,ecx
+
+ xor r13d,ebx
+ ror r14d,9
+ xor edi,edx
+
+ mov DWORD[44+rsp],r12d
+ xor r14d,r9d
+ and edi,ebx
+
+ ror r13d,5
+ add r12d,r8d
+ xor edi,edx
+
+ ror r14d,11
+ xor r13d,ebx
+ add r12d,edi
+
+ mov edi,r9d
+ add r12d,DWORD[rbp]
+ xor r14d,r9d
+
+ xor edi,r10d
+ ror r13d,6
+ mov r8d,r10d
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor r8d,r15d
+ add eax,r12d
+ add r8d,r12d
+
+ lea rbp,[20+rbp]
+ mov r13d,DWORD[52+rsp]
+ mov r15d,DWORD[40+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add r8d,r14d
+ mov r14d,r15d
+ ror r15d,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor r15d,r14d
+ shr r14d,10
+
+ ror r15d,17
+ xor r12d,r13d
+ xor r15d,r14d
+ add r12d,DWORD[20+rsp]
+
+ add r12d,DWORD[48+rsp]
+ mov r13d,eax
+ add r12d,r15d
+ mov r14d,r8d
+ ror r13d,14
+ mov r15d,ebx
+
+ xor r13d,eax
+ ror r14d,9
+ xor r15d,ecx
+
+ mov DWORD[48+rsp],r12d
+ xor r14d,r8d
+ and r15d,eax
+
+ ror r13d,5
+ add r12d,edx
+ xor r15d,ecx
+
+ ror r14d,11
+ xor r13d,eax
+ add r12d,r15d
+
+ mov r15d,r8d
+ add r12d,DWORD[rbp]
+ xor r14d,r8d
+
+ xor r15d,r9d
+ ror r13d,6
+ mov edx,r9d
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor edx,edi
+ add r11d,r12d
+ add edx,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[56+rsp]
+ mov edi,DWORD[44+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add edx,r14d
+ mov r14d,edi
+ ror edi,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor edi,r14d
+ shr r14d,10
+
+ ror edi,17
+ xor r12d,r13d
+ xor edi,r14d
+ add r12d,DWORD[24+rsp]
+
+ add r12d,DWORD[52+rsp]
+ mov r13d,r11d
+ add r12d,edi
+ mov r14d,edx
+ ror r13d,14
+ mov edi,eax
+
+ xor r13d,r11d
+ ror r14d,9
+ xor edi,ebx
+
+ mov DWORD[52+rsp],r12d
+ xor r14d,edx
+ and edi,r11d
+
+ ror r13d,5
+ add r12d,ecx
+ xor edi,ebx
+
+ ror r14d,11
+ xor r13d,r11d
+ add r12d,edi
+
+ mov edi,edx
+ add r12d,DWORD[rbp]
+ xor r14d,edx
+
+ xor edi,r8d
+ ror r13d,6
+ mov ecx,r8d
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor ecx,r15d
+ add r10d,r12d
+ add ecx,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[60+rsp]
+ mov r15d,DWORD[48+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add ecx,r14d
+ mov r14d,r15d
+ ror r15d,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor r15d,r14d
+ shr r14d,10
+
+ ror r15d,17
+ xor r12d,r13d
+ xor r15d,r14d
+ add r12d,DWORD[28+rsp]
+
+ add r12d,DWORD[56+rsp]
+ mov r13d,r10d
+ add r12d,r15d
+ mov r14d,ecx
+ ror r13d,14
+ mov r15d,r11d
+
+ xor r13d,r10d
+ ror r14d,9
+ xor r15d,eax
+
+ mov DWORD[56+rsp],r12d
+ xor r14d,ecx
+ and r15d,r10d
+
+ ror r13d,5
+ add r12d,ebx
+ xor r15d,eax
+
+ ror r14d,11
+ xor r13d,r10d
+ add r12d,r15d
+
+ mov r15d,ecx
+ add r12d,DWORD[rbp]
+ xor r14d,ecx
+
+ xor r15d,edx
+ ror r13d,6
+ mov ebx,edx
+
+ and edi,r15d
+ ror r14d,2
+ add r12d,r13d
+
+ xor ebx,edi
+ add r9d,r12d
+ add ebx,r12d
+
+ lea rbp,[4+rbp]
+ mov r13d,DWORD[rsp]
+ mov edi,DWORD[52+rsp]
+
+ mov r12d,r13d
+ ror r13d,11
+ add ebx,r14d
+ mov r14d,edi
+ ror edi,2
+
+ xor r13d,r12d
+ shr r12d,3
+ ror r13d,7
+ xor edi,r14d
+ shr r14d,10
+
+ ror edi,17
+ xor r12d,r13d
+ xor edi,r14d
+ add r12d,DWORD[32+rsp]
+
+ add r12d,DWORD[60+rsp]
+ mov r13d,r9d
+ add r12d,edi
+ mov r14d,ebx
+ ror r13d,14
+ mov edi,r10d
+
+ xor r13d,r9d
+ ror r14d,9
+ xor edi,r11d
+
+ mov DWORD[60+rsp],r12d
+ xor r14d,ebx
+ and edi,r9d
+
+ ror r13d,5
+ add r12d,eax
+ xor edi,r11d
+
+ ror r14d,11
+ xor r13d,r9d
+ add r12d,edi
+
+ mov edi,ebx
+ add r12d,DWORD[rbp]
+ xor r14d,ebx
+
+ xor edi,ecx
+ ror r13d,6
+ mov eax,ecx
+
+ and r15d,edi
+ ror r14d,2
+ add r12d,r13d
+
+ xor eax,r15d
+ add r8d,r12d
+ add eax,r12d
+
+ lea rbp,[20+rbp]
+ cmp BYTE[3+rbp],0
+ jnz NEAR $L$rounds_16_xx
+
+ mov rdi,QWORD[((64+0))+rsp]
+ add eax,r14d
+ lea rsi,[64+rsi]
+
+ add eax,DWORD[rdi]
+ add ebx,DWORD[4+rdi]
+ add ecx,DWORD[8+rdi]
+ add edx,DWORD[12+rdi]
+ add r8d,DWORD[16+rdi]
+ add r9d,DWORD[20+rdi]
+ add r10d,DWORD[24+rdi]
+ add r11d,DWORD[28+rdi]
+
+ cmp rsi,QWORD[((64+16))+rsp]
+
+ mov DWORD[rdi],eax
+ mov DWORD[4+rdi],ebx
+ mov DWORD[8+rdi],ecx
+ mov DWORD[12+rdi],edx
+ mov DWORD[16+rdi],r8d
+ mov DWORD[20+rdi],r9d
+ mov DWORD[24+rdi],r10d
+ mov DWORD[28+rdi],r11d
+ jb NEAR $L$loop
+
+ mov rsi,QWORD[88+rsp]
+
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_sha256_block_data_order_nohw:
+section .rdata rdata align=8
+ALIGN 64
+
+K256:
+ DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+ DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+ DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+ DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+ DD 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+ DD 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+ DD 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+ DD 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+ DB 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
+ DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54
+ DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+ DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+ DB 111,114,103,62,0
+section .text
+
+global sha256_block_data_order_hw
+
+ALIGN 64
+sha256_block_data_order_hw:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha256_block_data_order_hw:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ lea rsp,[((-88))+rsp]
+ movaps XMMWORD[(-8-80)+rax],xmm6
+ movaps XMMWORD[(-8-64)+rax],xmm7
+ movaps XMMWORD[(-8-48)+rax],xmm8
+ movaps XMMWORD[(-8-32)+rax],xmm9
+ movaps XMMWORD[(-8-16)+rax],xmm10
+$L$prologue_shaext:
+ lea rcx,[((K256+128))]
+ movdqu xmm1,XMMWORD[rdi]
+ movdqu xmm2,XMMWORD[16+rdi]
+ movdqa xmm7,XMMWORD[((512-128))+rcx]
+
+ pshufd xmm0,xmm1,0x1b
+ pshufd xmm1,xmm1,0xb1
+ pshufd xmm2,xmm2,0x1b
+ movdqa xmm8,xmm7
+DB 102,15,58,15,202,8
+ punpcklqdq xmm2,xmm0
+ jmp NEAR $L$oop_shaext
+
+ALIGN 16
+$L$oop_shaext:
+ movdqu xmm3,XMMWORD[rsi]
+ movdqu xmm4,XMMWORD[16+rsi]
+ movdqu xmm5,XMMWORD[32+rsi]
+DB 102,15,56,0,223
+ movdqu xmm6,XMMWORD[48+rsi]
+
+ movdqa xmm0,XMMWORD[((0-128))+rcx]
+ paddd xmm0,xmm3
+DB 102,15,56,0,231
+ movdqa xmm10,xmm2
+ DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ nop
+ movdqa xmm9,xmm1
+ DB 15,56,203,202
+
+ movdqa xmm0,XMMWORD[((32-128))+rcx]
+ paddd xmm0,xmm4
+DB 102,15,56,0,239
+ DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ lea rsi,[64+rsi]
+ DB 15,56,204,220
+ DB 15,56,203,202
+
+ movdqa xmm0,XMMWORD[((64-128))+rcx]
+ paddd xmm0,xmm5
+DB 102,15,56,0,247
+ DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm6
+DB 102,15,58,15,253,4
+ nop
+ paddd xmm3,xmm7
+ DB 15,56,204,229
+ DB 15,56,203,202
+
+ movdqa xmm0,XMMWORD[((96-128))+rcx]
+ paddd xmm0,xmm6
+ DB 15,56,205,222
+ DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm3
+DB 102,15,58,15,254,4
+ nop
+ paddd xmm4,xmm7
+ DB 15,56,204,238
+ DB 15,56,203,202
+ movdqa xmm0,XMMWORD[((128-128))+rcx]
+ paddd xmm0,xmm3
+ DB 15,56,205,227
+ DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm4
+DB 102,15,58,15,251,4
+ nop
+ paddd xmm5,xmm7
+ DB 15,56,204,243
+ DB 15,56,203,202
+ movdqa xmm0,XMMWORD[((160-128))+rcx]
+ paddd xmm0,xmm4
+ DB 15,56,205,236
+ DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm5
+DB 102,15,58,15,252,4
+ nop
+ paddd xmm6,xmm7
+ DB 15,56,204,220
+ DB 15,56,203,202
+ movdqa xmm0,XMMWORD[((192-128))+rcx]
+ paddd xmm0,xmm5
+ DB 15,56,205,245
+ DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm6
+DB 102,15,58,15,253,4
+ nop
+ paddd xmm3,xmm7
+ DB 15,56,204,229
+ DB 15,56,203,202
+ movdqa xmm0,XMMWORD[((224-128))+rcx]
+ paddd xmm0,xmm6
+ DB 15,56,205,222
+ DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm3
+DB 102,15,58,15,254,4
+ nop
+ paddd xmm4,xmm7
+ DB 15,56,204,238
+ DB 15,56,203,202
+ movdqa xmm0,XMMWORD[((256-128))+rcx]
+ paddd xmm0,xmm3
+ DB 15,56,205,227
+ DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm4
+DB 102,15,58,15,251,4
+ nop
+ paddd xmm5,xmm7
+ DB 15,56,204,243
+ DB 15,56,203,202
+ movdqa xmm0,XMMWORD[((288-128))+rcx]
+ paddd xmm0,xmm4
+ DB 15,56,205,236
+ DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm5
+DB 102,15,58,15,252,4
+ nop
+ paddd xmm6,xmm7
+ DB 15,56,204,220
+ DB 15,56,203,202
+ movdqa xmm0,XMMWORD[((320-128))+rcx]
+ paddd xmm0,xmm5
+ DB 15,56,205,245
+ DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm6
+DB 102,15,58,15,253,4
+ nop
+ paddd xmm3,xmm7
+ DB 15,56,204,229
+ DB 15,56,203,202
+ movdqa xmm0,XMMWORD[((352-128))+rcx]
+ paddd xmm0,xmm6
+ DB 15,56,205,222
+ DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm3
+DB 102,15,58,15,254,4
+ nop
+ paddd xmm4,xmm7
+ DB 15,56,204,238
+ DB 15,56,203,202
+ movdqa xmm0,XMMWORD[((384-128))+rcx]
+ paddd xmm0,xmm3
+ DB 15,56,205,227
+ DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm4
+DB 102,15,58,15,251,4
+ nop
+ paddd xmm5,xmm7
+ DB 15,56,204,243
+ DB 15,56,203,202
+ movdqa xmm0,XMMWORD[((416-128))+rcx]
+ paddd xmm0,xmm4
+ DB 15,56,205,236
+ DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm5
+DB 102,15,58,15,252,4
+ DB 15,56,203,202
+ paddd xmm6,xmm7
+
+ movdqa xmm0,XMMWORD[((448-128))+rcx]
+ paddd xmm0,xmm5
+ DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ DB 15,56,205,245
+ movdqa xmm7,xmm8
+ DB 15,56,203,202
+
+ movdqa xmm0,XMMWORD[((480-128))+rcx]
+ paddd xmm0,xmm6
+ nop
+ DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ dec rdx
+ nop
+ DB 15,56,203,202
+
+ paddd xmm2,xmm10
+ paddd xmm1,xmm9
+ jnz NEAR $L$oop_shaext
+
+ pshufd xmm2,xmm2,0xb1
+ pshufd xmm7,xmm1,0x1b
+ pshufd xmm1,xmm1,0xb1
+ punpckhqdq xmm1,xmm2
+DB 102,15,58,15,215,8
+
+ movdqu XMMWORD[rdi],xmm1
+ movdqu XMMWORD[16+rdi],xmm2
+ movaps xmm6,XMMWORD[((-8-80))+rax]
+ movaps xmm7,XMMWORD[((-8-64))+rax]
+ movaps xmm8,XMMWORD[((-8-48))+rax]
+ movaps xmm9,XMMWORD[((-8-32))+rax]
+ movaps xmm10,XMMWORD[((-8-16))+rax]
+ mov rsp,rax
+$L$epilogue_shaext:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_sha256_block_data_order_hw:
+global sha256_block_data_order_ssse3
+
+ALIGN 64
+sha256_block_data_order_ssse3:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha256_block_data_order_ssse3:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ mov rax,rsp
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ shl rdx,4
+ sub rsp,160
+ lea rdx,[rdx*4+rsi]
+ and rsp,-64
+ mov QWORD[((64+0))+rsp],rdi
+ mov QWORD[((64+8))+rsp],rsi
+ mov QWORD[((64+16))+rsp],rdx
+ mov QWORD[88+rsp],rax
+
+ movaps XMMWORD[(64+32)+rsp],xmm6
+ movaps XMMWORD[(64+48)+rsp],xmm7
+ movaps XMMWORD[(64+64)+rsp],xmm8
+ movaps XMMWORD[(64+80)+rsp],xmm9
+$L$prologue_ssse3:
+
+ mov eax,DWORD[rdi]
+ mov ebx,DWORD[4+rdi]
+ mov ecx,DWORD[8+rdi]
+ mov edx,DWORD[12+rdi]
+ mov r8d,DWORD[16+rdi]
+ mov r9d,DWORD[20+rdi]
+ mov r10d,DWORD[24+rdi]
+ mov r11d,DWORD[28+rdi]
+
+
+ jmp NEAR $L$loop_ssse3
+ALIGN 16
+$L$loop_ssse3:
+ movdqa xmm7,XMMWORD[((K256+512))]
+ movdqu xmm0,XMMWORD[rsi]
+ movdqu xmm1,XMMWORD[16+rsi]
+ movdqu xmm2,XMMWORD[32+rsi]
+DB 102,15,56,0,199
+ movdqu xmm3,XMMWORD[48+rsi]
+ lea rbp,[K256]
+DB 102,15,56,0,207
+ movdqa xmm4,XMMWORD[rbp]
+ movdqa xmm5,XMMWORD[32+rbp]
+DB 102,15,56,0,215
+ paddd xmm4,xmm0
+ movdqa xmm6,XMMWORD[64+rbp]
+DB 102,15,56,0,223
+ movdqa xmm7,XMMWORD[96+rbp]
+ paddd xmm5,xmm1
+ paddd xmm6,xmm2
+ paddd xmm7,xmm3
+ movdqa XMMWORD[rsp],xmm4
+ mov r14d,eax
+ movdqa XMMWORD[16+rsp],xmm5
+ mov edi,ebx
+ movdqa XMMWORD[32+rsp],xmm6
+ xor edi,ecx
+ movdqa XMMWORD[48+rsp],xmm7
+ mov r13d,r8d
+ jmp NEAR $L$ssse3_00_47
+
+ALIGN 16
+$L$ssse3_00_47:
+ sub rbp,-128
+ ror r13d,14
+ movdqa xmm4,xmm1
+ mov eax,r14d
+ mov r12d,r9d
+ movdqa xmm7,xmm3
+ ror r14d,9
+ xor r13d,r8d
+ xor r12d,r10d
+ ror r13d,5
+ xor r14d,eax
+DB 102,15,58,15,224,4
+ and r12d,r8d
+ xor r13d,r8d
+DB 102,15,58,15,250,4
+ add r11d,DWORD[rsp]
+ mov r15d,eax
+ xor r12d,r10d
+ ror r14d,11
+ movdqa xmm5,xmm4
+ xor r15d,ebx
+ add r11d,r12d
+ movdqa xmm6,xmm4
+ ror r13d,6
+ and edi,r15d
+ psrld xmm4,3
+ xor r14d,eax
+ add r11d,r13d
+ xor edi,ebx
+ paddd xmm0,xmm7
+ ror r14d,2
+ add edx,r11d
+ psrld xmm6,7
+ add r11d,edi
+ mov r13d,edx
+ pshufd xmm7,xmm3,250
+ add r14d,r11d
+ ror r13d,14
+ pslld xmm5,14
+ mov r11d,r14d
+ mov r12d,r8d
+ pxor xmm4,xmm6
+ ror r14d,9
+ xor r13d,edx
+ xor r12d,r9d
+ ror r13d,5
+ psrld xmm6,11
+ xor r14d,r11d
+ pxor xmm4,xmm5
+ and r12d,edx
+ xor r13d,edx
+ pslld xmm5,11
+ add r10d,DWORD[4+rsp]
+ mov edi,r11d
+ pxor xmm4,xmm6
+ xor r12d,r9d
+ ror r14d,11
+ movdqa xmm6,xmm7
+ xor edi,eax
+ add r10d,r12d
+ pxor xmm4,xmm5
+ ror r13d,6
+ and r15d,edi
+ xor r14d,r11d
+ psrld xmm7,10
+ add r10d,r13d
+ xor r15d,eax
+ paddd xmm0,xmm4
+ ror r14d,2
+ add ecx,r10d
+ psrlq xmm6,17
+ add r10d,r15d
+ mov r13d,ecx
+ add r14d,r10d
+ pxor xmm7,xmm6
+ ror r13d,14
+ mov r10d,r14d
+ mov r12d,edx
+ ror r14d,9
+ psrlq xmm6,2
+ xor r13d,ecx
+ xor r12d,r8d
+ pxor xmm7,xmm6
+ ror r13d,5
+ xor r14d,r10d
+ and r12d,ecx
+ pshufd xmm7,xmm7,128
+ xor r13d,ecx
+ add r9d,DWORD[8+rsp]
+ mov r15d,r10d
+ psrldq xmm7,8
+ xor r12d,r8d
+ ror r14d,11
+ xor r15d,r11d
+ add r9d,r12d
+ ror r13d,6
+ paddd xmm0,xmm7
+ and edi,r15d
+ xor r14d,r10d
+ add r9d,r13d
+ pshufd xmm7,xmm0,80
+ xor edi,r11d
+ ror r14d,2
+ add ebx,r9d
+ movdqa xmm6,xmm7
+ add r9d,edi
+ mov r13d,ebx
+ psrld xmm7,10
+ add r14d,r9d
+ ror r13d,14
+ psrlq xmm6,17
+ mov r9d,r14d
+ mov r12d,ecx
+ pxor xmm7,xmm6
+ ror r14d,9
+ xor r13d,ebx
+ xor r12d,edx
+ ror r13d,5
+ xor r14d,r9d
+ psrlq xmm6,2
+ and r12d,ebx
+ xor r13d,ebx
+ add r8d,DWORD[12+rsp]
+ pxor xmm7,xmm6
+ mov edi,r9d
+ xor r12d,edx
+ ror r14d,11
+ pshufd xmm7,xmm7,8
+ xor edi,r10d
+ add r8d,r12d
+ movdqa xmm6,XMMWORD[rbp]
+ ror r13d,6
+ and r15d,edi
+ pslldq xmm7,8
+ xor r14d,r9d
+ add r8d,r13d
+ xor r15d,r10d
+ paddd xmm0,xmm7
+ ror r14d,2
+ add eax,r8d
+ add r8d,r15d
+ paddd xmm6,xmm0
+ mov r13d,eax
+ add r14d,r8d
+ movdqa XMMWORD[rsp],xmm6
+ ror r13d,14
+ movdqa xmm4,xmm2
+ mov r8d,r14d
+ mov r12d,ebx
+ movdqa xmm7,xmm0
+ ror r14d,9
+ xor r13d,eax
+ xor r12d,ecx
+ ror r13d,5
+ xor r14d,r8d
+DB 102,15,58,15,225,4
+ and r12d,eax
+ xor r13d,eax
+DB 102,15,58,15,251,4
+ add edx,DWORD[16+rsp]
+ mov r15d,r8d
+ xor r12d,ecx
+ ror r14d,11
+ movdqa xmm5,xmm4
+ xor r15d,r9d
+ add edx,r12d
+ movdqa xmm6,xmm4
+ ror r13d,6
+ and edi,r15d
+ psrld xmm4,3
+ xor r14d,r8d
+ add edx,r13d
+ xor edi,r9d
+ paddd xmm1,xmm7
+ ror r14d,2
+ add r11d,edx
+ psrld xmm6,7
+ add edx,edi
+ mov r13d,r11d
+ pshufd xmm7,xmm0,250
+ add r14d,edx
+ ror r13d,14
+ pslld xmm5,14
+ mov edx,r14d
+ mov r12d,eax
+ pxor xmm4,xmm6
+ ror r14d,9
+ xor r13d,r11d
+ xor r12d,ebx
+ ror r13d,5
+ psrld xmm6,11
+ xor r14d,edx
+ pxor xmm4,xmm5
+ and r12d,r11d
+ xor r13d,r11d
+ pslld xmm5,11
+ add ecx,DWORD[20+rsp]
+ mov edi,edx
+ pxor xmm4,xmm6
+ xor r12d,ebx
+ ror r14d,11
+ movdqa xmm6,xmm7
+ xor edi,r8d
+ add ecx,r12d
+ pxor xmm4,xmm5
+ ror r13d,6
+ and r15d,edi
+ xor r14d,edx
+ psrld xmm7,10
+ add ecx,r13d
+ xor r15d,r8d
+ paddd xmm1,xmm4
+ ror r14d,2
+ add r10d,ecx
+ psrlq xmm6,17
+ add ecx,r15d
+ mov r13d,r10d
+ add r14d,ecx
+ pxor xmm7,xmm6
+ ror r13d,14
+ mov ecx,r14d
+ mov r12d,r11d
+ ror r14d,9
+ psrlq xmm6,2
+ xor r13d,r10d
+ xor r12d,eax
+ pxor xmm7,xmm6
+ ror r13d,5
+ xor r14d,ecx
+ and r12d,r10d
+ pshufd xmm7,xmm7,128
+ xor r13d,r10d
+ add ebx,DWORD[24+rsp]
+ mov r15d,ecx
+ psrldq xmm7,8
+ xor r12d,eax
+ ror r14d,11
+ xor r15d,edx
+ add ebx,r12d
+ ror r13d,6
+ paddd xmm1,xmm7
+ and edi,r15d
+ xor r14d,ecx
+ add ebx,r13d
+ pshufd xmm7,xmm1,80
+ xor edi,edx
+ ror r14d,2
+ add r9d,ebx
+ movdqa xmm6,xmm7
+ add ebx,edi
+ mov r13d,r9d
+ psrld xmm7,10
+ add r14d,ebx
+ ror r13d,14
+ psrlq xmm6,17
+ mov ebx,r14d
+ mov r12d,r10d
+ pxor xmm7,xmm6
+ ror r14d,9
+ xor r13d,r9d
+ xor r12d,r11d
+ ror r13d,5
+ xor r14d,ebx
+ psrlq xmm6,2
+ and r12d,r9d
+ xor r13d,r9d
+ add eax,DWORD[28+rsp]
+ pxor xmm7,xmm6
+ mov edi,ebx
+ xor r12d,r11d
+ ror r14d,11
+ pshufd xmm7,xmm7,8
+ xor edi,ecx
+ add eax,r12d
+ movdqa xmm6,XMMWORD[32+rbp]
+ ror r13d,6
+ and r15d,edi
+ pslldq xmm7,8
+ xor r14d,ebx
+ add eax,r13d
+ xor r15d,ecx
+ paddd xmm1,xmm7
+ ror r14d,2
+ add r8d,eax
+ add eax,r15d
+ paddd xmm6,xmm1
+ mov r13d,r8d
+ add r14d,eax
+ movdqa XMMWORD[16+rsp],xmm6
+ ror r13d,14
+ movdqa xmm4,xmm3
+ mov eax,r14d
+ mov r12d,r9d
+ movdqa xmm7,xmm1
+ ror r14d,9
+ xor r13d,r8d
+ xor r12d,r10d
+ ror r13d,5
+ xor r14d,eax
+DB 102,15,58,15,226,4
+ and r12d,r8d
+ xor r13d,r8d
+DB 102,15,58,15,248,4
+ add r11d,DWORD[32+rsp]
+ mov r15d,eax
+ xor r12d,r10d
+ ror r14d,11
+ movdqa xmm5,xmm4
+ xor r15d,ebx
+ add r11d,r12d
+ movdqa xmm6,xmm4
+ ror r13d,6
+ and edi,r15d
+ psrld xmm4,3
+ xor r14d,eax
+ add r11d,r13d
+ xor edi,ebx
+ paddd xmm2,xmm7
+ ror r14d,2
+ add edx,r11d
+ psrld xmm6,7
+ add r11d,edi
+ mov r13d,edx
+ pshufd xmm7,xmm1,250
+ add r14d,r11d
+ ror r13d,14
+ pslld xmm5,14
+ mov r11d,r14d
+ mov r12d,r8d
+ pxor xmm4,xmm6
+ ror r14d,9
+ xor r13d,edx
+ xor r12d,r9d
+ ror r13d,5
+ psrld xmm6,11
+ xor r14d,r11d
+ pxor xmm4,xmm5
+ and r12d,edx
+ xor r13d,edx
+ pslld xmm5,11
+ add r10d,DWORD[36+rsp]
+ mov edi,r11d
+ pxor xmm4,xmm6
+ xor r12d,r9d
+ ror r14d,11
+ movdqa xmm6,xmm7
+ xor edi,eax
+ add r10d,r12d
+ pxor xmm4,xmm5
+ ror r13d,6
+ and r15d,edi
+ xor r14d,r11d
+ psrld xmm7,10
+ add r10d,r13d
+ xor r15d,eax
+ paddd xmm2,xmm4
+ ror r14d,2
+ add ecx,r10d
+ psrlq xmm6,17
+ add r10d,r15d
+ mov r13d,ecx
+ add r14d,r10d
+ pxor xmm7,xmm6
+ ror r13d,14
+ mov r10d,r14d
+ mov r12d,edx
+ ror r14d,9
+ psrlq xmm6,2
+ xor r13d,ecx
+ xor r12d,r8d
+ pxor xmm7,xmm6
+ ror r13d,5
+ xor r14d,r10d
+ and r12d,ecx
+ pshufd xmm7,xmm7,128
+ xor r13d,ecx
+ add r9d,DWORD[40+rsp]
+ mov r15d,r10d
+ psrldq xmm7,8
+ xor r12d,r8d
+ ror r14d,11
+ xor r15d,r11d
+ add r9d,r12d
+ ror r13d,6
+ paddd xmm2,xmm7
+ and edi,r15d
+ xor r14d,r10d
+ add r9d,r13d
+ pshufd xmm7,xmm2,80
+ xor edi,r11d
+ ror r14d,2
+ add ebx,r9d
+ movdqa xmm6,xmm7
+ add r9d,edi
+ mov r13d,ebx
+ psrld xmm7,10
+ add r14d,r9d
+ ror r13d,14
+ psrlq xmm6,17
+ mov r9d,r14d
+ mov r12d,ecx
+ pxor xmm7,xmm6
+ ror r14d,9
+ xor r13d,ebx
+ xor r12d,edx
+ ror r13d,5
+ xor r14d,r9d
+ psrlq xmm6,2
+ and r12d,ebx
+ xor r13d,ebx
+ add r8d,DWORD[44+rsp]
+ pxor xmm7,xmm6
+ mov edi,r9d
+ xor r12d,edx
+ ror r14d,11
+ pshufd xmm7,xmm7,8
+ xor edi,r10d
+ add r8d,r12d
+ movdqa xmm6,XMMWORD[64+rbp]
+ ror r13d,6
+ and r15d,edi
+ pslldq xmm7,8
+ xor r14d,r9d
+ add r8d,r13d
+ xor r15d,r10d
+ paddd xmm2,xmm7
+ ror r14d,2
+ add eax,r8d
+ add r8d,r15d
+ paddd xmm6,xmm2
+ mov r13d,eax
+ add r14d,r8d
+ movdqa XMMWORD[32+rsp],xmm6
+ ror r13d,14
+ movdqa xmm4,xmm0
+ mov r8d,r14d
+ mov r12d,ebx
+ movdqa xmm7,xmm2
+ ror r14d,9
+ xor r13d,eax
+ xor r12d,ecx
+ ror r13d,5
+ xor r14d,r8d
+DB 102,15,58,15,227,4
+ and r12d,eax
+ xor r13d,eax
+DB 102,15,58,15,249,4
+ add edx,DWORD[48+rsp]
+ mov r15d,r8d
+ xor r12d,ecx
+ ror r14d,11
+ movdqa xmm5,xmm4
+ xor r15d,r9d
+ add edx,r12d
+ movdqa xmm6,xmm4
+ ror r13d,6
+ and edi,r15d
+ psrld xmm4,3
+ xor r14d,r8d
+ add edx,r13d
+ xor edi,r9d
+ paddd xmm3,xmm7
+ ror r14d,2
+ add r11d,edx
+ psrld xmm6,7
+ add edx,edi
+ mov r13d,r11d
+ pshufd xmm7,xmm2,250
+ add r14d,edx
+ ror r13d,14
+ pslld xmm5,14
+ mov edx,r14d
+ mov r12d,eax
+ pxor xmm4,xmm6
+ ror r14d,9
+ xor r13d,r11d
+ xor r12d,ebx
+ ror r13d,5
+ psrld xmm6,11
+ xor r14d,edx
+ pxor xmm4,xmm5
+ and r12d,r11d
+ xor r13d,r11d
+ pslld xmm5,11
+ add ecx,DWORD[52+rsp]
+ mov edi,edx
+ pxor xmm4,xmm6
+ xor r12d,ebx
+ ror r14d,11
+ movdqa xmm6,xmm7
+ xor edi,r8d
+ add ecx,r12d
+ pxor xmm4,xmm5
+ ror r13d,6
+ and r15d,edi
+ xor r14d,edx
+ psrld xmm7,10
+ add ecx,r13d
+ xor r15d,r8d
+ paddd xmm3,xmm4
+ ror r14d,2
+ add r10d,ecx
+ psrlq xmm6,17
+ add ecx,r15d
+ mov r13d,r10d
+ add r14d,ecx
+ pxor xmm7,xmm6
+ ror r13d,14
+ mov ecx,r14d
+ mov r12d,r11d
+ ror r14d,9
+ psrlq xmm6,2
+ xor r13d,r10d
+ xor r12d,eax
+ pxor xmm7,xmm6
+ ror r13d,5
+ xor r14d,ecx
+ and r12d,r10d
+ pshufd xmm7,xmm7,128
+ xor r13d,r10d
+ add ebx,DWORD[56+rsp]
+ mov r15d,ecx
+ psrldq xmm7,8
+ xor r12d,eax
+ ror r14d,11
+ xor r15d,edx
+ add ebx,r12d
+ ror r13d,6
+ paddd xmm3,xmm7
+ and edi,r15d
+ xor r14d,ecx
+ add ebx,r13d
+ pshufd xmm7,xmm3,80
+ xor edi,edx
+ ror r14d,2
+ add r9d,ebx
+ movdqa xmm6,xmm7
+ add ebx,edi
+ mov r13d,r9d
+ psrld xmm7,10
+ add r14d,ebx
+ ror r13d,14
+ psrlq xmm6,17
+ mov ebx,r14d
+ mov r12d,r10d
+ pxor xmm7,xmm6
+ ror r14d,9
+ xor r13d,r9d
+ xor r12d,r11d
+ ror r13d,5
+ xor r14d,ebx
+ psrlq xmm6,2
+ and r12d,r9d
+ xor r13d,r9d
+ add eax,DWORD[60+rsp]
+ pxor xmm7,xmm6
+ mov edi,ebx
+ xor r12d,r11d
+ ror r14d,11
+ pshufd xmm7,xmm7,8
+ xor edi,ecx
+ add eax,r12d
+ movdqa xmm6,XMMWORD[96+rbp]
+ ror r13d,6
+ and r15d,edi
+ pslldq xmm7,8
+ xor r14d,ebx
+ add eax,r13d
+ xor r15d,ecx
+ paddd xmm3,xmm7
+ ror r14d,2
+ add r8d,eax
+ add eax,r15d
+ paddd xmm6,xmm3
+ mov r13d,r8d
+ add r14d,eax
+ movdqa XMMWORD[48+rsp],xmm6
+ cmp BYTE[131+rbp],0
+ jne NEAR $L$ssse3_00_47
+ ror r13d,14
+ mov eax,r14d
+ mov r12d,r9d
+ ror r14d,9
+ xor r13d,r8d
+ xor r12d,r10d
+ ror r13d,5
+ xor r14d,eax
+ and r12d,r8d
+ xor r13d,r8d
+ add r11d,DWORD[rsp]
+ mov r15d,eax
+ xor r12d,r10d
+ ror r14d,11
+ xor r15d,ebx
+ add r11d,r12d
+ ror r13d,6
+ and edi,r15d
+ xor r14d,eax
+ add r11d,r13d
+ xor edi,ebx
+ ror r14d,2
+ add edx,r11d
+ add r11d,edi
+ mov r13d,edx
+ add r14d,r11d
+ ror r13d,14
+ mov r11d,r14d
+ mov r12d,r8d
+ ror r14d,9
+ xor r13d,edx
+ xor r12d,r9d
+ ror r13d,5
+ xor r14d,r11d
+ and r12d,edx
+ xor r13d,edx
+ add r10d,DWORD[4+rsp]
+ mov edi,r11d
+ xor r12d,r9d
+ ror r14d,11
+ xor edi,eax
+ add r10d,r12d
+ ror r13d,6
+ and r15d,edi
+ xor r14d,r11d
+ add r10d,r13d
+ xor r15d,eax
+ ror r14d,2
+ add ecx,r10d
+ add r10d,r15d
+ mov r13d,ecx
+ add r14d,r10d
+ ror r13d,14
+ mov r10d,r14d
+ mov r12d,edx
+ ror r14d,9
+ xor r13d,ecx
+ xor r12d,r8d
+ ror r13d,5
+ xor r14d,r10d
+ and r12d,ecx
+ xor r13d,ecx
+ add r9d,DWORD[8+rsp]
+ mov r15d,r10d
+ xor r12d,r8d
+ ror r14d,11
+ xor r15d,r11d
+ add r9d,r12d
+ ror r13d,6
+ and edi,r15d
+ xor r14d,r10d
+ add r9d,r13d
+ xor edi,r11d
+ ror r14d,2
+ add ebx,r9d
+ add r9d,edi
+ mov r13d,ebx
+ add r14d,r9d
+ ror r13d,14
+ mov r9d,r14d
+ mov r12d,ecx
+ ror r14d,9
+ xor r13d,ebx
+ xor r12d,edx
+ ror r13d,5
+ xor r14d,r9d
+ and r12d,ebx
+ xor r13d,ebx
+ add r8d,DWORD[12+rsp]
+ mov edi,r9d
+ xor r12d,edx
+ ror r14d,11
+ xor edi,r10d
+ add r8d,r12d
+ ror r13d,6
+ and r15d,edi
+ xor r14d,r9d
+ add r8d,r13d
+ xor r15d,r10d
+ ror r14d,2
+ add eax,r8d
+ add r8d,r15d
+ mov r13d,eax
+ add r14d,r8d
+ ror r13d,14
+ mov r8d,r14d
+ mov r12d,ebx
+ ror r14d,9
+ xor r13d,eax
+ xor r12d,ecx
+ ror r13d,5
+ xor r14d,r8d
+ and r12d,eax
+ xor r13d,eax
+ add edx,DWORD[16+rsp]
+ mov r15d,r8d
+ xor r12d,ecx
+ ror r14d,11
+ xor r15d,r9d
+ add edx,r12d
+ ror r13d,6
+ and edi,r15d
+ xor r14d,r8d
+ add edx,r13d
+ xor edi,r9d
+ ror r14d,2
+ add r11d,edx
+ add edx,edi
+ mov r13d,r11d
+ add r14d,edx
+ ror r13d,14
+ mov edx,r14d
+ mov r12d,eax
+ ror r14d,9
+ xor r13d,r11d
+ xor r12d,ebx
+ ror r13d,5
+ xor r14d,edx
+ and r12d,r11d
+ xor r13d,r11d
+ add ecx,DWORD[20+rsp]
+ mov edi,edx
+ xor r12d,ebx
+ ror r14d,11
+ xor edi,r8d
+ add ecx,r12d
+ ror r13d,6
+ and r15d,edi
+ xor r14d,edx
+ add ecx,r13d
+ xor r15d,r8d
+ ror r14d,2
+ add r10d,ecx
+ add ecx,r15d
+ mov r13d,r10d
+ add r14d,ecx
+ ror r13d,14
+ mov ecx,r14d
+ mov r12d,r11d
+ ror r14d,9
+ xor r13d,r10d
+ xor r12d,eax
+ ror r13d,5
+ xor r14d,ecx
+ and r12d,r10d
+ xor r13d,r10d
+ add ebx,DWORD[24+rsp]
+ mov r15d,ecx
+ xor r12d,eax
+ ror r14d,11
+ xor r15d,edx
+ add ebx,r12d
+ ror r13d,6
+ and edi,r15d
+ xor r14d,ecx
+ add ebx,r13d
+ xor edi,edx
+ ror r14d,2
+ add r9d,ebx
+ add ebx,edi
+ mov r13d,r9d
+ add r14d,ebx
+ ror r13d,14
+ mov ebx,r14d
+ mov r12d,r10d
+ ror r14d,9
+ xor r13d,r9d
+ xor r12d,r11d
+ ror r13d,5
+ xor r14d,ebx
+ and r12d,r9d
+ xor r13d,r9d
+ add eax,DWORD[28+rsp]
+ mov edi,ebx
+ xor r12d,r11d
+ ror r14d,11
+ xor edi,ecx
+ add eax,r12d
+ ror r13d,6
+ and r15d,edi
+ xor r14d,ebx
+ add eax,r13d
+ xor r15d,ecx
+ ror r14d,2
+ add r8d,eax
+ add eax,r15d
+ mov r13d,r8d
+ add r14d,eax
+ ror r13d,14
+ mov eax,r14d
+ mov r12d,r9d
+ ror r14d,9
+ xor r13d,r8d
+ xor r12d,r10d
+ ror r13d,5
+ xor r14d,eax
+ and r12d,r8d
+ xor r13d,r8d
+ add r11d,DWORD[32+rsp]
+ mov r15d,eax
+ xor r12d,r10d
+ ror r14d,11
+ xor r15d,ebx
+ add r11d,r12d
+ ror r13d,6
+ and edi,r15d
+ xor r14d,eax
+ add r11d,r13d
+ xor edi,ebx
+ ror r14d,2
+ add edx,r11d
+ add r11d,edi
+ mov r13d,edx
+ add r14d,r11d
+ ror r13d,14
+ mov r11d,r14d
+ mov r12d,r8d
+ ror r14d,9
+ xor r13d,edx
+ xor r12d,r9d
+ ror r13d,5
+ xor r14d,r11d
+ and r12d,edx
+ xor r13d,edx
+ add r10d,DWORD[36+rsp]
+ mov edi,r11d
+ xor r12d,r9d
+ ror r14d,11
+ xor edi,eax
+ add r10d,r12d
+ ror r13d,6
+ and r15d,edi
+ xor r14d,r11d
+ add r10d,r13d
+ xor r15d,eax
+ ror r14d,2
+ add ecx,r10d
+ add r10d,r15d
+ mov r13d,ecx
+ add r14d,r10d
+ ror r13d,14
+ mov r10d,r14d
+ mov r12d,edx
+ ror r14d,9
+ xor r13d,ecx
+ xor r12d,r8d
+ ror r13d,5
+ xor r14d,r10d
+ and r12d,ecx
+ xor r13d,ecx
+ add r9d,DWORD[40+rsp]
+ mov r15d,r10d
+ xor r12d,r8d
+ ror r14d,11
+ xor r15d,r11d
+ add r9d,r12d
+ ror r13d,6
+ and edi,r15d
+ xor r14d,r10d
+ add r9d,r13d
+ xor edi,r11d
+ ror r14d,2
+ add ebx,r9d
+ add r9d,edi
+ mov r13d,ebx
+ add r14d,r9d
+ ror r13d,14
+ mov r9d,r14d
+ mov r12d,ecx
+ ror r14d,9
+ xor r13d,ebx
+ xor r12d,edx
+ ror r13d,5
+ xor r14d,r9d
+ and r12d,ebx
+ xor r13d,ebx
+ add r8d,DWORD[44+rsp]
+ mov edi,r9d
+ xor r12d,edx
+ ror r14d,11
+ xor edi,r10d
+ add r8d,r12d
+ ror r13d,6
+ and r15d,edi
+ xor r14d,r9d
+ add r8d,r13d
+ xor r15d,r10d
+ ror r14d,2
+ add eax,r8d
+ add r8d,r15d
+ mov r13d,eax
+ add r14d,r8d
+ ror r13d,14
+ mov r8d,r14d
+ mov r12d,ebx
+ ror r14d,9
+ xor r13d,eax
+ xor r12d,ecx
+ ror r13d,5
+ xor r14d,r8d
+ and r12d,eax
+ xor r13d,eax
+ add edx,DWORD[48+rsp]
+ mov r15d,r8d
+ xor r12d,ecx
+ ror r14d,11
+ xor r15d,r9d
+ add edx,r12d
+ ror r13d,6
+ and edi,r15d
+ xor r14d,r8d
+ add edx,r13d
+ xor edi,r9d
+ ror r14d,2
+ add r11d,edx
+ add edx,edi
+ mov r13d,r11d
+ add r14d,edx
+ ror r13d,14
+ mov edx,r14d
+ mov r12d,eax
+ ror r14d,9
+ xor r13d,r11d
+ xor r12d,ebx
+ ror r13d,5
+ xor r14d,edx
+ and r12d,r11d
+ xor r13d,r11d
+ add ecx,DWORD[52+rsp]
+ mov edi,edx
+ xor r12d,ebx
+ ror r14d,11
+ xor edi,r8d
+ add ecx,r12d
+ ror r13d,6
+ and r15d,edi
+ xor r14d,edx
+ add ecx,r13d
+ xor r15d,r8d
+ ror r14d,2
+ add r10d,ecx
+ add ecx,r15d
+ mov r13d,r10d
+ add r14d,ecx
+ ror r13d,14
+ mov ecx,r14d
+ mov r12d,r11d
+ ror r14d,9
+ xor r13d,r10d
+ xor r12d,eax
+ ror r13d,5
+ xor r14d,ecx
+ and r12d,r10d
+ xor r13d,r10d
+ add ebx,DWORD[56+rsp]
+ mov r15d,ecx
+ xor r12d,eax
+ ror r14d,11
+ xor r15d,edx
+ add ebx,r12d
+ ror r13d,6
+ and edi,r15d
+ xor r14d,ecx
+ add ebx,r13d
+ xor edi,edx
+ ror r14d,2
+ add r9d,ebx
+ add ebx,edi
+ mov r13d,r9d
+ add r14d,ebx
+ ror r13d,14
+ mov ebx,r14d
+ mov r12d,r10d
+ ror r14d,9
+ xor r13d,r9d
+ xor r12d,r11d
+ ror r13d,5
+ xor r14d,ebx
+ and r12d,r9d
+ xor r13d,r9d
+ add eax,DWORD[60+rsp]
+ mov edi,ebx
+ xor r12d,r11d
+ ror r14d,11
+ xor edi,ecx
+ add eax,r12d
+ ror r13d,6
+ and r15d,edi
+ xor r14d,ebx
+ add eax,r13d
+ xor r15d,ecx
+ ror r14d,2
+ add r8d,eax
+ add eax,r15d
+ mov r13d,r8d
+ add r14d,eax
+ mov rdi,QWORD[((64+0))+rsp]
+ mov eax,r14d
+
+ add eax,DWORD[rdi]
+ lea rsi,[64+rsi]
+ add ebx,DWORD[4+rdi]
+ add ecx,DWORD[8+rdi]
+ add edx,DWORD[12+rdi]
+ add r8d,DWORD[16+rdi]
+ add r9d,DWORD[20+rdi]
+ add r10d,DWORD[24+rdi]
+ add r11d,DWORD[28+rdi]
+
+ cmp rsi,QWORD[((64+16))+rsp]
+
+ mov DWORD[rdi],eax
+ mov DWORD[4+rdi],ebx
+ mov DWORD[8+rdi],ecx
+ mov DWORD[12+rdi],edx
+ mov DWORD[16+rdi],r8d
+ mov DWORD[20+rdi],r9d
+ mov DWORD[24+rdi],r10d
+ mov DWORD[28+rdi],r11d
+ jb NEAR $L$loop_ssse3
+
+ mov rsi,QWORD[88+rsp]
+
+ movaps xmm6,XMMWORD[((64+32))+rsp]
+ movaps xmm7,XMMWORD[((64+48))+rsp]
+ movaps xmm8,XMMWORD[((64+64))+rsp]
+ movaps xmm9,XMMWORD[((64+80))+rsp]
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$epilogue_ssse3:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_sha256_block_data_order_ssse3:
+global sha256_block_data_order_avx
+
+ALIGN 64
+sha256_block_data_order_avx:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha256_block_data_order_avx:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ mov rax,rsp
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ shl rdx,4
+ sub rsp,160
+ lea rdx,[rdx*4+rsi]
+ and rsp,-64
+ mov QWORD[((64+0))+rsp],rdi
+ mov QWORD[((64+8))+rsp],rsi
+ mov QWORD[((64+16))+rsp],rdx
+ mov QWORD[88+rsp],rax
+
+ movaps XMMWORD[(64+32)+rsp],xmm6
+ movaps XMMWORD[(64+48)+rsp],xmm7
+ movaps XMMWORD[(64+64)+rsp],xmm8
+ movaps XMMWORD[(64+80)+rsp],xmm9
+$L$prologue_avx:
+
+ vzeroupper
+ mov eax,DWORD[rdi]
+ mov ebx,DWORD[4+rdi]
+ mov ecx,DWORD[8+rdi]
+ mov edx,DWORD[12+rdi]
+ mov r8d,DWORD[16+rdi]
+ mov r9d,DWORD[20+rdi]
+ mov r10d,DWORD[24+rdi]
+ mov r11d,DWORD[28+rdi]
+ vmovdqa xmm8,XMMWORD[((K256+512+32))]
+ vmovdqa xmm9,XMMWORD[((K256+512+64))]
+ jmp NEAR $L$loop_avx
+ALIGN 16
+$L$loop_avx:
+ vmovdqa xmm7,XMMWORD[((K256+512))]
+ vmovdqu xmm0,XMMWORD[rsi]
+ vmovdqu xmm1,XMMWORD[16+rsi]
+ vmovdqu xmm2,XMMWORD[32+rsi]
+ vmovdqu xmm3,XMMWORD[48+rsi]
+ vpshufb xmm0,xmm0,xmm7
+ lea rbp,[K256]
+ vpshufb xmm1,xmm1,xmm7
+ vpshufb xmm2,xmm2,xmm7
+ vpaddd xmm4,xmm0,XMMWORD[rbp]
+ vpshufb xmm3,xmm3,xmm7
+ vpaddd xmm5,xmm1,XMMWORD[32+rbp]
+ vpaddd xmm6,xmm2,XMMWORD[64+rbp]
+ vpaddd xmm7,xmm3,XMMWORD[96+rbp]
+ vmovdqa XMMWORD[rsp],xmm4
+ mov r14d,eax
+ vmovdqa XMMWORD[16+rsp],xmm5
+ mov edi,ebx
+ vmovdqa XMMWORD[32+rsp],xmm6
+ xor edi,ecx
+ vmovdqa XMMWORD[48+rsp],xmm7
+ mov r13d,r8d
+ jmp NEAR $L$avx_00_47
+
+ALIGN 16
+$L$avx_00_47:
+ sub rbp,-128
+ vpalignr xmm4,xmm1,xmm0,4
+ shrd r13d,r13d,14
+ mov eax,r14d
+ mov r12d,r9d
+ vpalignr xmm7,xmm3,xmm2,4
+ shrd r14d,r14d,9
+ xor r13d,r8d
+ xor r12d,r10d
+ vpsrld xmm6,xmm4,7
+ shrd r13d,r13d,5
+ xor r14d,eax
+ and r12d,r8d
+ vpaddd xmm0,xmm0,xmm7
+ xor r13d,r8d
+ add r11d,DWORD[rsp]
+ mov r15d,eax
+ vpsrld xmm7,xmm4,3
+ xor r12d,r10d
+ shrd r14d,r14d,11
+ xor r15d,ebx
+ vpslld xmm5,xmm4,14
+ add r11d,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ vpxor xmm4,xmm7,xmm6
+ xor r14d,eax
+ add r11d,r13d
+ xor edi,ebx
+ vpshufd xmm7,xmm3,250
+ shrd r14d,r14d,2
+ add edx,r11d
+ add r11d,edi
+ vpsrld xmm6,xmm6,11
+ mov r13d,edx
+ add r14d,r11d
+ shrd r13d,r13d,14
+ vpxor xmm4,xmm4,xmm5
+ mov r11d,r14d
+ mov r12d,r8d
+ shrd r14d,r14d,9
+ vpslld xmm5,xmm5,11
+ xor r13d,edx
+ xor r12d,r9d
+ shrd r13d,r13d,5
+ vpxor xmm4,xmm4,xmm6
+ xor r14d,r11d
+ and r12d,edx
+ xor r13d,edx
+ vpsrld xmm6,xmm7,10
+ add r10d,DWORD[4+rsp]
+ mov edi,r11d
+ xor r12d,r9d
+ vpxor xmm4,xmm4,xmm5
+ shrd r14d,r14d,11
+ xor edi,eax
+ add r10d,r12d
+ vpsrlq xmm7,xmm7,17
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r11d
+ vpaddd xmm0,xmm0,xmm4
+ add r10d,r13d
+ xor r15d,eax
+ shrd r14d,r14d,2
+ vpxor xmm6,xmm6,xmm7
+ add ecx,r10d
+ add r10d,r15d
+ mov r13d,ecx
+ vpsrlq xmm7,xmm7,2
+ add r14d,r10d
+ shrd r13d,r13d,14
+ mov r10d,r14d
+ vpxor xmm6,xmm6,xmm7
+ mov r12d,edx
+ shrd r14d,r14d,9
+ xor r13d,ecx
+ vpshufb xmm6,xmm6,xmm8
+ xor r12d,r8d
+ shrd r13d,r13d,5
+ xor r14d,r10d
+ vpaddd xmm0,xmm0,xmm6
+ and r12d,ecx
+ xor r13d,ecx
+ add r9d,DWORD[8+rsp]
+ vpshufd xmm7,xmm0,80
+ mov r15d,r10d
+ xor r12d,r8d
+ shrd r14d,r14d,11
+ vpsrld xmm6,xmm7,10
+ xor r15d,r11d
+ add r9d,r12d
+ shrd r13d,r13d,6
+ vpsrlq xmm7,xmm7,17
+ and edi,r15d
+ xor r14d,r10d
+ add r9d,r13d
+ vpxor xmm6,xmm6,xmm7
+ xor edi,r11d
+ shrd r14d,r14d,2
+ add ebx,r9d
+ vpsrlq xmm7,xmm7,2
+ add r9d,edi
+ mov r13d,ebx
+ add r14d,r9d
+ vpxor xmm6,xmm6,xmm7
+ shrd r13d,r13d,14
+ mov r9d,r14d
+ mov r12d,ecx
+ vpshufb xmm6,xmm6,xmm9
+ shrd r14d,r14d,9
+ xor r13d,ebx
+ xor r12d,edx
+ vpaddd xmm0,xmm0,xmm6
+ shrd r13d,r13d,5
+ xor r14d,r9d
+ and r12d,ebx
+ vpaddd xmm6,xmm0,XMMWORD[rbp]
+ xor r13d,ebx
+ add r8d,DWORD[12+rsp]
+ mov edi,r9d
+ xor r12d,edx
+ shrd r14d,r14d,11
+ xor edi,r10d
+ add r8d,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r9d
+ add r8d,r13d
+ xor r15d,r10d
+ shrd r14d,r14d,2
+ add eax,r8d
+ add r8d,r15d
+ mov r13d,eax
+ add r14d,r8d
+ vmovdqa XMMWORD[rsp],xmm6
+ vpalignr xmm4,xmm2,xmm1,4
+ shrd r13d,r13d,14
+ mov r8d,r14d
+ mov r12d,ebx
+ vpalignr xmm7,xmm0,xmm3,4
+ shrd r14d,r14d,9
+ xor r13d,eax
+ xor r12d,ecx
+ vpsrld xmm6,xmm4,7
+ shrd r13d,r13d,5
+ xor r14d,r8d
+ and r12d,eax
+ vpaddd xmm1,xmm1,xmm7
+ xor r13d,eax
+ add edx,DWORD[16+rsp]
+ mov r15d,r8d
+ vpsrld xmm7,xmm4,3
+ xor r12d,ecx
+ shrd r14d,r14d,11
+ xor r15d,r9d
+ vpslld xmm5,xmm4,14
+ add edx,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ vpxor xmm4,xmm7,xmm6
+ xor r14d,r8d
+ add edx,r13d
+ xor edi,r9d
+ vpshufd xmm7,xmm0,250
+ shrd r14d,r14d,2
+ add r11d,edx
+ add edx,edi
+ vpsrld xmm6,xmm6,11
+ mov r13d,r11d
+ add r14d,edx
+ shrd r13d,r13d,14
+ vpxor xmm4,xmm4,xmm5
+ mov edx,r14d
+ mov r12d,eax
+ shrd r14d,r14d,9
+ vpslld xmm5,xmm5,11
+ xor r13d,r11d
+ xor r12d,ebx
+ shrd r13d,r13d,5
+ vpxor xmm4,xmm4,xmm6
+ xor r14d,edx
+ and r12d,r11d
+ xor r13d,r11d
+ vpsrld xmm6,xmm7,10
+ add ecx,DWORD[20+rsp]
+ mov edi,edx
+ xor r12d,ebx
+ vpxor xmm4,xmm4,xmm5
+ shrd r14d,r14d,11
+ xor edi,r8d
+ add ecx,r12d
+ vpsrlq xmm7,xmm7,17
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,edx
+ vpaddd xmm1,xmm1,xmm4
+ add ecx,r13d
+ xor r15d,r8d
+ shrd r14d,r14d,2
+ vpxor xmm6,xmm6,xmm7
+ add r10d,ecx
+ add ecx,r15d
+ mov r13d,r10d
+ vpsrlq xmm7,xmm7,2
+ add r14d,ecx
+ shrd r13d,r13d,14
+ mov ecx,r14d
+ vpxor xmm6,xmm6,xmm7
+ mov r12d,r11d
+ shrd r14d,r14d,9
+ xor r13d,r10d
+ vpshufb xmm6,xmm6,xmm8
+ xor r12d,eax
+ shrd r13d,r13d,5
+ xor r14d,ecx
+ vpaddd xmm1,xmm1,xmm6
+ and r12d,r10d
+ xor r13d,r10d
+ add ebx,DWORD[24+rsp]
+ vpshufd xmm7,xmm1,80
+ mov r15d,ecx
+ xor r12d,eax
+ shrd r14d,r14d,11
+ vpsrld xmm6,xmm7,10
+ xor r15d,edx
+ add ebx,r12d
+ shrd r13d,r13d,6
+ vpsrlq xmm7,xmm7,17
+ and edi,r15d
+ xor r14d,ecx
+ add ebx,r13d
+ vpxor xmm6,xmm6,xmm7
+ xor edi,edx
+ shrd r14d,r14d,2
+ add r9d,ebx
+ vpsrlq xmm7,xmm7,2
+ add ebx,edi
+ mov r13d,r9d
+ add r14d,ebx
+ vpxor xmm6,xmm6,xmm7
+ shrd r13d,r13d,14
+ mov ebx,r14d
+ mov r12d,r10d
+ vpshufb xmm6,xmm6,xmm9
+ shrd r14d,r14d,9
+ xor r13d,r9d
+ xor r12d,r11d
+ vpaddd xmm1,xmm1,xmm6
+ shrd r13d,r13d,5
+ xor r14d,ebx
+ and r12d,r9d
+ vpaddd xmm6,xmm1,XMMWORD[32+rbp]
+ xor r13d,r9d
+ add eax,DWORD[28+rsp]
+ mov edi,ebx
+ xor r12d,r11d
+ shrd r14d,r14d,11
+ xor edi,ecx
+ add eax,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,ebx
+ add eax,r13d
+ xor r15d,ecx
+ shrd r14d,r14d,2
+ add r8d,eax
+ add eax,r15d
+ mov r13d,r8d
+ add r14d,eax
+ vmovdqa XMMWORD[16+rsp],xmm6
+ vpalignr xmm4,xmm3,xmm2,4
+ shrd r13d,r13d,14
+ mov eax,r14d
+ mov r12d,r9d
+ vpalignr xmm7,xmm1,xmm0,4
+ shrd r14d,r14d,9
+ xor r13d,r8d
+ xor r12d,r10d
+ vpsrld xmm6,xmm4,7
+ shrd r13d,r13d,5
+ xor r14d,eax
+ and r12d,r8d
+ vpaddd xmm2,xmm2,xmm7
+ xor r13d,r8d
+ add r11d,DWORD[32+rsp]
+ mov r15d,eax
+ vpsrld xmm7,xmm4,3
+ xor r12d,r10d
+ shrd r14d,r14d,11
+ xor r15d,ebx
+ vpslld xmm5,xmm4,14
+ add r11d,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ vpxor xmm4,xmm7,xmm6
+ xor r14d,eax
+ add r11d,r13d
+ xor edi,ebx
+ vpshufd xmm7,xmm1,250
+ shrd r14d,r14d,2
+ add edx,r11d
+ add r11d,edi
+ vpsrld xmm6,xmm6,11
+ mov r13d,edx
+ add r14d,r11d
+ shrd r13d,r13d,14
+ vpxor xmm4,xmm4,xmm5
+ mov r11d,r14d
+ mov r12d,r8d
+ shrd r14d,r14d,9
+ vpslld xmm5,xmm5,11
+ xor r13d,edx
+ xor r12d,r9d
+ shrd r13d,r13d,5
+ vpxor xmm4,xmm4,xmm6
+ xor r14d,r11d
+ and r12d,edx
+ xor r13d,edx
+ vpsrld xmm6,xmm7,10
+ add r10d,DWORD[36+rsp]
+ mov edi,r11d
+ xor r12d,r9d
+ vpxor xmm4,xmm4,xmm5
+ shrd r14d,r14d,11
+ xor edi,eax
+ add r10d,r12d
+ vpsrlq xmm7,xmm7,17
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r11d
+ vpaddd xmm2,xmm2,xmm4
+ add r10d,r13d
+ xor r15d,eax
+ shrd r14d,r14d,2
+ vpxor xmm6,xmm6,xmm7
+ add ecx,r10d
+ add r10d,r15d
+ mov r13d,ecx
+ vpsrlq xmm7,xmm7,2
+ add r14d,r10d
+ shrd r13d,r13d,14
+ mov r10d,r14d
+ vpxor xmm6,xmm6,xmm7
+ mov r12d,edx
+ shrd r14d,r14d,9
+ xor r13d,ecx
+ vpshufb xmm6,xmm6,xmm8
+ xor r12d,r8d
+ shrd r13d,r13d,5
+ xor r14d,r10d
+ vpaddd xmm2,xmm2,xmm6
+ and r12d,ecx
+ xor r13d,ecx
+ add r9d,DWORD[40+rsp]
+ vpshufd xmm7,xmm2,80
+ mov r15d,r10d
+ xor r12d,r8d
+ shrd r14d,r14d,11
+ vpsrld xmm6,xmm7,10
+ xor r15d,r11d
+ add r9d,r12d
+ shrd r13d,r13d,6
+ vpsrlq xmm7,xmm7,17
+ and edi,r15d
+ xor r14d,r10d
+ add r9d,r13d
+ vpxor xmm6,xmm6,xmm7
+ xor edi,r11d
+ shrd r14d,r14d,2
+ add ebx,r9d
+ vpsrlq xmm7,xmm7,2
+ add r9d,edi
+ mov r13d,ebx
+ add r14d,r9d
+ vpxor xmm6,xmm6,xmm7
+ shrd r13d,r13d,14
+ mov r9d,r14d
+ mov r12d,ecx
+ vpshufb xmm6,xmm6,xmm9
+ shrd r14d,r14d,9
+ xor r13d,ebx
+ xor r12d,edx
+ vpaddd xmm2,xmm2,xmm6
+ shrd r13d,r13d,5
+ xor r14d,r9d
+ and r12d,ebx
+ vpaddd xmm6,xmm2,XMMWORD[64+rbp]
+ xor r13d,ebx
+ add r8d,DWORD[44+rsp]
+ mov edi,r9d
+ xor r12d,edx
+ shrd r14d,r14d,11
+ xor edi,r10d
+ add r8d,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r9d
+ add r8d,r13d
+ xor r15d,r10d
+ shrd r14d,r14d,2
+ add eax,r8d
+ add r8d,r15d
+ mov r13d,eax
+ add r14d,r8d
+ vmovdqa XMMWORD[32+rsp],xmm6
+ vpalignr xmm4,xmm0,xmm3,4
+ shrd r13d,r13d,14
+ mov r8d,r14d
+ mov r12d,ebx
+ vpalignr xmm7,xmm2,xmm1,4
+ shrd r14d,r14d,9
+ xor r13d,eax
+ xor r12d,ecx
+ vpsrld xmm6,xmm4,7
+ shrd r13d,r13d,5
+ xor r14d,r8d
+ and r12d,eax
+ vpaddd xmm3,xmm3,xmm7
+ xor r13d,eax
+ add edx,DWORD[48+rsp]
+ mov r15d,r8d
+ vpsrld xmm7,xmm4,3
+ xor r12d,ecx
+ shrd r14d,r14d,11
+ xor r15d,r9d
+ vpslld xmm5,xmm4,14
+ add edx,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ vpxor xmm4,xmm7,xmm6
+ xor r14d,r8d
+ add edx,r13d
+ xor edi,r9d
+ vpshufd xmm7,xmm2,250
+ shrd r14d,r14d,2
+ add r11d,edx
+ add edx,edi
+ vpsrld xmm6,xmm6,11
+ mov r13d,r11d
+ add r14d,edx
+ shrd r13d,r13d,14
+ vpxor xmm4,xmm4,xmm5
+ mov edx,r14d
+ mov r12d,eax
+ shrd r14d,r14d,9
+ vpslld xmm5,xmm5,11
+ xor r13d,r11d
+ xor r12d,ebx
+ shrd r13d,r13d,5
+ vpxor xmm4,xmm4,xmm6
+ xor r14d,edx
+ and r12d,r11d
+ xor r13d,r11d
+ vpsrld xmm6,xmm7,10
+ add ecx,DWORD[52+rsp]
+ mov edi,edx
+ xor r12d,ebx
+ vpxor xmm4,xmm4,xmm5
+ shrd r14d,r14d,11
+ xor edi,r8d
+ add ecx,r12d
+ vpsrlq xmm7,xmm7,17
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,edx
+ vpaddd xmm3,xmm3,xmm4
+ add ecx,r13d
+ xor r15d,r8d
+ shrd r14d,r14d,2
+ vpxor xmm6,xmm6,xmm7
+ add r10d,ecx
+ add ecx,r15d
+ mov r13d,r10d
+ vpsrlq xmm7,xmm7,2
+ add r14d,ecx
+ shrd r13d,r13d,14
+ mov ecx,r14d
+ vpxor xmm6,xmm6,xmm7
+ mov r12d,r11d
+ shrd r14d,r14d,9
+ xor r13d,r10d
+ vpshufb xmm6,xmm6,xmm8
+ xor r12d,eax
+ shrd r13d,r13d,5
+ xor r14d,ecx
+ vpaddd xmm3,xmm3,xmm6
+ and r12d,r10d
+ xor r13d,r10d
+ add ebx,DWORD[56+rsp]
+ vpshufd xmm7,xmm3,80
+ mov r15d,ecx
+ xor r12d,eax
+ shrd r14d,r14d,11
+ vpsrld xmm6,xmm7,10
+ xor r15d,edx
+ add ebx,r12d
+ shrd r13d,r13d,6
+ vpsrlq xmm7,xmm7,17
+ and edi,r15d
+ xor r14d,ecx
+ add ebx,r13d
+ vpxor xmm6,xmm6,xmm7
+ xor edi,edx
+ shrd r14d,r14d,2
+ add r9d,ebx
+ vpsrlq xmm7,xmm7,2
+ add ebx,edi
+ mov r13d,r9d
+ add r14d,ebx
+ vpxor xmm6,xmm6,xmm7
+ shrd r13d,r13d,14
+ mov ebx,r14d
+ mov r12d,r10d
+ vpshufb xmm6,xmm6,xmm9
+ shrd r14d,r14d,9
+ xor r13d,r9d
+ xor r12d,r11d
+ vpaddd xmm3,xmm3,xmm6
+ shrd r13d,r13d,5
+ xor r14d,ebx
+ and r12d,r9d
+ vpaddd xmm6,xmm3,XMMWORD[96+rbp]
+ xor r13d,r9d
+ add eax,DWORD[60+rsp]
+ mov edi,ebx
+ xor r12d,r11d
+ shrd r14d,r14d,11
+ xor edi,ecx
+ add eax,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,ebx
+ add eax,r13d
+ xor r15d,ecx
+ shrd r14d,r14d,2
+ add r8d,eax
+ add eax,r15d
+ mov r13d,r8d
+ add r14d,eax
+ vmovdqa XMMWORD[48+rsp],xmm6
+ cmp BYTE[131+rbp],0
+ jne NEAR $L$avx_00_47
+ shrd r13d,r13d,14
+ mov eax,r14d
+ mov r12d,r9d
+ shrd r14d,r14d,9
+ xor r13d,r8d
+ xor r12d,r10d
+ shrd r13d,r13d,5
+ xor r14d,eax
+ and r12d,r8d
+ xor r13d,r8d
+ add r11d,DWORD[rsp]
+ mov r15d,eax
+ xor r12d,r10d
+ shrd r14d,r14d,11
+ xor r15d,ebx
+ add r11d,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,eax
+ add r11d,r13d
+ xor edi,ebx
+ shrd r14d,r14d,2
+ add edx,r11d
+ add r11d,edi
+ mov r13d,edx
+ add r14d,r11d
+ shrd r13d,r13d,14
+ mov r11d,r14d
+ mov r12d,r8d
+ shrd r14d,r14d,9
+ xor r13d,edx
+ xor r12d,r9d
+ shrd r13d,r13d,5
+ xor r14d,r11d
+ and r12d,edx
+ xor r13d,edx
+ add r10d,DWORD[4+rsp]
+ mov edi,r11d
+ xor r12d,r9d
+ shrd r14d,r14d,11
+ xor edi,eax
+ add r10d,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r11d
+ add r10d,r13d
+ xor r15d,eax
+ shrd r14d,r14d,2
+ add ecx,r10d
+ add r10d,r15d
+ mov r13d,ecx
+ add r14d,r10d
+ shrd r13d,r13d,14
+ mov r10d,r14d
+ mov r12d,edx
+ shrd r14d,r14d,9
+ xor r13d,ecx
+ xor r12d,r8d
+ shrd r13d,r13d,5
+ xor r14d,r10d
+ and r12d,ecx
+ xor r13d,ecx
+ add r9d,DWORD[8+rsp]
+ mov r15d,r10d
+ xor r12d,r8d
+ shrd r14d,r14d,11
+ xor r15d,r11d
+ add r9d,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,r10d
+ add r9d,r13d
+ xor edi,r11d
+ shrd r14d,r14d,2
+ add ebx,r9d
+ add r9d,edi
+ mov r13d,ebx
+ add r14d,r9d
+ shrd r13d,r13d,14
+ mov r9d,r14d
+ mov r12d,ecx
+ shrd r14d,r14d,9
+ xor r13d,ebx
+ xor r12d,edx
+ shrd r13d,r13d,5
+ xor r14d,r9d
+ and r12d,ebx
+ xor r13d,ebx
+ add r8d,DWORD[12+rsp]
+ mov edi,r9d
+ xor r12d,edx
+ shrd r14d,r14d,11
+ xor edi,r10d
+ add r8d,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r9d
+ add r8d,r13d
+ xor r15d,r10d
+ shrd r14d,r14d,2
+ add eax,r8d
+ add r8d,r15d
+ mov r13d,eax
+ add r14d,r8d
+ shrd r13d,r13d,14
+ mov r8d,r14d
+ mov r12d,ebx
+ shrd r14d,r14d,9
+ xor r13d,eax
+ xor r12d,ecx
+ shrd r13d,r13d,5
+ xor r14d,r8d
+ and r12d,eax
+ xor r13d,eax
+ add edx,DWORD[16+rsp]
+ mov r15d,r8d
+ xor r12d,ecx
+ shrd r14d,r14d,11
+ xor r15d,r9d
+ add edx,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,r8d
+ add edx,r13d
+ xor edi,r9d
+ shrd r14d,r14d,2
+ add r11d,edx
+ add edx,edi
+ mov r13d,r11d
+ add r14d,edx
+ shrd r13d,r13d,14
+ mov edx,r14d
+ mov r12d,eax
+ shrd r14d,r14d,9
+ xor r13d,r11d
+ xor r12d,ebx
+ shrd r13d,r13d,5
+ xor r14d,edx
+ and r12d,r11d
+ xor r13d,r11d
+ add ecx,DWORD[20+rsp]
+ mov edi,edx
+ xor r12d,ebx
+ shrd r14d,r14d,11
+ xor edi,r8d
+ add ecx,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,edx
+ add ecx,r13d
+ xor r15d,r8d
+ shrd r14d,r14d,2
+ add r10d,ecx
+ add ecx,r15d
+ mov r13d,r10d
+ add r14d,ecx
+ shrd r13d,r13d,14
+ mov ecx,r14d
+ mov r12d,r11d
+ shrd r14d,r14d,9
+ xor r13d,r10d
+ xor r12d,eax
+ shrd r13d,r13d,5
+ xor r14d,ecx
+ and r12d,r10d
+ xor r13d,r10d
+ add ebx,DWORD[24+rsp]
+ mov r15d,ecx
+ xor r12d,eax
+ shrd r14d,r14d,11
+ xor r15d,edx
+ add ebx,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,ecx
+ add ebx,r13d
+ xor edi,edx
+ shrd r14d,r14d,2
+ add r9d,ebx
+ add ebx,edi
+ mov r13d,r9d
+ add r14d,ebx
+ shrd r13d,r13d,14
+ mov ebx,r14d
+ mov r12d,r10d
+ shrd r14d,r14d,9
+ xor r13d,r9d
+ xor r12d,r11d
+ shrd r13d,r13d,5
+ xor r14d,ebx
+ and r12d,r9d
+ xor r13d,r9d
+ add eax,DWORD[28+rsp]
+ mov edi,ebx
+ xor r12d,r11d
+ shrd r14d,r14d,11
+ xor edi,ecx
+ add eax,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,ebx
+ add eax,r13d
+ xor r15d,ecx
+ shrd r14d,r14d,2
+ add r8d,eax
+ add eax,r15d
+ mov r13d,r8d
+ add r14d,eax
+ shrd r13d,r13d,14
+ mov eax,r14d
+ mov r12d,r9d
+ shrd r14d,r14d,9
+ xor r13d,r8d
+ xor r12d,r10d
+ shrd r13d,r13d,5
+ xor r14d,eax
+ and r12d,r8d
+ xor r13d,r8d
+ add r11d,DWORD[32+rsp]
+ mov r15d,eax
+ xor r12d,r10d
+ shrd r14d,r14d,11
+ xor r15d,ebx
+ add r11d,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,eax
+ add r11d,r13d
+ xor edi,ebx
+ shrd r14d,r14d,2
+ add edx,r11d
+ add r11d,edi
+ mov r13d,edx
+ add r14d,r11d
+ shrd r13d,r13d,14
+ mov r11d,r14d
+ mov r12d,r8d
+ shrd r14d,r14d,9
+ xor r13d,edx
+ xor r12d,r9d
+ shrd r13d,r13d,5
+ xor r14d,r11d
+ and r12d,edx
+ xor r13d,edx
+ add r10d,DWORD[36+rsp]
+ mov edi,r11d
+ xor r12d,r9d
+ shrd r14d,r14d,11
+ xor edi,eax
+ add r10d,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r11d
+ add r10d,r13d
+ xor r15d,eax
+ shrd r14d,r14d,2
+ add ecx,r10d
+ add r10d,r15d
+ mov r13d,ecx
+ add r14d,r10d
+ shrd r13d,r13d,14
+ mov r10d,r14d
+ mov r12d,edx
+ shrd r14d,r14d,9
+ xor r13d,ecx
+ xor r12d,r8d
+ shrd r13d,r13d,5
+ xor r14d,r10d
+ and r12d,ecx
+ xor r13d,ecx
+ add r9d,DWORD[40+rsp]
+ mov r15d,r10d
+ xor r12d,r8d
+ shrd r14d,r14d,11
+ xor r15d,r11d
+ add r9d,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,r10d
+ add r9d,r13d
+ xor edi,r11d
+ shrd r14d,r14d,2
+ add ebx,r9d
+ add r9d,edi
+ mov r13d,ebx
+ add r14d,r9d
+ shrd r13d,r13d,14
+ mov r9d,r14d
+ mov r12d,ecx
+ shrd r14d,r14d,9
+ xor r13d,ebx
+ xor r12d,edx
+ shrd r13d,r13d,5
+ xor r14d,r9d
+ and r12d,ebx
+ xor r13d,ebx
+ add r8d,DWORD[44+rsp]
+ mov edi,r9d
+ xor r12d,edx
+ shrd r14d,r14d,11
+ xor edi,r10d
+ add r8d,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r9d
+ add r8d,r13d
+ xor r15d,r10d
+ shrd r14d,r14d,2
+ add eax,r8d
+ add r8d,r15d
+ mov r13d,eax
+ add r14d,r8d
+ shrd r13d,r13d,14
+ mov r8d,r14d
+ mov r12d,ebx
+ shrd r14d,r14d,9
+ xor r13d,eax
+ xor r12d,ecx
+ shrd r13d,r13d,5
+ xor r14d,r8d
+ and r12d,eax
+ xor r13d,eax
+ add edx,DWORD[48+rsp]
+ mov r15d,r8d
+ xor r12d,ecx
+ shrd r14d,r14d,11
+ xor r15d,r9d
+ add edx,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,r8d
+ add edx,r13d
+ xor edi,r9d
+ shrd r14d,r14d,2
+ add r11d,edx
+ add edx,edi
+ mov r13d,r11d
+ add r14d,edx
+ shrd r13d,r13d,14
+ mov edx,r14d
+ mov r12d,eax
+ shrd r14d,r14d,9
+ xor r13d,r11d
+ xor r12d,ebx
+ shrd r13d,r13d,5
+ xor r14d,edx
+ and r12d,r11d
+ xor r13d,r11d
+ add ecx,DWORD[52+rsp]
+ mov edi,edx
+ xor r12d,ebx
+ shrd r14d,r14d,11
+ xor edi,r8d
+ add ecx,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,edx
+ add ecx,r13d
+ xor r15d,r8d
+ shrd r14d,r14d,2
+ add r10d,ecx
+ add ecx,r15d
+ mov r13d,r10d
+ add r14d,ecx
+ shrd r13d,r13d,14
+ mov ecx,r14d
+ mov r12d,r11d
+ shrd r14d,r14d,9
+ xor r13d,r10d
+ xor r12d,eax
+ shrd r13d,r13d,5
+ xor r14d,ecx
+ and r12d,r10d
+ xor r13d,r10d
+ add ebx,DWORD[56+rsp]
+ mov r15d,ecx
+ xor r12d,eax
+ shrd r14d,r14d,11
+ xor r15d,edx
+ add ebx,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,ecx
+ add ebx,r13d
+ xor edi,edx
+ shrd r14d,r14d,2
+ add r9d,ebx
+ add ebx,edi
+ mov r13d,r9d
+ add r14d,ebx
+ shrd r13d,r13d,14
+ mov ebx,r14d
+ mov r12d,r10d
+ shrd r14d,r14d,9
+ xor r13d,r9d
+ xor r12d,r11d
+ shrd r13d,r13d,5
+ xor r14d,ebx
+ and r12d,r9d
+ xor r13d,r9d
+ add eax,DWORD[60+rsp]
+ mov edi,ebx
+ xor r12d,r11d
+ shrd r14d,r14d,11
+ xor edi,ecx
+ add eax,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,ebx
+ add eax,r13d
+ xor r15d,ecx
+ shrd r14d,r14d,2
+ add r8d,eax
+ add eax,r15d
+ mov r13d,r8d
+ add r14d,eax
+ mov rdi,QWORD[((64+0))+rsp]
+ mov eax,r14d
+
+ add eax,DWORD[rdi]
+ lea rsi,[64+rsi]
+ add ebx,DWORD[4+rdi]
+ add ecx,DWORD[8+rdi]
+ add edx,DWORD[12+rdi]
+ add r8d,DWORD[16+rdi]
+ add r9d,DWORD[20+rdi]
+ add r10d,DWORD[24+rdi]
+ add r11d,DWORD[28+rdi]
+
+ cmp rsi,QWORD[((64+16))+rsp]
+
+ mov DWORD[rdi],eax
+ mov DWORD[4+rdi],ebx
+ mov DWORD[8+rdi],ecx
+ mov DWORD[12+rdi],edx
+ mov DWORD[16+rdi],r8d
+ mov DWORD[20+rdi],r9d
+ mov DWORD[24+rdi],r10d
+ mov DWORD[28+rdi],r11d
+ jb NEAR $L$loop_avx
+
+ mov rsi,QWORD[88+rsp]
+
+ vzeroupper
+ movaps xmm6,XMMWORD[((64+32))+rsp]
+ movaps xmm7,XMMWORD[((64+48))+rsp]
+ movaps xmm8,XMMWORD[((64+64))+rsp]
+ movaps xmm9,XMMWORD[((64+80))+rsp]
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$epilogue_avx:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_sha256_block_data_order_avx:
+EXTERN __imp_RtlVirtualUnwind
+
+ALIGN 16
+se_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ mov rsi,QWORD[8+r9]
+ mov r11,QWORD[56+r9]
+
+ mov r10d,DWORD[r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$in_prologue
+
+ mov rax,QWORD[152+r8]
+
+ mov r10d,DWORD[4+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$in_prologue
+ mov rsi,rax
+ mov rax,QWORD[((64+24))+rax]
+
+ mov rbx,QWORD[((-8))+rax]
+ mov rbp,QWORD[((-16))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov r15,QWORD[((-48))+rax]
+ mov QWORD[144+r8],rbx
+ mov QWORD[160+r8],rbp
+ mov QWORD[216+r8],r12
+ mov QWORD[224+r8],r13
+ mov QWORD[232+r8],r14
+ mov QWORD[240+r8],r15
+
+ lea r10,[$L$epilogue]
+ cmp rbx,r10
+ jb NEAR $L$in_prologue
+
+ lea rsi,[((64+32))+rsi]
+ lea rdi,[512+r8]
+ mov ecx,8
+ DD 0xa548f3fc
+
+$L$in_prologue:
+ mov rdi,QWORD[8+rax]
+ mov rsi,QWORD[16+rax]
+ mov QWORD[152+r8],rax
+ mov QWORD[168+r8],rsi
+ mov QWORD[176+r8],rdi
+
+ mov rdi,QWORD[40+r9]
+ mov rsi,r8
+ mov ecx,154
+ DD 0xa548f3fc
+
+ mov rsi,r9
+ xor rcx,rcx
+ mov rdx,QWORD[8+rsi]
+ mov r8,QWORD[rsi]
+ mov r9,QWORD[16+rsi]
+ mov r10,QWORD[40+rsi]
+ lea r11,[56+rsi]
+ lea r12,[24+rsi]
+ mov QWORD[32+rsp],r10
+ mov QWORD[40+rsp],r11
+ mov QWORD[48+rsp],r12
+ mov QWORD[56+rsp],rcx
+ call QWORD[__imp_RtlVirtualUnwind]
+
+ mov eax,1
+ add rsp,64
+ popfq
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ ret
+
+
+ALIGN 16
+shaext_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ lea r10,[$L$prologue_shaext]
+ cmp rbx,r10
+ jb NEAR $L$in_prologue
+
+ lea r10,[$L$epilogue_shaext]
+ cmp rbx,r10
+ jae NEAR $L$in_prologue
+
+ lea rsi,[((-8-80))+rax]
+ lea rdi,[512+r8]
+ mov ecx,10
+ DD 0xa548f3fc
+
+ jmp NEAR $L$in_prologue
+
+section .pdata rdata align=4
+ALIGN 4
+ DD $L$SEH_begin_sha256_block_data_order_nohw wrt ..imagebase
+ DD $L$SEH_end_sha256_block_data_order_nohw wrt ..imagebase
+ DD $L$SEH_info_sha256_block_data_order_nohw wrt ..imagebase
+ DD $L$SEH_begin_sha256_block_data_order_hw wrt ..imagebase
+ DD $L$SEH_end_sha256_block_data_order_hw wrt ..imagebase
+ DD $L$SEH_info_sha256_block_data_order_hw wrt ..imagebase
+ DD $L$SEH_begin_sha256_block_data_order_ssse3 wrt ..imagebase
+ DD $L$SEH_end_sha256_block_data_order_ssse3 wrt ..imagebase
+ DD $L$SEH_info_sha256_block_data_order_ssse3 wrt ..imagebase
+ DD $L$SEH_begin_sha256_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_end_sha256_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_info_sha256_block_data_order_avx wrt ..imagebase
+section .xdata rdata align=8
+ALIGN 8
+$L$SEH_info_sha256_block_data_order_nohw:
+ DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase
+$L$SEH_info_sha256_block_data_order_hw:
+ DB 9,0,0,0
+ DD shaext_handler wrt ..imagebase
+$L$SEH_info_sha256_block_data_order_ssse3:
+ DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase
+$L$SEH_info_sha256_block_data_order_avx:
+ DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/sha512-586-apple.S b/gen/bcm/sha512-586-apple.S
new file mode 100644
index 0000000..cfdeac1
--- /dev/null
+++ b/gen/bcm/sha512-586-apple.S
@@ -0,0 +1,2837 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl _sha512_block_data_order
+.private_extern _sha512_block_data_order
+.align 4
+_sha512_block_data_order:
+L_sha512_block_data_order_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl %esp,%ebx
+ call L000pic_point
+L000pic_point:
+ popl %ebp
+ leal L001K512-L000pic_point(%ebp),%ebp
+ subl $16,%esp
+ andl $-64,%esp
+ shll $7,%eax
+ addl %edi,%eax
+ movl %esi,(%esp)
+ movl %edi,4(%esp)
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L001K512(%ebp),%edx
+ movl (%edx),%ecx
+ testl $67108864,%ecx
+ jz L002loop_x86
+ movl 4(%edx),%edx
+ movq (%esi),%mm0
+ andl $16777216,%ecx
+ movq 8(%esi),%mm1
+ andl $512,%edx
+ movq 16(%esi),%mm2
+ orl %edx,%ecx
+ movq 24(%esi),%mm3
+ movq 32(%esi),%mm4
+ movq 40(%esi),%mm5
+ movq 48(%esi),%mm6
+ movq 56(%esi),%mm7
+ cmpl $16777728,%ecx
+ je L003SSSE3
+ subl $80,%esp
+ jmp L004loop_sse2
+.align 4,0x90
+L004loop_sse2:
+ movq %mm1,8(%esp)
+ movq %mm2,16(%esp)
+ movq %mm3,24(%esp)
+ movq %mm5,40(%esp)
+ movq %mm6,48(%esp)
+ pxor %mm1,%mm2
+ movq %mm7,56(%esp)
+ movq %mm0,%mm3
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ addl $8,%edi
+ movl $15,%edx
+ bswap %eax
+ bswap %ebx
+ jmp L00500_14_sse2
+.align 4,0x90
+L00500_14_sse2:
+ movd %eax,%mm1
+ movl (%edi),%eax
+ movd %ebx,%mm7
+ movl 4(%edi),%ebx
+ addl $8,%edi
+ bswap %eax
+ bswap %ebx
+ punpckldq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm3,%mm0
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm2,%mm3
+ movq %mm0,%mm2
+ addl $8,%ebp
+ paddq %mm6,%mm3
+ movq 48(%esp),%mm6
+ decl %edx
+ jnz L00500_14_sse2
+ movd %eax,%mm1
+ movd %ebx,%mm7
+ punpckldq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm3,%mm0
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm2,%mm3
+ movq %mm0,%mm2
+ addl $8,%ebp
+ paddq %mm6,%mm3
+ pxor %mm0,%mm0
+ movl $32,%edx
+ jmp L00616_79_sse2
+.align 4,0x90
+L00616_79_sse2:
+ movq 88(%esp),%mm5
+ movq %mm7,%mm1
+ psrlq $1,%mm7
+ movq %mm5,%mm6
+ psrlq $6,%mm5
+ psllq $56,%mm1
+ paddq %mm3,%mm0
+ movq %mm7,%mm3
+ psrlq $6,%mm7
+ pxor %mm1,%mm3
+ psllq $7,%mm1
+ pxor %mm7,%mm3
+ psrlq $1,%mm7
+ pxor %mm1,%mm3
+ movq %mm5,%mm1
+ psrlq $13,%mm5
+ pxor %mm3,%mm7
+ psllq $3,%mm6
+ pxor %mm5,%mm1
+ paddq 200(%esp),%mm7
+ pxor %mm6,%mm1
+ psrlq $42,%mm5
+ paddq 128(%esp),%mm7
+ pxor %mm5,%mm1
+ psllq $42,%mm6
+ movq 40(%esp),%mm5
+ pxor %mm6,%mm1
+ movq 48(%esp),%mm6
+ paddq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm6,%mm2
+ addl $8,%ebp
+ movq 88(%esp),%mm5
+ movq %mm7,%mm1
+ psrlq $1,%mm7
+ movq %mm5,%mm6
+ psrlq $6,%mm5
+ psllq $56,%mm1
+ paddq %mm3,%mm2
+ movq %mm7,%mm3
+ psrlq $6,%mm7
+ pxor %mm1,%mm3
+ psllq $7,%mm1
+ pxor %mm7,%mm3
+ psrlq $1,%mm7
+ pxor %mm1,%mm3
+ movq %mm5,%mm1
+ psrlq $13,%mm5
+ pxor %mm3,%mm7
+ psllq $3,%mm6
+ pxor %mm5,%mm1
+ paddq 200(%esp),%mm7
+ pxor %mm6,%mm1
+ psrlq $42,%mm5
+ paddq 128(%esp),%mm7
+ pxor %mm5,%mm1
+ psllq $42,%mm6
+ movq 40(%esp),%mm5
+ pxor %mm6,%mm1
+ movq 48(%esp),%mm6
+ paddq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm6,%mm0
+ addl $8,%ebp
+ decl %edx
+ jnz L00616_79_sse2
+ paddq %mm3,%mm0
+ movq 8(%esp),%mm1
+ movq 24(%esp),%mm3
+ movq 40(%esp),%mm5
+ movq 48(%esp),%mm6
+ movq 56(%esp),%mm7
+ pxor %mm1,%mm2
+ paddq (%esi),%mm0
+ paddq 8(%esi),%mm1
+ paddq 16(%esi),%mm2
+ paddq 24(%esi),%mm3
+ paddq 32(%esi),%mm4
+ paddq 40(%esi),%mm5
+ paddq 48(%esi),%mm6
+ paddq 56(%esi),%mm7
+ movl $640,%eax
+ movq %mm0,(%esi)
+ movq %mm1,8(%esi)
+ movq %mm2,16(%esi)
+ movq %mm3,24(%esi)
+ movq %mm4,32(%esi)
+ movq %mm5,40(%esi)
+ movq %mm6,48(%esi)
+ movq %mm7,56(%esi)
+ leal (%esp,%eax,1),%esp
+ subl %eax,%ebp
+ cmpl 88(%esp),%edi
+ jb L004loop_sse2
+ movl 92(%esp),%esp
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 5,0x90
+L003SSSE3:
+ leal -64(%esp),%edx
+ subl $256,%esp
+ movdqa 640(%ebp),%xmm1
+ movdqu (%edi),%xmm0
+.byte 102,15,56,0,193
+ movdqa (%ebp),%xmm3
+ movdqa %xmm1,%xmm2
+ movdqu 16(%edi),%xmm1
+ paddq %xmm0,%xmm3
+.byte 102,15,56,0,202
+ movdqa %xmm3,-128(%edx)
+ movdqa 16(%ebp),%xmm4
+ movdqa %xmm2,%xmm3
+ movdqu 32(%edi),%xmm2
+ paddq %xmm1,%xmm4
+.byte 102,15,56,0,211
+ movdqa %xmm4,-112(%edx)
+ movdqa 32(%ebp),%xmm5
+ movdqa %xmm3,%xmm4
+ movdqu 48(%edi),%xmm3
+ paddq %xmm2,%xmm5
+.byte 102,15,56,0,220
+ movdqa %xmm5,-96(%edx)
+ movdqa 48(%ebp),%xmm6
+ movdqa %xmm4,%xmm5
+ movdqu 64(%edi),%xmm4
+ paddq %xmm3,%xmm6
+.byte 102,15,56,0,229
+ movdqa %xmm6,-80(%edx)
+ movdqa 64(%ebp),%xmm7
+ movdqa %xmm5,%xmm6
+ movdqu 80(%edi),%xmm5
+ paddq %xmm4,%xmm7
+.byte 102,15,56,0,238
+ movdqa %xmm7,-64(%edx)
+ movdqa %xmm0,(%edx)
+ movdqa 80(%ebp),%xmm0
+ movdqa %xmm6,%xmm7
+ movdqu 96(%edi),%xmm6
+ paddq %xmm5,%xmm0
+.byte 102,15,56,0,247
+ movdqa %xmm0,-48(%edx)
+ movdqa %xmm1,16(%edx)
+ movdqa 96(%ebp),%xmm1
+ movdqa %xmm7,%xmm0
+ movdqu 112(%edi),%xmm7
+ paddq %xmm6,%xmm1
+.byte 102,15,56,0,248
+ movdqa %xmm1,-32(%edx)
+ movdqa %xmm2,32(%edx)
+ movdqa 112(%ebp),%xmm2
+ movdqa (%edx),%xmm0
+ paddq %xmm7,%xmm2
+ movdqa %xmm2,-16(%edx)
+ nop
+.align 5,0x90
+L007loop_ssse3:
+ movdqa 16(%edx),%xmm2
+ movdqa %xmm3,48(%edx)
+ leal 128(%ebp),%ebp
+ movq %mm1,8(%esp)
+ movl %edi,%ebx
+ movq %mm2,16(%esp)
+ leal 128(%edi),%edi
+ movq %mm3,24(%esp)
+ cmpl %eax,%edi
+ movq %mm5,40(%esp)
+ cmovbl %edi,%ebx
+ movq %mm6,48(%esp)
+ movl $4,%ecx
+ pxor %mm1,%mm2
+ movq %mm7,56(%esp)
+ pxor %mm3,%mm3
+ jmp L00800_47_ssse3
+.align 5,0x90
+L00800_47_ssse3:
+ movdqa %xmm5,%xmm3
+ movdqa %xmm2,%xmm1
+.byte 102,15,58,15,208,8
+ movdqa %xmm4,(%edx)
+.byte 102,15,58,15,220,8
+ movdqa %xmm2,%xmm4
+ psrlq $7,%xmm2
+ paddq %xmm3,%xmm0
+ movdqa %xmm4,%xmm3
+ psrlq $1,%xmm4
+ psllq $56,%xmm3
+ pxor %xmm4,%xmm2
+ psrlq $7,%xmm4
+ pxor %xmm3,%xmm2
+ psllq $7,%xmm3
+ pxor %xmm4,%xmm2
+ movdqa %xmm7,%xmm4
+ pxor %xmm3,%xmm2
+ movdqa %xmm7,%xmm3
+ psrlq $6,%xmm4
+ paddq %xmm2,%xmm0
+ movdqa %xmm7,%xmm2
+ psrlq $19,%xmm3
+ psllq $3,%xmm2
+ pxor %xmm3,%xmm4
+ psrlq $42,%xmm3
+ pxor %xmm2,%xmm4
+ psllq $42,%xmm2
+ pxor %xmm3,%xmm4
+ movdqa 32(%edx),%xmm3
+ pxor %xmm2,%xmm4
+ movdqa (%ebp),%xmm2
+ movq %mm4,%mm1
+ paddq %xmm4,%xmm0
+ movq -128(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ paddq %xmm0,%xmm2
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -120(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm2,-128(%edx)
+ movdqa %xmm6,%xmm4
+ movdqa %xmm3,%xmm2
+.byte 102,15,58,15,217,8
+ movdqa %xmm5,16(%edx)
+.byte 102,15,58,15,229,8
+ movdqa %xmm3,%xmm5
+ psrlq $7,%xmm3
+ paddq %xmm4,%xmm1
+ movdqa %xmm5,%xmm4
+ psrlq $1,%xmm5
+ psllq $56,%xmm4
+ pxor %xmm5,%xmm3
+ psrlq $7,%xmm5
+ pxor %xmm4,%xmm3
+ psllq $7,%xmm4
+ pxor %xmm5,%xmm3
+ movdqa %xmm0,%xmm5
+ pxor %xmm4,%xmm3
+ movdqa %xmm0,%xmm4
+ psrlq $6,%xmm5
+ paddq %xmm3,%xmm1
+ movdqa %xmm0,%xmm3
+ psrlq $19,%xmm4
+ psllq $3,%xmm3
+ pxor %xmm4,%xmm5
+ psrlq $42,%xmm4
+ pxor %xmm3,%xmm5
+ psllq $42,%xmm3
+ pxor %xmm4,%xmm5
+ movdqa 48(%edx),%xmm4
+ pxor %xmm3,%xmm5
+ movdqa 16(%ebp),%xmm3
+ movq %mm4,%mm1
+ paddq %xmm5,%xmm1
+ movq -112(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ paddq %xmm1,%xmm3
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -104(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm3,-112(%edx)
+ movdqa %xmm7,%xmm5
+ movdqa %xmm4,%xmm3
+.byte 102,15,58,15,226,8
+ movdqa %xmm6,32(%edx)
+.byte 102,15,58,15,238,8
+ movdqa %xmm4,%xmm6
+ psrlq $7,%xmm4
+ paddq %xmm5,%xmm2
+ movdqa %xmm6,%xmm5
+ psrlq $1,%xmm6
+ psllq $56,%xmm5
+ pxor %xmm6,%xmm4
+ psrlq $7,%xmm6
+ pxor %xmm5,%xmm4
+ psllq $7,%xmm5
+ pxor %xmm6,%xmm4
+ movdqa %xmm1,%xmm6
+ pxor %xmm5,%xmm4
+ movdqa %xmm1,%xmm5
+ psrlq $6,%xmm6
+ paddq %xmm4,%xmm2
+ movdqa %xmm1,%xmm4
+ psrlq $19,%xmm5
+ psllq $3,%xmm4
+ pxor %xmm5,%xmm6
+ psrlq $42,%xmm5
+ pxor %xmm4,%xmm6
+ psllq $42,%xmm4
+ pxor %xmm5,%xmm6
+ movdqa (%edx),%xmm5
+ pxor %xmm4,%xmm6
+ movdqa 32(%ebp),%xmm4
+ movq %mm4,%mm1
+ paddq %xmm6,%xmm2
+ movq -96(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ paddq %xmm2,%xmm4
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -88(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm4,-96(%edx)
+ movdqa %xmm0,%xmm6
+ movdqa %xmm5,%xmm4
+.byte 102,15,58,15,235,8
+ movdqa %xmm7,48(%edx)
+.byte 102,15,58,15,247,8
+ movdqa %xmm5,%xmm7
+ psrlq $7,%xmm5
+ paddq %xmm6,%xmm3
+ movdqa %xmm7,%xmm6
+ psrlq $1,%xmm7
+ psllq $56,%xmm6
+ pxor %xmm7,%xmm5
+ psrlq $7,%xmm7
+ pxor %xmm6,%xmm5
+ psllq $7,%xmm6
+ pxor %xmm7,%xmm5
+ movdqa %xmm2,%xmm7
+ pxor %xmm6,%xmm5
+ movdqa %xmm2,%xmm6
+ psrlq $6,%xmm7
+ paddq %xmm5,%xmm3
+ movdqa %xmm2,%xmm5
+ psrlq $19,%xmm6
+ psllq $3,%xmm5
+ pxor %xmm6,%xmm7
+ psrlq $42,%xmm6
+ pxor %xmm5,%xmm7
+ psllq $42,%xmm5
+ pxor %xmm6,%xmm7
+ movdqa 16(%edx),%xmm6
+ pxor %xmm5,%xmm7
+ movdqa 48(%ebp),%xmm5
+ movq %mm4,%mm1
+ paddq %xmm7,%xmm3
+ movq -80(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ paddq %xmm3,%xmm5
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -72(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm5,-80(%edx)
+ movdqa %xmm1,%xmm7
+ movdqa %xmm6,%xmm5
+.byte 102,15,58,15,244,8
+ movdqa %xmm0,(%edx)
+.byte 102,15,58,15,248,8
+ movdqa %xmm6,%xmm0
+ psrlq $7,%xmm6
+ paddq %xmm7,%xmm4
+ movdqa %xmm0,%xmm7
+ psrlq $1,%xmm0
+ psllq $56,%xmm7
+ pxor %xmm0,%xmm6
+ psrlq $7,%xmm0
+ pxor %xmm7,%xmm6
+ psllq $7,%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm3,%xmm0
+ pxor %xmm7,%xmm6
+ movdqa %xmm3,%xmm7
+ psrlq $6,%xmm0
+ paddq %xmm6,%xmm4
+ movdqa %xmm3,%xmm6
+ psrlq $19,%xmm7
+ psllq $3,%xmm6
+ pxor %xmm7,%xmm0
+ psrlq $42,%xmm7
+ pxor %xmm6,%xmm0
+ psllq $42,%xmm6
+ pxor %xmm7,%xmm0
+ movdqa 32(%edx),%xmm7
+ pxor %xmm6,%xmm0
+ movdqa 64(%ebp),%xmm6
+ movq %mm4,%mm1
+ paddq %xmm0,%xmm4
+ movq -64(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ paddq %xmm4,%xmm6
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -56(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm6,-64(%edx)
+ movdqa %xmm2,%xmm0
+ movdqa %xmm7,%xmm6
+.byte 102,15,58,15,253,8
+ movdqa %xmm1,16(%edx)
+.byte 102,15,58,15,193,8
+ movdqa %xmm7,%xmm1
+ psrlq $7,%xmm7
+ paddq %xmm0,%xmm5
+ movdqa %xmm1,%xmm0
+ psrlq $1,%xmm1
+ psllq $56,%xmm0
+ pxor %xmm1,%xmm7
+ psrlq $7,%xmm1
+ pxor %xmm0,%xmm7
+ psllq $7,%xmm0
+ pxor %xmm1,%xmm7
+ movdqa %xmm4,%xmm1
+ pxor %xmm0,%xmm7
+ movdqa %xmm4,%xmm0
+ psrlq $6,%xmm1
+ paddq %xmm7,%xmm5
+ movdqa %xmm4,%xmm7
+ psrlq $19,%xmm0
+ psllq $3,%xmm7
+ pxor %xmm0,%xmm1
+ psrlq $42,%xmm0
+ pxor %xmm7,%xmm1
+ psllq $42,%xmm7
+ pxor %xmm0,%xmm1
+ movdqa 48(%edx),%xmm0
+ pxor %xmm7,%xmm1
+ movdqa 80(%ebp),%xmm7
+ movq %mm4,%mm1
+ paddq %xmm1,%xmm5
+ movq -48(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ paddq %xmm5,%xmm7
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -40(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm7,-48(%edx)
+ movdqa %xmm3,%xmm1
+ movdqa %xmm0,%xmm7
+.byte 102,15,58,15,198,8
+ movdqa %xmm2,32(%edx)
+.byte 102,15,58,15,202,8
+ movdqa %xmm0,%xmm2
+ psrlq $7,%xmm0
+ paddq %xmm1,%xmm6
+ movdqa %xmm2,%xmm1
+ psrlq $1,%xmm2
+ psllq $56,%xmm1
+ pxor %xmm2,%xmm0
+ psrlq $7,%xmm2
+ pxor %xmm1,%xmm0
+ psllq $7,%xmm1
+ pxor %xmm2,%xmm0
+ movdqa %xmm5,%xmm2
+ pxor %xmm1,%xmm0
+ movdqa %xmm5,%xmm1
+ psrlq $6,%xmm2
+ paddq %xmm0,%xmm6
+ movdqa %xmm5,%xmm0
+ psrlq $19,%xmm1
+ psllq $3,%xmm0
+ pxor %xmm1,%xmm2
+ psrlq $42,%xmm1
+ pxor %xmm0,%xmm2
+ psllq $42,%xmm0
+ pxor %xmm1,%xmm2
+ movdqa (%edx),%xmm1
+ pxor %xmm0,%xmm2
+ movdqa 96(%ebp),%xmm0
+ movq %mm4,%mm1
+ paddq %xmm2,%xmm6
+ movq -32(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ paddq %xmm6,%xmm0
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -24(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm0,-32(%edx)
+ movdqa %xmm4,%xmm2
+ movdqa %xmm1,%xmm0
+.byte 102,15,58,15,207,8
+ movdqa %xmm3,48(%edx)
+.byte 102,15,58,15,211,8
+ movdqa %xmm1,%xmm3
+ psrlq $7,%xmm1
+ paddq %xmm2,%xmm7
+ movdqa %xmm3,%xmm2
+ psrlq $1,%xmm3
+ psllq $56,%xmm2
+ pxor %xmm3,%xmm1
+ psrlq $7,%xmm3
+ pxor %xmm2,%xmm1
+ psllq $7,%xmm2
+ pxor %xmm3,%xmm1
+ movdqa %xmm6,%xmm3
+ pxor %xmm2,%xmm1
+ movdqa %xmm6,%xmm2
+ psrlq $6,%xmm3
+ paddq %xmm1,%xmm7
+ movdqa %xmm6,%xmm1
+ psrlq $19,%xmm2
+ psllq $3,%xmm1
+ pxor %xmm2,%xmm3
+ psrlq $42,%xmm2
+ pxor %xmm1,%xmm3
+ psllq $42,%xmm1
+ pxor %xmm2,%xmm3
+ movdqa 16(%edx),%xmm2
+ pxor %xmm1,%xmm3
+ movdqa 112(%ebp),%xmm1
+ movq %mm4,%mm1
+ paddq %xmm3,%xmm7
+ movq -16(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ paddq %xmm7,%xmm1
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -8(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm1,-16(%edx)
+ leal 128(%ebp),%ebp
+ decl %ecx
+ jnz L00800_47_ssse3
+ movdqa (%ebp),%xmm1
+ leal -640(%ebp),%ebp
+ movdqu (%ebx),%xmm0
+.byte 102,15,56,0,193
+ movdqa (%ebp),%xmm3
+ movdqa %xmm1,%xmm2
+ movdqu 16(%ebx),%xmm1
+ paddq %xmm0,%xmm3
+.byte 102,15,56,0,202
+ movq %mm4,%mm1
+ movq -128(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -120(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm3,-128(%edx)
+ movdqa 16(%ebp),%xmm4
+ movdqa %xmm2,%xmm3
+ movdqu 32(%ebx),%xmm2
+ paddq %xmm1,%xmm4
+.byte 102,15,56,0,211
+ movq %mm4,%mm1
+ movq -112(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -104(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm4,-112(%edx)
+ movdqa 32(%ebp),%xmm5
+ movdqa %xmm3,%xmm4
+ movdqu 48(%ebx),%xmm3
+ paddq %xmm2,%xmm5
+.byte 102,15,56,0,220
+ movq %mm4,%mm1
+ movq -96(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -88(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm5,-96(%edx)
+ movdqa 48(%ebp),%xmm6
+ movdqa %xmm4,%xmm5
+ movdqu 64(%ebx),%xmm4
+ paddq %xmm3,%xmm6
+.byte 102,15,56,0,229
+ movq %mm4,%mm1
+ movq -80(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -72(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm6,-80(%edx)
+ movdqa 64(%ebp),%xmm7
+ movdqa %xmm5,%xmm6
+ movdqu 80(%ebx),%xmm5
+ paddq %xmm4,%xmm7
+.byte 102,15,56,0,238
+ movq %mm4,%mm1
+ movq -64(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -56(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm7,-64(%edx)
+ movdqa %xmm0,(%edx)
+ movdqa 80(%ebp),%xmm0
+ movdqa %xmm6,%xmm7
+ movdqu 96(%ebx),%xmm6
+ paddq %xmm5,%xmm0
+.byte 102,15,56,0,247
+ movq %mm4,%mm1
+ movq -48(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -40(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm0,-48(%edx)
+ movdqa %xmm1,16(%edx)
+ movdqa 96(%ebp),%xmm1
+ movdqa %xmm7,%xmm0
+ movdqu 112(%ebx),%xmm7
+ paddq %xmm6,%xmm1
+.byte 102,15,56,0,248
+ movq %mm4,%mm1
+ movq -32(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -24(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm1,-32(%edx)
+ movdqa %xmm2,32(%edx)
+ movdqa 112(%ebp),%xmm2
+ movdqa (%edx),%xmm0
+ paddq %xmm7,%xmm2
+ movq %mm4,%mm1
+ movq -16(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -8(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm2,-16(%edx)
+ movq 8(%esp),%mm1
+ paddq %mm3,%mm0
+ movq 24(%esp),%mm3
+ movq 56(%esp),%mm7
+ pxor %mm1,%mm2
+ paddq (%esi),%mm0
+ paddq 8(%esi),%mm1
+ paddq 16(%esi),%mm2
+ paddq 24(%esi),%mm3
+ paddq 32(%esi),%mm4
+ paddq 40(%esi),%mm5
+ paddq 48(%esi),%mm6
+ paddq 56(%esi),%mm7
+ movq %mm0,(%esi)
+ movq %mm1,8(%esi)
+ movq %mm2,16(%esi)
+ movq %mm3,24(%esi)
+ movq %mm4,32(%esi)
+ movq %mm5,40(%esi)
+ movq %mm6,48(%esi)
+ movq %mm7,56(%esi)
+ cmpl %eax,%edi
+ jb L007loop_ssse3
+ movl 76(%edx),%esp
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 4,0x90
+L002loop_x86:
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 16(%edi),%eax
+ movl 20(%edi),%ebx
+ movl 24(%edi),%ecx
+ movl 28(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 32(%edi),%eax
+ movl 36(%edi),%ebx
+ movl 40(%edi),%ecx
+ movl 44(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 48(%edi),%eax
+ movl 52(%edi),%ebx
+ movl 56(%edi),%ecx
+ movl 60(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 64(%edi),%eax
+ movl 68(%edi),%ebx
+ movl 72(%edi),%ecx
+ movl 76(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 80(%edi),%eax
+ movl 84(%edi),%ebx
+ movl 88(%edi),%ecx
+ movl 92(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 96(%edi),%eax
+ movl 100(%edi),%ebx
+ movl 104(%edi),%ecx
+ movl 108(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 112(%edi),%eax
+ movl 116(%edi),%ebx
+ movl 120(%edi),%ecx
+ movl 124(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ addl $128,%edi
+ subl $72,%esp
+ movl %edi,204(%esp)
+ leal 8(%esp),%edi
+ movl $16,%ecx
+.long 2784229001
+.align 4,0x90
+L00900_15_x86:
+ movl 40(%esp),%ecx
+ movl 44(%esp),%edx
+ movl %ecx,%esi
+ shrl $9,%ecx
+ movl %edx,%edi
+ shrl $9,%edx
+ movl %ecx,%ebx
+ shll $14,%esi
+ movl %edx,%eax
+ shll $14,%edi
+ xorl %esi,%ebx
+ shrl $5,%ecx
+ xorl %edi,%eax
+ shrl $5,%edx
+ xorl %ecx,%eax
+ shll $4,%esi
+ xorl %edx,%ebx
+ shll $4,%edi
+ xorl %esi,%ebx
+ shrl $4,%ecx
+ xorl %edi,%eax
+ shrl $4,%edx
+ xorl %ecx,%eax
+ shll $5,%esi
+ xorl %edx,%ebx
+ shll $5,%edi
+ xorl %esi,%eax
+ xorl %edi,%ebx
+ movl 48(%esp),%ecx
+ movl 52(%esp),%edx
+ movl 56(%esp),%esi
+ movl 60(%esp),%edi
+ addl 64(%esp),%eax
+ adcl 68(%esp),%ebx
+ xorl %esi,%ecx
+ xorl %edi,%edx
+ andl 40(%esp),%ecx
+ andl 44(%esp),%edx
+ addl 192(%esp),%eax
+ adcl 196(%esp),%ebx
+ xorl %esi,%ecx
+ xorl %edi,%edx
+ movl (%ebp),%esi
+ movl 4(%ebp),%edi
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl 32(%esp),%ecx
+ movl 36(%esp),%edx
+ addl %esi,%eax
+ adcl %edi,%ebx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,%esi
+ shrl $2,%ecx
+ movl %edx,%edi
+ shrl $2,%edx
+ movl %ecx,%ebx
+ shll $4,%esi
+ movl %edx,%eax
+ shll $4,%edi
+ xorl %esi,%ebx
+ shrl $5,%ecx
+ xorl %edi,%eax
+ shrl $5,%edx
+ xorl %ecx,%ebx
+ shll $21,%esi
+ xorl %edx,%eax
+ shll $21,%edi
+ xorl %esi,%eax
+ shrl $21,%ecx
+ xorl %edi,%ebx
+ shrl $21,%edx
+ xorl %ecx,%eax
+ shll $5,%esi
+ xorl %edx,%ebx
+ shll $5,%edi
+ xorl %esi,%eax
+ xorl %edi,%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ movl 16(%esp),%esi
+ movl 20(%esp),%edi
+ addl (%esp),%eax
+ adcl 4(%esp),%ebx
+ orl %esi,%ecx
+ orl %edi,%edx
+ andl 24(%esp),%ecx
+ andl 28(%esp),%edx
+ andl 8(%esp),%esi
+ andl 12(%esp),%edi
+ orl %esi,%ecx
+ orl %edi,%edx
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ movb (%ebp),%dl
+ subl $8,%esp
+ leal 8(%ebp),%ebp
+ cmpb $148,%dl
+ jne L00900_15_x86
+.align 4,0x90
+L01016_79_x86:
+ movl 312(%esp),%ecx
+ movl 316(%esp),%edx
+ movl %ecx,%esi
+ shrl $1,%ecx
+ movl %edx,%edi
+ shrl $1,%edx
+ movl %ecx,%eax
+ shll $24,%esi
+ movl %edx,%ebx
+ shll $24,%edi
+ xorl %esi,%ebx
+ shrl $6,%ecx
+ xorl %edi,%eax
+ shrl $6,%edx
+ xorl %ecx,%eax
+ shll $7,%esi
+ xorl %edx,%ebx
+ shll $1,%edi
+ xorl %esi,%ebx
+ shrl $1,%ecx
+ xorl %edi,%eax
+ shrl $1,%edx
+ xorl %ecx,%eax
+ shll $6,%edi
+ xorl %edx,%ebx
+ xorl %edi,%eax
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ movl 208(%esp),%ecx
+ movl 212(%esp),%edx
+ movl %ecx,%esi
+ shrl $6,%ecx
+ movl %edx,%edi
+ shrl $6,%edx
+ movl %ecx,%eax
+ shll $3,%esi
+ movl %edx,%ebx
+ shll $3,%edi
+ xorl %esi,%eax
+ shrl $13,%ecx
+ xorl %edi,%ebx
+ shrl $13,%edx
+ xorl %ecx,%eax
+ shll $10,%esi
+ xorl %edx,%ebx
+ shll $10,%edi
+ xorl %esi,%ebx
+ shrl $10,%ecx
+ xorl %edi,%eax
+ shrl $10,%edx
+ xorl %ecx,%ebx
+ shll $13,%edi
+ xorl %edx,%eax
+ xorl %edi,%eax
+ movl 320(%esp),%ecx
+ movl 324(%esp),%edx
+ addl (%esp),%eax
+ adcl 4(%esp),%ebx
+ movl 248(%esp),%esi
+ movl 252(%esp),%edi
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ addl %esi,%eax
+ adcl %edi,%ebx
+ movl %eax,192(%esp)
+ movl %ebx,196(%esp)
+ movl 40(%esp),%ecx
+ movl 44(%esp),%edx
+ movl %ecx,%esi
+ shrl $9,%ecx
+ movl %edx,%edi
+ shrl $9,%edx
+ movl %ecx,%ebx
+ shll $14,%esi
+ movl %edx,%eax
+ shll $14,%edi
+ xorl %esi,%ebx
+ shrl $5,%ecx
+ xorl %edi,%eax
+ shrl $5,%edx
+ xorl %ecx,%eax
+ shll $4,%esi
+ xorl %edx,%ebx
+ shll $4,%edi
+ xorl %esi,%ebx
+ shrl $4,%ecx
+ xorl %edi,%eax
+ shrl $4,%edx
+ xorl %ecx,%eax
+ shll $5,%esi
+ xorl %edx,%ebx
+ shll $5,%edi
+ xorl %esi,%eax
+ xorl %edi,%ebx
+ movl 48(%esp),%ecx
+ movl 52(%esp),%edx
+ movl 56(%esp),%esi
+ movl 60(%esp),%edi
+ addl 64(%esp),%eax
+ adcl 68(%esp),%ebx
+ xorl %esi,%ecx
+ xorl %edi,%edx
+ andl 40(%esp),%ecx
+ andl 44(%esp),%edx
+ addl 192(%esp),%eax
+ adcl 196(%esp),%ebx
+ xorl %esi,%ecx
+ xorl %edi,%edx
+ movl (%ebp),%esi
+ movl 4(%ebp),%edi
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl 32(%esp),%ecx
+ movl 36(%esp),%edx
+ addl %esi,%eax
+ adcl %edi,%ebx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,%esi
+ shrl $2,%ecx
+ movl %edx,%edi
+ shrl $2,%edx
+ movl %ecx,%ebx
+ shll $4,%esi
+ movl %edx,%eax
+ shll $4,%edi
+ xorl %esi,%ebx
+ shrl $5,%ecx
+ xorl %edi,%eax
+ shrl $5,%edx
+ xorl %ecx,%ebx
+ shll $21,%esi
+ xorl %edx,%eax
+ shll $21,%edi
+ xorl %esi,%eax
+ shrl $21,%ecx
+ xorl %edi,%ebx
+ shrl $21,%edx
+ xorl %ecx,%eax
+ shll $5,%esi
+ xorl %edx,%ebx
+ shll $5,%edi
+ xorl %esi,%eax
+ xorl %edi,%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ movl 16(%esp),%esi
+ movl 20(%esp),%edi
+ addl (%esp),%eax
+ adcl 4(%esp),%ebx
+ orl %esi,%ecx
+ orl %edi,%edx
+ andl 24(%esp),%ecx
+ andl 28(%esp),%edx
+ andl 8(%esp),%esi
+ andl 12(%esp),%edi
+ orl %esi,%ecx
+ orl %edi,%edx
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ movb (%ebp),%dl
+ subl $8,%esp
+ leal 8(%ebp),%ebp
+ cmpb $23,%dl
+ jne L01016_79_x86
+ movl 840(%esp),%esi
+ movl 844(%esp),%edi
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ addl 8(%esp),%eax
+ adcl 12(%esp),%ebx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ addl 16(%esp),%ecx
+ adcl 20(%esp),%edx
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ movl 16(%esi),%eax
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edx
+ addl 24(%esp),%eax
+ adcl 28(%esp),%ebx
+ movl %eax,16(%esi)
+ movl %ebx,20(%esi)
+ addl 32(%esp),%ecx
+ adcl 36(%esp),%edx
+ movl %ecx,24(%esi)
+ movl %edx,28(%esi)
+ movl 32(%esi),%eax
+ movl 36(%esi),%ebx
+ movl 40(%esi),%ecx
+ movl 44(%esi),%edx
+ addl 40(%esp),%eax
+ adcl 44(%esp),%ebx
+ movl %eax,32(%esi)
+ movl %ebx,36(%esi)
+ addl 48(%esp),%ecx
+ adcl 52(%esp),%edx
+ movl %ecx,40(%esi)
+ movl %edx,44(%esi)
+ movl 48(%esi),%eax
+ movl 52(%esi),%ebx
+ movl 56(%esi),%ecx
+ movl 60(%esi),%edx
+ addl 56(%esp),%eax
+ adcl 60(%esp),%ebx
+ movl %eax,48(%esi)
+ movl %ebx,52(%esi)
+ addl 64(%esp),%ecx
+ adcl 68(%esp),%edx
+ movl %ecx,56(%esi)
+ movl %edx,60(%esi)
+ addl $840,%esp
+ subl $640,%ebp
+ cmpl 8(%esp),%edi
+ jb L002loop_x86
+ movl 12(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 6,0x90
+L001K512:
+.long 3609767458,1116352408
+.long 602891725,1899447441
+.long 3964484399,3049323471
+.long 2173295548,3921009573
+.long 4081628472,961987163
+.long 3053834265,1508970993
+.long 2937671579,2453635748
+.long 3664609560,2870763221
+.long 2734883394,3624381080
+.long 1164996542,310598401
+.long 1323610764,607225278
+.long 3590304994,1426881987
+.long 4068182383,1925078388
+.long 991336113,2162078206
+.long 633803317,2614888103
+.long 3479774868,3248222580
+.long 2666613458,3835390401
+.long 944711139,4022224774
+.long 2341262773,264347078
+.long 2007800933,604807628
+.long 1495990901,770255983
+.long 1856431235,1249150122
+.long 3175218132,1555081692
+.long 2198950837,1996064986
+.long 3999719339,2554220882
+.long 766784016,2821834349
+.long 2566594879,2952996808
+.long 3203337956,3210313671
+.long 1034457026,3336571891
+.long 2466948901,3584528711
+.long 3758326383,113926993
+.long 168717936,338241895
+.long 1188179964,666307205
+.long 1546045734,773529912
+.long 1522805485,1294757372
+.long 2643833823,1396182291
+.long 2343527390,1695183700
+.long 1014477480,1986661051
+.long 1206759142,2177026350
+.long 344077627,2456956037
+.long 1290863460,2730485921
+.long 3158454273,2820302411
+.long 3505952657,3259730800
+.long 106217008,3345764771
+.long 3606008344,3516065817
+.long 1432725776,3600352804
+.long 1467031594,4094571909
+.long 851169720,275423344
+.long 3100823752,430227734
+.long 1363258195,506948616
+.long 3750685593,659060556
+.long 3785050280,883997877
+.long 3318307427,958139571
+.long 3812723403,1322822218
+.long 2003034995,1537002063
+.long 3602036899,1747873779
+.long 1575990012,1955562222
+.long 1125592928,2024104815
+.long 2716904306,2227730452
+.long 442776044,2361852424
+.long 593698344,2428436474
+.long 3733110249,2756734187
+.long 2999351573,3204031479
+.long 3815920427,3329325298
+.long 3928383900,3391569614
+.long 566280711,3515267271
+.long 3454069534,3940187606
+.long 4000239992,4118630271
+.long 1914138554,116418474
+.long 2731055270,174292421
+.long 3203993006,289380356
+.long 320620315,460393269
+.long 587496836,685471733
+.long 1086792851,852142971
+.long 365543100,1017036298
+.long 2618297676,1126000580
+.long 3409855158,1288033470
+.long 4234509866,1501505948
+.long 987167468,1607167915
+.long 1246189591,1816402316
+.long 67438087,66051
+.long 202182159,134810123
+.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
+.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte 62,0
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:
+.indirect_symbol _OPENSSL_ia32cap_P
+.long 0
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/sha512-586-linux.S b/gen/bcm/sha512-586-linux.S
new file mode 100644
index 0000000..bb2884d
--- /dev/null
+++ b/gen/bcm/sha512-586-linux.S
@@ -0,0 +1,2835 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl sha512_block_data_order
+.hidden sha512_block_data_order
+.type sha512_block_data_order,@function
+.align 16
+sha512_block_data_order:
+.L_sha512_block_data_order_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl %esp,%ebx
+ call .L000pic_point
+.L000pic_point:
+ popl %ebp
+ leal .L001K512-.L000pic_point(%ebp),%ebp
+ subl $16,%esp
+ andl $-64,%esp
+ shll $7,%eax
+ addl %edi,%eax
+ movl %esi,(%esp)
+ movl %edi,4(%esp)
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ leal OPENSSL_ia32cap_P-.L001K512(%ebp),%edx
+ movl (%edx),%ecx
+ testl $67108864,%ecx
+ jz .L002loop_x86
+ movl 4(%edx),%edx
+ movq (%esi),%mm0
+ andl $16777216,%ecx
+ movq 8(%esi),%mm1
+ andl $512,%edx
+ movq 16(%esi),%mm2
+ orl %edx,%ecx
+ movq 24(%esi),%mm3
+ movq 32(%esi),%mm4
+ movq 40(%esi),%mm5
+ movq 48(%esi),%mm6
+ movq 56(%esi),%mm7
+ cmpl $16777728,%ecx
+ je .L003SSSE3
+ subl $80,%esp
+ jmp .L004loop_sse2
+.align 16
+.L004loop_sse2:
+ movq %mm1,8(%esp)
+ movq %mm2,16(%esp)
+ movq %mm3,24(%esp)
+ movq %mm5,40(%esp)
+ movq %mm6,48(%esp)
+ pxor %mm1,%mm2
+ movq %mm7,56(%esp)
+ movq %mm0,%mm3
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ addl $8,%edi
+ movl $15,%edx
+ bswap %eax
+ bswap %ebx
+ jmp .L00500_14_sse2
+.align 16
+.L00500_14_sse2:
+ movd %eax,%mm1
+ movl (%edi),%eax
+ movd %ebx,%mm7
+ movl 4(%edi),%ebx
+ addl $8,%edi
+ bswap %eax
+ bswap %ebx
+ punpckldq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm3,%mm0
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm2,%mm3
+ movq %mm0,%mm2
+ addl $8,%ebp
+ paddq %mm6,%mm3
+ movq 48(%esp),%mm6
+ decl %edx
+ jnz .L00500_14_sse2
+ movd %eax,%mm1
+ movd %ebx,%mm7
+ punpckldq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm3,%mm0
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm2,%mm3
+ movq %mm0,%mm2
+ addl $8,%ebp
+ paddq %mm6,%mm3
+ pxor %mm0,%mm0
+ movl $32,%edx
+ jmp .L00616_79_sse2
+.align 16
+.L00616_79_sse2:
+ movq 88(%esp),%mm5
+ movq %mm7,%mm1
+ psrlq $1,%mm7
+ movq %mm5,%mm6
+ psrlq $6,%mm5
+ psllq $56,%mm1
+ paddq %mm3,%mm0
+ movq %mm7,%mm3
+ psrlq $6,%mm7
+ pxor %mm1,%mm3
+ psllq $7,%mm1
+ pxor %mm7,%mm3
+ psrlq $1,%mm7
+ pxor %mm1,%mm3
+ movq %mm5,%mm1
+ psrlq $13,%mm5
+ pxor %mm3,%mm7
+ psllq $3,%mm6
+ pxor %mm5,%mm1
+ paddq 200(%esp),%mm7
+ pxor %mm6,%mm1
+ psrlq $42,%mm5
+ paddq 128(%esp),%mm7
+ pxor %mm5,%mm1
+ psllq $42,%mm6
+ movq 40(%esp),%mm5
+ pxor %mm6,%mm1
+ movq 48(%esp),%mm6
+ paddq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm6,%mm2
+ addl $8,%ebp
+ movq 88(%esp),%mm5
+ movq %mm7,%mm1
+ psrlq $1,%mm7
+ movq %mm5,%mm6
+ psrlq $6,%mm5
+ psllq $56,%mm1
+ paddq %mm3,%mm2
+ movq %mm7,%mm3
+ psrlq $6,%mm7
+ pxor %mm1,%mm3
+ psllq $7,%mm1
+ pxor %mm7,%mm3
+ psrlq $1,%mm7
+ pxor %mm1,%mm3
+ movq %mm5,%mm1
+ psrlq $13,%mm5
+ pxor %mm3,%mm7
+ psllq $3,%mm6
+ pxor %mm5,%mm1
+ paddq 200(%esp),%mm7
+ pxor %mm6,%mm1
+ psrlq $42,%mm5
+ paddq 128(%esp),%mm7
+ pxor %mm5,%mm1
+ psllq $42,%mm6
+ movq 40(%esp),%mm5
+ pxor %mm6,%mm1
+ movq 48(%esp),%mm6
+ paddq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm6,%mm0
+ addl $8,%ebp
+ decl %edx
+ jnz .L00616_79_sse2
+ paddq %mm3,%mm0
+ movq 8(%esp),%mm1
+ movq 24(%esp),%mm3
+ movq 40(%esp),%mm5
+ movq 48(%esp),%mm6
+ movq 56(%esp),%mm7
+ pxor %mm1,%mm2
+ paddq (%esi),%mm0
+ paddq 8(%esi),%mm1
+ paddq 16(%esi),%mm2
+ paddq 24(%esi),%mm3
+ paddq 32(%esi),%mm4
+ paddq 40(%esi),%mm5
+ paddq 48(%esi),%mm6
+ paddq 56(%esi),%mm7
+ movl $640,%eax
+ movq %mm0,(%esi)
+ movq %mm1,8(%esi)
+ movq %mm2,16(%esi)
+ movq %mm3,24(%esi)
+ movq %mm4,32(%esi)
+ movq %mm5,40(%esi)
+ movq %mm6,48(%esi)
+ movq %mm7,56(%esi)
+ leal (%esp,%eax,1),%esp
+ subl %eax,%ebp
+ cmpl 88(%esp),%edi
+ jb .L004loop_sse2
+ movl 92(%esp),%esp
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 32
+.L003SSSE3:
+ leal -64(%esp),%edx
+ subl $256,%esp
+ movdqa 640(%ebp),%xmm1
+ movdqu (%edi),%xmm0
+.byte 102,15,56,0,193
+ movdqa (%ebp),%xmm3
+ movdqa %xmm1,%xmm2
+ movdqu 16(%edi),%xmm1
+ paddq %xmm0,%xmm3
+.byte 102,15,56,0,202
+ movdqa %xmm3,-128(%edx)
+ movdqa 16(%ebp),%xmm4
+ movdqa %xmm2,%xmm3
+ movdqu 32(%edi),%xmm2
+ paddq %xmm1,%xmm4
+.byte 102,15,56,0,211
+ movdqa %xmm4,-112(%edx)
+ movdqa 32(%ebp),%xmm5
+ movdqa %xmm3,%xmm4
+ movdqu 48(%edi),%xmm3
+ paddq %xmm2,%xmm5
+.byte 102,15,56,0,220
+ movdqa %xmm5,-96(%edx)
+ movdqa 48(%ebp),%xmm6
+ movdqa %xmm4,%xmm5
+ movdqu 64(%edi),%xmm4
+ paddq %xmm3,%xmm6
+.byte 102,15,56,0,229
+ movdqa %xmm6,-80(%edx)
+ movdqa 64(%ebp),%xmm7
+ movdqa %xmm5,%xmm6
+ movdqu 80(%edi),%xmm5
+ paddq %xmm4,%xmm7
+.byte 102,15,56,0,238
+ movdqa %xmm7,-64(%edx)
+ movdqa %xmm0,(%edx)
+ movdqa 80(%ebp),%xmm0
+ movdqa %xmm6,%xmm7
+ movdqu 96(%edi),%xmm6
+ paddq %xmm5,%xmm0
+.byte 102,15,56,0,247
+ movdqa %xmm0,-48(%edx)
+ movdqa %xmm1,16(%edx)
+ movdqa 96(%ebp),%xmm1
+ movdqa %xmm7,%xmm0
+ movdqu 112(%edi),%xmm7
+ paddq %xmm6,%xmm1
+.byte 102,15,56,0,248
+ movdqa %xmm1,-32(%edx)
+ movdqa %xmm2,32(%edx)
+ movdqa 112(%ebp),%xmm2
+ movdqa (%edx),%xmm0
+ paddq %xmm7,%xmm2
+ movdqa %xmm2,-16(%edx)
+ nop
+.align 32
+.L007loop_ssse3:
+ movdqa 16(%edx),%xmm2
+ movdqa %xmm3,48(%edx)
+ leal 128(%ebp),%ebp
+ movq %mm1,8(%esp)
+ movl %edi,%ebx
+ movq %mm2,16(%esp)
+ leal 128(%edi),%edi
+ movq %mm3,24(%esp)
+ cmpl %eax,%edi
+ movq %mm5,40(%esp)
+ cmovbl %edi,%ebx
+ movq %mm6,48(%esp)
+ movl $4,%ecx
+ pxor %mm1,%mm2
+ movq %mm7,56(%esp)
+ pxor %mm3,%mm3
+ jmp .L00800_47_ssse3
+.align 32
+.L00800_47_ssse3:
+ movdqa %xmm5,%xmm3
+ movdqa %xmm2,%xmm1
+.byte 102,15,58,15,208,8
+ movdqa %xmm4,(%edx)
+.byte 102,15,58,15,220,8
+ movdqa %xmm2,%xmm4
+ psrlq $7,%xmm2
+ paddq %xmm3,%xmm0
+ movdqa %xmm4,%xmm3
+ psrlq $1,%xmm4
+ psllq $56,%xmm3
+ pxor %xmm4,%xmm2
+ psrlq $7,%xmm4
+ pxor %xmm3,%xmm2
+ psllq $7,%xmm3
+ pxor %xmm4,%xmm2
+ movdqa %xmm7,%xmm4
+ pxor %xmm3,%xmm2
+ movdqa %xmm7,%xmm3
+ psrlq $6,%xmm4
+ paddq %xmm2,%xmm0
+ movdqa %xmm7,%xmm2
+ psrlq $19,%xmm3
+ psllq $3,%xmm2
+ pxor %xmm3,%xmm4
+ psrlq $42,%xmm3
+ pxor %xmm2,%xmm4
+ psllq $42,%xmm2
+ pxor %xmm3,%xmm4
+ movdqa 32(%edx),%xmm3
+ pxor %xmm2,%xmm4
+ movdqa (%ebp),%xmm2
+ movq %mm4,%mm1
+ paddq %xmm4,%xmm0
+ movq -128(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ paddq %xmm0,%xmm2
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -120(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm2,-128(%edx)
+ movdqa %xmm6,%xmm4
+ movdqa %xmm3,%xmm2
+.byte 102,15,58,15,217,8
+ movdqa %xmm5,16(%edx)
+.byte 102,15,58,15,229,8
+ movdqa %xmm3,%xmm5
+ psrlq $7,%xmm3
+ paddq %xmm4,%xmm1
+ movdqa %xmm5,%xmm4
+ psrlq $1,%xmm5
+ psllq $56,%xmm4
+ pxor %xmm5,%xmm3
+ psrlq $7,%xmm5
+ pxor %xmm4,%xmm3
+ psllq $7,%xmm4
+ pxor %xmm5,%xmm3
+ movdqa %xmm0,%xmm5
+ pxor %xmm4,%xmm3
+ movdqa %xmm0,%xmm4
+ psrlq $6,%xmm5
+ paddq %xmm3,%xmm1
+ movdqa %xmm0,%xmm3
+ psrlq $19,%xmm4
+ psllq $3,%xmm3
+ pxor %xmm4,%xmm5
+ psrlq $42,%xmm4
+ pxor %xmm3,%xmm5
+ psllq $42,%xmm3
+ pxor %xmm4,%xmm5
+ movdqa 48(%edx),%xmm4
+ pxor %xmm3,%xmm5
+ movdqa 16(%ebp),%xmm3
+ movq %mm4,%mm1
+ paddq %xmm5,%xmm1
+ movq -112(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ paddq %xmm1,%xmm3
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -104(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm3,-112(%edx)
+ movdqa %xmm7,%xmm5
+ movdqa %xmm4,%xmm3
+.byte 102,15,58,15,226,8
+ movdqa %xmm6,32(%edx)
+.byte 102,15,58,15,238,8
+ movdqa %xmm4,%xmm6
+ psrlq $7,%xmm4
+ paddq %xmm5,%xmm2
+ movdqa %xmm6,%xmm5
+ psrlq $1,%xmm6
+ psllq $56,%xmm5
+ pxor %xmm6,%xmm4
+ psrlq $7,%xmm6
+ pxor %xmm5,%xmm4
+ psllq $7,%xmm5
+ pxor %xmm6,%xmm4
+ movdqa %xmm1,%xmm6
+ pxor %xmm5,%xmm4
+ movdqa %xmm1,%xmm5
+ psrlq $6,%xmm6
+ paddq %xmm4,%xmm2
+ movdqa %xmm1,%xmm4
+ psrlq $19,%xmm5
+ psllq $3,%xmm4
+ pxor %xmm5,%xmm6
+ psrlq $42,%xmm5
+ pxor %xmm4,%xmm6
+ psllq $42,%xmm4
+ pxor %xmm5,%xmm6
+ movdqa (%edx),%xmm5
+ pxor %xmm4,%xmm6
+ movdqa 32(%ebp),%xmm4
+ movq %mm4,%mm1
+ paddq %xmm6,%xmm2
+ movq -96(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ paddq %xmm2,%xmm4
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -88(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm4,-96(%edx)
+ movdqa %xmm0,%xmm6
+ movdqa %xmm5,%xmm4
+.byte 102,15,58,15,235,8
+ movdqa %xmm7,48(%edx)
+.byte 102,15,58,15,247,8
+ movdqa %xmm5,%xmm7
+ psrlq $7,%xmm5
+ paddq %xmm6,%xmm3
+ movdqa %xmm7,%xmm6
+ psrlq $1,%xmm7
+ psllq $56,%xmm6
+ pxor %xmm7,%xmm5
+ psrlq $7,%xmm7
+ pxor %xmm6,%xmm5
+ psllq $7,%xmm6
+ pxor %xmm7,%xmm5
+ movdqa %xmm2,%xmm7
+ pxor %xmm6,%xmm5
+ movdqa %xmm2,%xmm6
+ psrlq $6,%xmm7
+ paddq %xmm5,%xmm3
+ movdqa %xmm2,%xmm5
+ psrlq $19,%xmm6
+ psllq $3,%xmm5
+ pxor %xmm6,%xmm7
+ psrlq $42,%xmm6
+ pxor %xmm5,%xmm7
+ psllq $42,%xmm5
+ pxor %xmm6,%xmm7
+ movdqa 16(%edx),%xmm6
+ pxor %xmm5,%xmm7
+ movdqa 48(%ebp),%xmm5
+ movq %mm4,%mm1
+ paddq %xmm7,%xmm3
+ movq -80(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ paddq %xmm3,%xmm5
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -72(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm5,-80(%edx)
+ movdqa %xmm1,%xmm7
+ movdqa %xmm6,%xmm5
+.byte 102,15,58,15,244,8
+ movdqa %xmm0,(%edx)
+.byte 102,15,58,15,248,8
+ movdqa %xmm6,%xmm0
+ psrlq $7,%xmm6
+ paddq %xmm7,%xmm4
+ movdqa %xmm0,%xmm7
+ psrlq $1,%xmm0
+ psllq $56,%xmm7
+ pxor %xmm0,%xmm6
+ psrlq $7,%xmm0
+ pxor %xmm7,%xmm6
+ psllq $7,%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm3,%xmm0
+ pxor %xmm7,%xmm6
+ movdqa %xmm3,%xmm7
+ psrlq $6,%xmm0
+ paddq %xmm6,%xmm4
+ movdqa %xmm3,%xmm6
+ psrlq $19,%xmm7
+ psllq $3,%xmm6
+ pxor %xmm7,%xmm0
+ psrlq $42,%xmm7
+ pxor %xmm6,%xmm0
+ psllq $42,%xmm6
+ pxor %xmm7,%xmm0
+ movdqa 32(%edx),%xmm7
+ pxor %xmm6,%xmm0
+ movdqa 64(%ebp),%xmm6
+ movq %mm4,%mm1
+ paddq %xmm0,%xmm4
+ movq -64(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ paddq %xmm4,%xmm6
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -56(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm6,-64(%edx)
+ movdqa %xmm2,%xmm0
+ movdqa %xmm7,%xmm6
+.byte 102,15,58,15,253,8
+ movdqa %xmm1,16(%edx)
+.byte 102,15,58,15,193,8
+ movdqa %xmm7,%xmm1
+ psrlq $7,%xmm7
+ paddq %xmm0,%xmm5
+ movdqa %xmm1,%xmm0
+ psrlq $1,%xmm1
+ psllq $56,%xmm0
+ pxor %xmm1,%xmm7
+ psrlq $7,%xmm1
+ pxor %xmm0,%xmm7
+ psllq $7,%xmm0
+ pxor %xmm1,%xmm7
+ movdqa %xmm4,%xmm1
+ pxor %xmm0,%xmm7
+ movdqa %xmm4,%xmm0
+ psrlq $6,%xmm1
+ paddq %xmm7,%xmm5
+ movdqa %xmm4,%xmm7
+ psrlq $19,%xmm0
+ psllq $3,%xmm7
+ pxor %xmm0,%xmm1
+ psrlq $42,%xmm0
+ pxor %xmm7,%xmm1
+ psllq $42,%xmm7
+ pxor %xmm0,%xmm1
+ movdqa 48(%edx),%xmm0
+ pxor %xmm7,%xmm1
+ movdqa 80(%ebp),%xmm7
+ movq %mm4,%mm1
+ paddq %xmm1,%xmm5
+ movq -48(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ paddq %xmm5,%xmm7
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -40(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm7,-48(%edx)
+ movdqa %xmm3,%xmm1
+ movdqa %xmm0,%xmm7
+.byte 102,15,58,15,198,8
+ movdqa %xmm2,32(%edx)
+.byte 102,15,58,15,202,8
+ movdqa %xmm0,%xmm2
+ psrlq $7,%xmm0
+ paddq %xmm1,%xmm6
+ movdqa %xmm2,%xmm1
+ psrlq $1,%xmm2
+ psllq $56,%xmm1
+ pxor %xmm2,%xmm0
+ psrlq $7,%xmm2
+ pxor %xmm1,%xmm0
+ psllq $7,%xmm1
+ pxor %xmm2,%xmm0
+ movdqa %xmm5,%xmm2
+ pxor %xmm1,%xmm0
+ movdqa %xmm5,%xmm1
+ psrlq $6,%xmm2
+ paddq %xmm0,%xmm6
+ movdqa %xmm5,%xmm0
+ psrlq $19,%xmm1
+ psllq $3,%xmm0
+ pxor %xmm1,%xmm2
+ psrlq $42,%xmm1
+ pxor %xmm0,%xmm2
+ psllq $42,%xmm0
+ pxor %xmm1,%xmm2
+ movdqa (%edx),%xmm1
+ pxor %xmm0,%xmm2
+ movdqa 96(%ebp),%xmm0
+ movq %mm4,%mm1
+ paddq %xmm2,%xmm6
+ movq -32(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ paddq %xmm6,%xmm0
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -24(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm0,-32(%edx)
+ movdqa %xmm4,%xmm2
+ movdqa %xmm1,%xmm0
+.byte 102,15,58,15,207,8
+ movdqa %xmm3,48(%edx)
+.byte 102,15,58,15,211,8
+ movdqa %xmm1,%xmm3
+ psrlq $7,%xmm1
+ paddq %xmm2,%xmm7
+ movdqa %xmm3,%xmm2
+ psrlq $1,%xmm3
+ psllq $56,%xmm2
+ pxor %xmm3,%xmm1
+ psrlq $7,%xmm3
+ pxor %xmm2,%xmm1
+ psllq $7,%xmm2
+ pxor %xmm3,%xmm1
+ movdqa %xmm6,%xmm3
+ pxor %xmm2,%xmm1
+ movdqa %xmm6,%xmm2
+ psrlq $6,%xmm3
+ paddq %xmm1,%xmm7
+ movdqa %xmm6,%xmm1
+ psrlq $19,%xmm2
+ psllq $3,%xmm1
+ pxor %xmm2,%xmm3
+ psrlq $42,%xmm2
+ pxor %xmm1,%xmm3
+ psllq $42,%xmm1
+ pxor %xmm2,%xmm3
+ movdqa 16(%edx),%xmm2
+ pxor %xmm1,%xmm3
+ movdqa 112(%ebp),%xmm1
+ movq %mm4,%mm1
+ paddq %xmm3,%xmm7
+ movq -16(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ paddq %xmm7,%xmm1
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -8(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm1,-16(%edx)
+ leal 128(%ebp),%ebp
+ decl %ecx
+ jnz .L00800_47_ssse3
+ movdqa (%ebp),%xmm1
+ leal -640(%ebp),%ebp
+ movdqu (%ebx),%xmm0
+.byte 102,15,56,0,193
+ movdqa (%ebp),%xmm3
+ movdqa %xmm1,%xmm2
+ movdqu 16(%ebx),%xmm1
+ paddq %xmm0,%xmm3
+.byte 102,15,56,0,202
+ movq %mm4,%mm1
+ movq -128(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -120(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm3,-128(%edx)
+ movdqa 16(%ebp),%xmm4
+ movdqa %xmm2,%xmm3
+ movdqu 32(%ebx),%xmm2
+ paddq %xmm1,%xmm4
+.byte 102,15,56,0,211
+ movq %mm4,%mm1
+ movq -112(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -104(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm4,-112(%edx)
+ movdqa 32(%ebp),%xmm5
+ movdqa %xmm3,%xmm4
+ movdqu 48(%ebx),%xmm3
+ paddq %xmm2,%xmm5
+.byte 102,15,56,0,220
+ movq %mm4,%mm1
+ movq -96(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -88(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm5,-96(%edx)
+ movdqa 48(%ebp),%xmm6
+ movdqa %xmm4,%xmm5
+ movdqu 64(%ebx),%xmm4
+ paddq %xmm3,%xmm6
+.byte 102,15,56,0,229
+ movq %mm4,%mm1
+ movq -80(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -72(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm6,-80(%edx)
+ movdqa 64(%ebp),%xmm7
+ movdqa %xmm5,%xmm6
+ movdqu 80(%ebx),%xmm5
+ paddq %xmm4,%xmm7
+.byte 102,15,56,0,238
+ movq %mm4,%mm1
+ movq -64(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -56(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm7,-64(%edx)
+ movdqa %xmm0,(%edx)
+ movdqa 80(%ebp),%xmm0
+ movdqa %xmm6,%xmm7
+ movdqu 96(%ebx),%xmm6
+ paddq %xmm5,%xmm0
+.byte 102,15,56,0,247
+ movq %mm4,%mm1
+ movq -48(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -40(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm0,-48(%edx)
+ movdqa %xmm1,16(%edx)
+ movdqa 96(%ebp),%xmm1
+ movdqa %xmm7,%xmm0
+ movdqu 112(%ebx),%xmm7
+ paddq %xmm6,%xmm1
+.byte 102,15,56,0,248
+ movq %mm4,%mm1
+ movq -32(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -24(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm1,-32(%edx)
+ movdqa %xmm2,32(%edx)
+ movdqa 112(%ebp),%xmm2
+ movdqa (%edx),%xmm0
+ paddq %xmm7,%xmm2
+ movq %mm4,%mm1
+ movq -16(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -8(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm2,-16(%edx)
+ movq 8(%esp),%mm1
+ paddq %mm3,%mm0
+ movq 24(%esp),%mm3
+ movq 56(%esp),%mm7
+ pxor %mm1,%mm2
+ paddq (%esi),%mm0
+ paddq 8(%esi),%mm1
+ paddq 16(%esi),%mm2
+ paddq 24(%esi),%mm3
+ paddq 32(%esi),%mm4
+ paddq 40(%esi),%mm5
+ paddq 48(%esi),%mm6
+ paddq 56(%esi),%mm7
+ movq %mm0,(%esi)
+ movq %mm1,8(%esi)
+ movq %mm2,16(%esi)
+ movq %mm3,24(%esi)
+ movq %mm4,32(%esi)
+ movq %mm5,40(%esi)
+ movq %mm6,48(%esi)
+ movq %mm7,56(%esi)
+ cmpl %eax,%edi
+ jb .L007loop_ssse3
+ movl 76(%edx),%esp
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 16
+.L002loop_x86:
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 16(%edi),%eax
+ movl 20(%edi),%ebx
+ movl 24(%edi),%ecx
+ movl 28(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 32(%edi),%eax
+ movl 36(%edi),%ebx
+ movl 40(%edi),%ecx
+ movl 44(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 48(%edi),%eax
+ movl 52(%edi),%ebx
+ movl 56(%edi),%ecx
+ movl 60(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 64(%edi),%eax
+ movl 68(%edi),%ebx
+ movl 72(%edi),%ecx
+ movl 76(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 80(%edi),%eax
+ movl 84(%edi),%ebx
+ movl 88(%edi),%ecx
+ movl 92(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 96(%edi),%eax
+ movl 100(%edi),%ebx
+ movl 104(%edi),%ecx
+ movl 108(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 112(%edi),%eax
+ movl 116(%edi),%ebx
+ movl 120(%edi),%ecx
+ movl 124(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ addl $128,%edi
+ subl $72,%esp
+ movl %edi,204(%esp)
+ leal 8(%esp),%edi
+ movl $16,%ecx
+.long 2784229001
+.align 16
+.L00900_15_x86:
+ movl 40(%esp),%ecx
+ movl 44(%esp),%edx
+ movl %ecx,%esi
+ shrl $9,%ecx
+ movl %edx,%edi
+ shrl $9,%edx
+ movl %ecx,%ebx
+ shll $14,%esi
+ movl %edx,%eax
+ shll $14,%edi
+ xorl %esi,%ebx
+ shrl $5,%ecx
+ xorl %edi,%eax
+ shrl $5,%edx
+ xorl %ecx,%eax
+ shll $4,%esi
+ xorl %edx,%ebx
+ shll $4,%edi
+ xorl %esi,%ebx
+ shrl $4,%ecx
+ xorl %edi,%eax
+ shrl $4,%edx
+ xorl %ecx,%eax
+ shll $5,%esi
+ xorl %edx,%ebx
+ shll $5,%edi
+ xorl %esi,%eax
+ xorl %edi,%ebx
+ movl 48(%esp),%ecx
+ movl 52(%esp),%edx
+ movl 56(%esp),%esi
+ movl 60(%esp),%edi
+ addl 64(%esp),%eax
+ adcl 68(%esp),%ebx
+ xorl %esi,%ecx
+ xorl %edi,%edx
+ andl 40(%esp),%ecx
+ andl 44(%esp),%edx
+ addl 192(%esp),%eax
+ adcl 196(%esp),%ebx
+ xorl %esi,%ecx
+ xorl %edi,%edx
+ movl (%ebp),%esi
+ movl 4(%ebp),%edi
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl 32(%esp),%ecx
+ movl 36(%esp),%edx
+ addl %esi,%eax
+ adcl %edi,%ebx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,%esi
+ shrl $2,%ecx
+ movl %edx,%edi
+ shrl $2,%edx
+ movl %ecx,%ebx
+ shll $4,%esi
+ movl %edx,%eax
+ shll $4,%edi
+ xorl %esi,%ebx
+ shrl $5,%ecx
+ xorl %edi,%eax
+ shrl $5,%edx
+ xorl %ecx,%ebx
+ shll $21,%esi
+ xorl %edx,%eax
+ shll $21,%edi
+ xorl %esi,%eax
+ shrl $21,%ecx
+ xorl %edi,%ebx
+ shrl $21,%edx
+ xorl %ecx,%eax
+ shll $5,%esi
+ xorl %edx,%ebx
+ shll $5,%edi
+ xorl %esi,%eax
+ xorl %edi,%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ movl 16(%esp),%esi
+ movl 20(%esp),%edi
+ addl (%esp),%eax
+ adcl 4(%esp),%ebx
+ orl %esi,%ecx
+ orl %edi,%edx
+ andl 24(%esp),%ecx
+ andl 28(%esp),%edx
+ andl 8(%esp),%esi
+ andl 12(%esp),%edi
+ orl %esi,%ecx
+ orl %edi,%edx
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ movb (%ebp),%dl
+ subl $8,%esp
+ leal 8(%ebp),%ebp
+ cmpb $148,%dl
+ jne .L00900_15_x86
+.align 16
+.L01016_79_x86:
+ movl 312(%esp),%ecx
+ movl 316(%esp),%edx
+ movl %ecx,%esi
+ shrl $1,%ecx
+ movl %edx,%edi
+ shrl $1,%edx
+ movl %ecx,%eax
+ shll $24,%esi
+ movl %edx,%ebx
+ shll $24,%edi
+ xorl %esi,%ebx
+ shrl $6,%ecx
+ xorl %edi,%eax
+ shrl $6,%edx
+ xorl %ecx,%eax
+ shll $7,%esi
+ xorl %edx,%ebx
+ shll $1,%edi
+ xorl %esi,%ebx
+ shrl $1,%ecx
+ xorl %edi,%eax
+ shrl $1,%edx
+ xorl %ecx,%eax
+ shll $6,%edi
+ xorl %edx,%ebx
+ xorl %edi,%eax
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ movl 208(%esp),%ecx
+ movl 212(%esp),%edx
+ movl %ecx,%esi
+ shrl $6,%ecx
+ movl %edx,%edi
+ shrl $6,%edx
+ movl %ecx,%eax
+ shll $3,%esi
+ movl %edx,%ebx
+ shll $3,%edi
+ xorl %esi,%eax
+ shrl $13,%ecx
+ xorl %edi,%ebx
+ shrl $13,%edx
+ xorl %ecx,%eax
+ shll $10,%esi
+ xorl %edx,%ebx
+ shll $10,%edi
+ xorl %esi,%ebx
+ shrl $10,%ecx
+ xorl %edi,%eax
+ shrl $10,%edx
+ xorl %ecx,%ebx
+ shll $13,%edi
+ xorl %edx,%eax
+ xorl %edi,%eax
+ movl 320(%esp),%ecx
+ movl 324(%esp),%edx
+ addl (%esp),%eax
+ adcl 4(%esp),%ebx
+ movl 248(%esp),%esi
+ movl 252(%esp),%edi
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ addl %esi,%eax
+ adcl %edi,%ebx
+ movl %eax,192(%esp)
+ movl %ebx,196(%esp)
+ movl 40(%esp),%ecx
+ movl 44(%esp),%edx
+ movl %ecx,%esi
+ shrl $9,%ecx
+ movl %edx,%edi
+ shrl $9,%edx
+ movl %ecx,%ebx
+ shll $14,%esi
+ movl %edx,%eax
+ shll $14,%edi
+ xorl %esi,%ebx
+ shrl $5,%ecx
+ xorl %edi,%eax
+ shrl $5,%edx
+ xorl %ecx,%eax
+ shll $4,%esi
+ xorl %edx,%ebx
+ shll $4,%edi
+ xorl %esi,%ebx
+ shrl $4,%ecx
+ xorl %edi,%eax
+ shrl $4,%edx
+ xorl %ecx,%eax
+ shll $5,%esi
+ xorl %edx,%ebx
+ shll $5,%edi
+ xorl %esi,%eax
+ xorl %edi,%ebx
+ movl 48(%esp),%ecx
+ movl 52(%esp),%edx
+ movl 56(%esp),%esi
+ movl 60(%esp),%edi
+ addl 64(%esp),%eax
+ adcl 68(%esp),%ebx
+ xorl %esi,%ecx
+ xorl %edi,%edx
+ andl 40(%esp),%ecx
+ andl 44(%esp),%edx
+ addl 192(%esp),%eax
+ adcl 196(%esp),%ebx
+ xorl %esi,%ecx
+ xorl %edi,%edx
+ movl (%ebp),%esi
+ movl 4(%ebp),%edi
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl 32(%esp),%ecx
+ movl 36(%esp),%edx
+ addl %esi,%eax
+ adcl %edi,%ebx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,%esi
+ shrl $2,%ecx
+ movl %edx,%edi
+ shrl $2,%edx
+ movl %ecx,%ebx
+ shll $4,%esi
+ movl %edx,%eax
+ shll $4,%edi
+ xorl %esi,%ebx
+ shrl $5,%ecx
+ xorl %edi,%eax
+ shrl $5,%edx
+ xorl %ecx,%ebx
+ shll $21,%esi
+ xorl %edx,%eax
+ shll $21,%edi
+ xorl %esi,%eax
+ shrl $21,%ecx
+ xorl %edi,%ebx
+ shrl $21,%edx
+ xorl %ecx,%eax
+ shll $5,%esi
+ xorl %edx,%ebx
+ shll $5,%edi
+ xorl %esi,%eax
+ xorl %edi,%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ movl 16(%esp),%esi
+ movl 20(%esp),%edi
+ addl (%esp),%eax
+ adcl 4(%esp),%ebx
+ orl %esi,%ecx
+ orl %edi,%edx
+ andl 24(%esp),%ecx
+ andl 28(%esp),%edx
+ andl 8(%esp),%esi
+ andl 12(%esp),%edi
+ orl %esi,%ecx
+ orl %edi,%edx
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ movb (%ebp),%dl
+ subl $8,%esp
+ leal 8(%ebp),%ebp
+ cmpb $23,%dl
+ jne .L01016_79_x86
+ movl 840(%esp),%esi
+ movl 844(%esp),%edi
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ addl 8(%esp),%eax
+ adcl 12(%esp),%ebx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ addl 16(%esp),%ecx
+ adcl 20(%esp),%edx
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ movl 16(%esi),%eax
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edx
+ addl 24(%esp),%eax
+ adcl 28(%esp),%ebx
+ movl %eax,16(%esi)
+ movl %ebx,20(%esi)
+ addl 32(%esp),%ecx
+ adcl 36(%esp),%edx
+ movl %ecx,24(%esi)
+ movl %edx,28(%esi)
+ movl 32(%esi),%eax
+ movl 36(%esi),%ebx
+ movl 40(%esi),%ecx
+ movl 44(%esi),%edx
+ addl 40(%esp),%eax
+ adcl 44(%esp),%ebx
+ movl %eax,32(%esi)
+ movl %ebx,36(%esi)
+ addl 48(%esp),%ecx
+ adcl 52(%esp),%edx
+ movl %ecx,40(%esi)
+ movl %edx,44(%esi)
+ movl 48(%esi),%eax
+ movl 52(%esi),%ebx
+ movl 56(%esi),%ecx
+ movl 60(%esi),%edx
+ addl 56(%esp),%eax
+ adcl 60(%esp),%ebx
+ movl %eax,48(%esi)
+ movl %ebx,52(%esi)
+ addl 64(%esp),%ecx
+ adcl 68(%esp),%edx
+ movl %ecx,56(%esi)
+ movl %edx,60(%esi)
+ addl $840,%esp
+ subl $640,%ebp
+ cmpl 8(%esp),%edi
+ jb .L002loop_x86
+ movl 12(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 64
+.L001K512:
+.long 3609767458,1116352408
+.long 602891725,1899447441
+.long 3964484399,3049323471
+.long 2173295548,3921009573
+.long 4081628472,961987163
+.long 3053834265,1508970993
+.long 2937671579,2453635748
+.long 3664609560,2870763221
+.long 2734883394,3624381080
+.long 1164996542,310598401
+.long 1323610764,607225278
+.long 3590304994,1426881987
+.long 4068182383,1925078388
+.long 991336113,2162078206
+.long 633803317,2614888103
+.long 3479774868,3248222580
+.long 2666613458,3835390401
+.long 944711139,4022224774
+.long 2341262773,264347078
+.long 2007800933,604807628
+.long 1495990901,770255983
+.long 1856431235,1249150122
+.long 3175218132,1555081692
+.long 2198950837,1996064986
+.long 3999719339,2554220882
+.long 766784016,2821834349
+.long 2566594879,2952996808
+.long 3203337956,3210313671
+.long 1034457026,3336571891
+.long 2466948901,3584528711
+.long 3758326383,113926993
+.long 168717936,338241895
+.long 1188179964,666307205
+.long 1546045734,773529912
+.long 1522805485,1294757372
+.long 2643833823,1396182291
+.long 2343527390,1695183700
+.long 1014477480,1986661051
+.long 1206759142,2177026350
+.long 344077627,2456956037
+.long 1290863460,2730485921
+.long 3158454273,2820302411
+.long 3505952657,3259730800
+.long 106217008,3345764771
+.long 3606008344,3516065817
+.long 1432725776,3600352804
+.long 1467031594,4094571909
+.long 851169720,275423344
+.long 3100823752,430227734
+.long 1363258195,506948616
+.long 3750685593,659060556
+.long 3785050280,883997877
+.long 3318307427,958139571
+.long 3812723403,1322822218
+.long 2003034995,1537002063
+.long 3602036899,1747873779
+.long 1575990012,1955562222
+.long 1125592928,2024104815
+.long 2716904306,2227730452
+.long 442776044,2361852424
+.long 593698344,2428436474
+.long 3733110249,2756734187
+.long 2999351573,3204031479
+.long 3815920427,3329325298
+.long 3928383900,3391569614
+.long 566280711,3515267271
+.long 3454069534,3940187606
+.long 4000239992,4118630271
+.long 1914138554,116418474
+.long 2731055270,174292421
+.long 3203993006,289380356
+.long 320620315,460393269
+.long 587496836,685471733
+.long 1086792851,852142971
+.long 365543100,1017036298
+.long 2618297676,1126000580
+.long 3409855158,1288033470
+.long 4234509866,1501505948
+.long 987167468,1607167915
+.long 1246189591,1816402316
+.long 67438087,66051
+.long 202182159,134810123
+.size sha512_block_data_order,.-.L_sha512_block_data_order_begin
+.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
+.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte 62,0
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/sha512-586-win.asm b/gen/bcm/sha512-586-win.asm
new file mode 100644
index 0000000..3603a6d
--- /dev/null
+++ b/gen/bcm/sha512-586-win.asm
@@ -0,0 +1,2846 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section code use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section .text code align=64
+%else
+section .text code
+%endif
+;extern _OPENSSL_ia32cap_P
+global _sha512_block_data_order
+align 16
+_sha512_block_data_order:
+L$_sha512_block_data_order_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov esi,DWORD [20+esp]
+ mov edi,DWORD [24+esp]
+ mov eax,DWORD [28+esp]
+ mov ebx,esp
+ call L$000pic_point
+L$000pic_point:
+ pop ebp
+ lea ebp,[(L$001K512-L$000pic_point)+ebp]
+ sub esp,16
+ and esp,-64
+ shl eax,7
+ add eax,edi
+ mov DWORD [esp],esi
+ mov DWORD [4+esp],edi
+ mov DWORD [8+esp],eax
+ mov DWORD [12+esp],ebx
+ lea edx,[_OPENSSL_ia32cap_P]
+ mov ecx,DWORD [edx]
+ test ecx,67108864
+ jz NEAR L$002loop_x86
+ mov edx,DWORD [4+edx]
+ movq mm0,[esi]
+ and ecx,16777216
+ movq mm1,[8+esi]
+ and edx,512
+ movq mm2,[16+esi]
+ or ecx,edx
+ movq mm3,[24+esi]
+ movq mm4,[32+esi]
+ movq mm5,[40+esi]
+ movq mm6,[48+esi]
+ movq mm7,[56+esi]
+ cmp ecx,16777728
+ je NEAR L$003SSSE3
+ sub esp,80
+ jmp NEAR L$004loop_sse2
+align 16
+L$004loop_sse2:
+ movq [8+esp],mm1
+ movq [16+esp],mm2
+ movq [24+esp],mm3
+ movq [40+esp],mm5
+ movq [48+esp],mm6
+ pxor mm2,mm1
+ movq [56+esp],mm7
+ movq mm3,mm0
+ mov eax,DWORD [edi]
+ mov ebx,DWORD [4+edi]
+ add edi,8
+ mov edx,15
+ bswap eax
+ bswap ebx
+ jmp NEAR L$00500_14_sse2
+align 16
+L$00500_14_sse2:
+ movd mm1,eax
+ mov eax,DWORD [edi]
+ movd mm7,ebx
+ mov ebx,DWORD [4+edi]
+ add edi,8
+ bswap eax
+ bswap ebx
+ punpckldq mm7,mm1
+ movq mm1,mm4
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [32+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ movq mm0,mm3
+ movq [72+esp],mm7
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [esp],mm0
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[56+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ paddq mm7,[ebp]
+ pxor mm3,mm4
+ movq mm4,[24+esp]
+ paddq mm3,mm7
+ movq mm5,mm0
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm0
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[8+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ sub esp,8
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm0,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm2,mm0
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm2,mm1
+ pxor mm6,mm7
+ movq mm5,[40+esp]
+ paddq mm3,mm2
+ movq mm2,mm0
+ add ebp,8
+ paddq mm3,mm6
+ movq mm6,[48+esp]
+ dec edx
+ jnz NEAR L$00500_14_sse2
+ movd mm1,eax
+ movd mm7,ebx
+ punpckldq mm7,mm1
+ movq mm1,mm4
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [32+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ movq mm0,mm3
+ movq [72+esp],mm7
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [esp],mm0
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[56+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ paddq mm7,[ebp]
+ pxor mm3,mm4
+ movq mm4,[24+esp]
+ paddq mm3,mm7
+ movq mm5,mm0
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm0
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[8+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ sub esp,8
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm0,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm2,mm0
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm2,mm1
+ pxor mm6,mm7
+ movq mm7,[192+esp]
+ paddq mm3,mm2
+ movq mm2,mm0
+ add ebp,8
+ paddq mm3,mm6
+ pxor mm0,mm0
+ mov edx,32
+ jmp NEAR L$00616_79_sse2
+align 16
+L$00616_79_sse2:
+ movq mm5,[88+esp]
+ movq mm1,mm7
+ psrlq mm7,1
+ movq mm6,mm5
+ psrlq mm5,6
+ psllq mm1,56
+ paddq mm0,mm3
+ movq mm3,mm7
+ psrlq mm7,6
+ pxor mm3,mm1
+ psllq mm1,7
+ pxor mm3,mm7
+ psrlq mm7,1
+ pxor mm3,mm1
+ movq mm1,mm5
+ psrlq mm5,13
+ pxor mm7,mm3
+ psllq mm6,3
+ pxor mm1,mm5
+ paddq mm7,[200+esp]
+ pxor mm1,mm6
+ psrlq mm5,42
+ paddq mm7,[128+esp]
+ pxor mm1,mm5
+ psllq mm6,42
+ movq mm5,[40+esp]
+ pxor mm1,mm6
+ movq mm6,[48+esp]
+ paddq mm7,mm1
+ movq mm1,mm4
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [32+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ movq [72+esp],mm7
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [esp],mm0
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[56+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ paddq mm7,[ebp]
+ pxor mm3,mm4
+ movq mm4,[24+esp]
+ paddq mm3,mm7
+ movq mm5,mm0
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm0
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[8+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ sub esp,8
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm0,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm2,mm0
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm2,mm1
+ pxor mm6,mm7
+ movq mm7,[192+esp]
+ paddq mm2,mm6
+ add ebp,8
+ movq mm5,[88+esp]
+ movq mm1,mm7
+ psrlq mm7,1
+ movq mm6,mm5
+ psrlq mm5,6
+ psllq mm1,56
+ paddq mm2,mm3
+ movq mm3,mm7
+ psrlq mm7,6
+ pxor mm3,mm1
+ psllq mm1,7
+ pxor mm3,mm7
+ psrlq mm7,1
+ pxor mm3,mm1
+ movq mm1,mm5
+ psrlq mm5,13
+ pxor mm7,mm3
+ psllq mm6,3
+ pxor mm1,mm5
+ paddq mm7,[200+esp]
+ pxor mm1,mm6
+ psrlq mm5,42
+ paddq mm7,[128+esp]
+ pxor mm1,mm5
+ psllq mm6,42
+ movq mm5,[40+esp]
+ pxor mm1,mm6
+ movq mm6,[48+esp]
+ paddq mm7,mm1
+ movq mm1,mm4
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [32+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ movq [72+esp],mm7
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [esp],mm2
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[56+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ paddq mm7,[ebp]
+ pxor mm3,mm4
+ movq mm4,[24+esp]
+ paddq mm3,mm7
+ movq mm5,mm2
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm2
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[8+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ sub esp,8
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm2,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm0,mm2
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm0,mm1
+ pxor mm6,mm7
+ movq mm7,[192+esp]
+ paddq mm0,mm6
+ add ebp,8
+ dec edx
+ jnz NEAR L$00616_79_sse2
+ paddq mm0,mm3
+ movq mm1,[8+esp]
+ movq mm3,[24+esp]
+ movq mm5,[40+esp]
+ movq mm6,[48+esp]
+ movq mm7,[56+esp]
+ pxor mm2,mm1
+ paddq mm0,[esi]
+ paddq mm1,[8+esi]
+ paddq mm2,[16+esi]
+ paddq mm3,[24+esi]
+ paddq mm4,[32+esi]
+ paddq mm5,[40+esi]
+ paddq mm6,[48+esi]
+ paddq mm7,[56+esi]
+ mov eax,640
+ movq [esi],mm0
+ movq [8+esi],mm1
+ movq [16+esi],mm2
+ movq [24+esi],mm3
+ movq [32+esi],mm4
+ movq [40+esi],mm5
+ movq [48+esi],mm6
+ movq [56+esi],mm7
+ lea esp,[eax*1+esp]
+ sub ebp,eax
+ cmp edi,DWORD [88+esp]
+ jb NEAR L$004loop_sse2
+ mov esp,DWORD [92+esp]
+ emms
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+align 32
+L$003SSSE3:
+ lea edx,[esp-64]
+ sub esp,256
+ movdqa xmm1,[640+ebp]
+ movdqu xmm0,[edi]
+db 102,15,56,0,193
+ movdqa xmm3,[ebp]
+ movdqa xmm2,xmm1
+ movdqu xmm1,[16+edi]
+ paddq xmm3,xmm0
+db 102,15,56,0,202
+ movdqa [edx-128],xmm3
+ movdqa xmm4,[16+ebp]
+ movdqa xmm3,xmm2
+ movdqu xmm2,[32+edi]
+ paddq xmm4,xmm1
+db 102,15,56,0,211
+ movdqa [edx-112],xmm4
+ movdqa xmm5,[32+ebp]
+ movdqa xmm4,xmm3
+ movdqu xmm3,[48+edi]
+ paddq xmm5,xmm2
+db 102,15,56,0,220
+ movdqa [edx-96],xmm5
+ movdqa xmm6,[48+ebp]
+ movdqa xmm5,xmm4
+ movdqu xmm4,[64+edi]
+ paddq xmm6,xmm3
+db 102,15,56,0,229
+ movdqa [edx-80],xmm6
+ movdqa xmm7,[64+ebp]
+ movdqa xmm6,xmm5
+ movdqu xmm5,[80+edi]
+ paddq xmm7,xmm4
+db 102,15,56,0,238
+ movdqa [edx-64],xmm7
+ movdqa [edx],xmm0
+ movdqa xmm0,[80+ebp]
+ movdqa xmm7,xmm6
+ movdqu xmm6,[96+edi]
+ paddq xmm0,xmm5
+db 102,15,56,0,247
+ movdqa [edx-48],xmm0
+ movdqa [16+edx],xmm1
+ movdqa xmm1,[96+ebp]
+ movdqa xmm0,xmm7
+ movdqu xmm7,[112+edi]
+ paddq xmm1,xmm6
+db 102,15,56,0,248
+ movdqa [edx-32],xmm1
+ movdqa [32+edx],xmm2
+ movdqa xmm2,[112+ebp]
+ movdqa xmm0,[edx]
+ paddq xmm2,xmm7
+ movdqa [edx-16],xmm2
+ nop
+align 32
+L$007loop_ssse3:
+ movdqa xmm2,[16+edx]
+ movdqa [48+edx],xmm3
+ lea ebp,[128+ebp]
+ movq [8+esp],mm1
+ mov ebx,edi
+ movq [16+esp],mm2
+ lea edi,[128+edi]
+ movq [24+esp],mm3
+ cmp edi,eax
+ movq [40+esp],mm5
+ cmovb ebx,edi
+ movq [48+esp],mm6
+ mov ecx,4
+ pxor mm2,mm1
+ movq [56+esp],mm7
+ pxor mm3,mm3
+ jmp NEAR L$00800_47_ssse3
+align 32
+L$00800_47_ssse3:
+ movdqa xmm3,xmm5
+ movdqa xmm1,xmm2
+db 102,15,58,15,208,8
+ movdqa [edx],xmm4
+db 102,15,58,15,220,8
+ movdqa xmm4,xmm2
+ psrlq xmm2,7
+ paddq xmm0,xmm3
+ movdqa xmm3,xmm4
+ psrlq xmm4,1
+ psllq xmm3,56
+ pxor xmm2,xmm4
+ psrlq xmm4,7
+ pxor xmm2,xmm3
+ psllq xmm3,7
+ pxor xmm2,xmm4
+ movdqa xmm4,xmm7
+ pxor xmm2,xmm3
+ movdqa xmm3,xmm7
+ psrlq xmm4,6
+ paddq xmm0,xmm2
+ movdqa xmm2,xmm7
+ psrlq xmm3,19
+ psllq xmm2,3
+ pxor xmm4,xmm3
+ psrlq xmm3,42
+ pxor xmm4,xmm2
+ psllq xmm2,42
+ pxor xmm4,xmm3
+ movdqa xmm3,[32+edx]
+ pxor xmm4,xmm2
+ movdqa xmm2,[ebp]
+ movq mm1,mm4
+ paddq xmm0,xmm4
+ movq mm7,[edx-128]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [32+esp],mm4
+ paddq xmm2,xmm0
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm0,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [esp],mm0
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[56+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[24+esp]
+ paddq mm3,mm7
+ movq mm5,mm0
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm0
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[8+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm0,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm2,mm0
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm2,mm1
+ pxor mm6,mm7
+ movq mm5,[32+esp]
+ paddq mm2,mm6
+ movq mm6,[40+esp]
+ movq mm1,mm4
+ movq mm7,[edx-120]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [24+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm2,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [56+esp],mm2
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[48+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[16+esp]
+ paddq mm3,mm7
+ movq mm5,mm2
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm2
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm2,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm0,mm2
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm0,mm1
+ pxor mm6,mm7
+ movq mm5,[24+esp]
+ paddq mm0,mm6
+ movq mm6,[32+esp]
+ movdqa [edx-128],xmm2
+ movdqa xmm4,xmm6
+ movdqa xmm2,xmm3
+db 102,15,58,15,217,8
+ movdqa [16+edx],xmm5
+db 102,15,58,15,229,8
+ movdqa xmm5,xmm3
+ psrlq xmm3,7
+ paddq xmm1,xmm4
+ movdqa xmm4,xmm5
+ psrlq xmm5,1
+ psllq xmm4,56
+ pxor xmm3,xmm5
+ psrlq xmm5,7
+ pxor xmm3,xmm4
+ psllq xmm4,7
+ pxor xmm3,xmm5
+ movdqa xmm5,xmm0
+ pxor xmm3,xmm4
+ movdqa xmm4,xmm0
+ psrlq xmm5,6
+ paddq xmm1,xmm3
+ movdqa xmm3,xmm0
+ psrlq xmm4,19
+ psllq xmm3,3
+ pxor xmm5,xmm4
+ psrlq xmm4,42
+ pxor xmm5,xmm3
+ psllq xmm3,42
+ pxor xmm5,xmm4
+ movdqa xmm4,[48+edx]
+ pxor xmm5,xmm3
+ movdqa xmm3,[16+ebp]
+ movq mm1,mm4
+ paddq xmm1,xmm5
+ movq mm7,[edx-112]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [16+esp],mm4
+ paddq xmm3,xmm1
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm0,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [48+esp],mm0
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[40+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[8+esp]
+ paddq mm3,mm7
+ movq mm5,mm0
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm0
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[56+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm0,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm2,mm0
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm2,mm1
+ pxor mm6,mm7
+ movq mm5,[16+esp]
+ paddq mm2,mm6
+ movq mm6,[24+esp]
+ movq mm1,mm4
+ movq mm7,[edx-104]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [8+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm2,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [40+esp],mm2
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[32+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[esp]
+ paddq mm3,mm7
+ movq mm5,mm2
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm2
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[48+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm2,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm0,mm2
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm0,mm1
+ pxor mm6,mm7
+ movq mm5,[8+esp]
+ paddq mm0,mm6
+ movq mm6,[16+esp]
+ movdqa [edx-112],xmm3
+ movdqa xmm5,xmm7
+ movdqa xmm3,xmm4
+db 102,15,58,15,226,8
+ movdqa [32+edx],xmm6
+db 102,15,58,15,238,8
+ movdqa xmm6,xmm4
+ psrlq xmm4,7
+ paddq xmm2,xmm5
+ movdqa xmm5,xmm6
+ psrlq xmm6,1
+ psllq xmm5,56
+ pxor xmm4,xmm6
+ psrlq xmm6,7
+ pxor xmm4,xmm5
+ psllq xmm5,7
+ pxor xmm4,xmm6
+ movdqa xmm6,xmm1
+ pxor xmm4,xmm5
+ movdqa xmm5,xmm1
+ psrlq xmm6,6
+ paddq xmm2,xmm4
+ movdqa xmm4,xmm1
+ psrlq xmm5,19
+ psllq xmm4,3
+ pxor xmm6,xmm5
+ psrlq xmm5,42
+ pxor xmm6,xmm4
+ psllq xmm4,42
+ pxor xmm6,xmm5
+ movdqa xmm5,[edx]
+ pxor xmm6,xmm4
+ movdqa xmm4,[32+ebp]
+ movq mm1,mm4
+ paddq xmm2,xmm6
+ movq mm7,[edx-96]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [esp],mm4
+ paddq xmm4,xmm2
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm0,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [32+esp],mm0
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[24+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[56+esp]
+ paddq mm3,mm7
+ movq mm5,mm0
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm0
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[40+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm0,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm2,mm0
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm2,mm1
+ pxor mm6,mm7
+ movq mm5,[esp]
+ paddq mm2,mm6
+ movq mm6,[8+esp]
+ movq mm1,mm4
+ movq mm7,[edx-88]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [56+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm2,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [24+esp],mm2
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[16+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[48+esp]
+ paddq mm3,mm7
+ movq mm5,mm2
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm2
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[32+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm2,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm0,mm2
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm0,mm1
+ pxor mm6,mm7
+ movq mm5,[56+esp]
+ paddq mm0,mm6
+ movq mm6,[esp]
+ movdqa [edx-96],xmm4
+ movdqa xmm6,xmm0
+ movdqa xmm4,xmm5
+db 102,15,58,15,235,8
+ movdqa [48+edx],xmm7
+db 102,15,58,15,247,8
+ movdqa xmm7,xmm5
+ psrlq xmm5,7
+ paddq xmm3,xmm6
+ movdqa xmm6,xmm7
+ psrlq xmm7,1
+ psllq xmm6,56
+ pxor xmm5,xmm7
+ psrlq xmm7,7
+ pxor xmm5,xmm6
+ psllq xmm6,7
+ pxor xmm5,xmm7
+ movdqa xmm7,xmm2
+ pxor xmm5,xmm6
+ movdqa xmm6,xmm2
+ psrlq xmm7,6
+ paddq xmm3,xmm5
+ movdqa xmm5,xmm2
+ psrlq xmm6,19
+ psllq xmm5,3
+ pxor xmm7,xmm6
+ psrlq xmm6,42
+ pxor xmm7,xmm5
+ psllq xmm5,42
+ pxor xmm7,xmm6
+ movdqa xmm6,[16+edx]
+ pxor xmm7,xmm5
+ movdqa xmm5,[48+ebp]
+ movq mm1,mm4
+ paddq xmm3,xmm7
+ movq mm7,[edx-80]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [48+esp],mm4
+ paddq xmm5,xmm3
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm0,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [16+esp],mm0
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[8+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[40+esp]
+ paddq mm3,mm7
+ movq mm5,mm0
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm0
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[24+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm0,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm2,mm0
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm2,mm1
+ pxor mm6,mm7
+ movq mm5,[48+esp]
+ paddq mm2,mm6
+ movq mm6,[56+esp]
+ movq mm1,mm4
+ movq mm7,[edx-72]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [40+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm2,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [8+esp],mm2
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[32+esp]
+ paddq mm3,mm7
+ movq mm5,mm2
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm2
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[16+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm2,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm0,mm2
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm0,mm1
+ pxor mm6,mm7
+ movq mm5,[40+esp]
+ paddq mm0,mm6
+ movq mm6,[48+esp]
+ movdqa [edx-80],xmm5
+ movdqa xmm7,xmm1
+ movdqa xmm5,xmm6
+db 102,15,58,15,244,8
+ movdqa [edx],xmm0
+db 102,15,58,15,248,8
+ movdqa xmm0,xmm6
+ psrlq xmm6,7
+ paddq xmm4,xmm7
+ movdqa xmm7,xmm0
+ psrlq xmm0,1
+ psllq xmm7,56
+ pxor xmm6,xmm0
+ psrlq xmm0,7
+ pxor xmm6,xmm7
+ psllq xmm7,7
+ pxor xmm6,xmm0
+ movdqa xmm0,xmm3
+ pxor xmm6,xmm7
+ movdqa xmm7,xmm3
+ psrlq xmm0,6
+ paddq xmm4,xmm6
+ movdqa xmm6,xmm3
+ psrlq xmm7,19
+ psllq xmm6,3
+ pxor xmm0,xmm7
+ psrlq xmm7,42
+ pxor xmm0,xmm6
+ psllq xmm6,42
+ pxor xmm0,xmm7
+ movdqa xmm7,[32+edx]
+ pxor xmm0,xmm6
+ movdqa xmm6,[64+ebp]
+ movq mm1,mm4
+ paddq xmm4,xmm0
+ movq mm7,[edx-64]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [32+esp],mm4
+ paddq xmm6,xmm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm0,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [esp],mm0
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[56+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[24+esp]
+ paddq mm3,mm7
+ movq mm5,mm0
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm0
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[8+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm0,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm2,mm0
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm2,mm1
+ pxor mm6,mm7
+ movq mm5,[32+esp]
+ paddq mm2,mm6
+ movq mm6,[40+esp]
+ movq mm1,mm4
+ movq mm7,[edx-56]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [24+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm2,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [56+esp],mm2
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[48+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[16+esp]
+ paddq mm3,mm7
+ movq mm5,mm2
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm2
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm2,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm0,mm2
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm0,mm1
+ pxor mm6,mm7
+ movq mm5,[24+esp]
+ paddq mm0,mm6
+ movq mm6,[32+esp]
+ movdqa [edx-64],xmm6
+ movdqa xmm0,xmm2
+ movdqa xmm6,xmm7
+db 102,15,58,15,253,8
+ movdqa [16+edx],xmm1
+db 102,15,58,15,193,8
+ movdqa xmm1,xmm7
+ psrlq xmm7,7
+ paddq xmm5,xmm0
+ movdqa xmm0,xmm1
+ psrlq xmm1,1
+ psllq xmm0,56
+ pxor xmm7,xmm1
+ psrlq xmm1,7
+ pxor xmm7,xmm0
+ psllq xmm0,7
+ pxor xmm7,xmm1
+ movdqa xmm1,xmm4
+ pxor xmm7,xmm0
+ movdqa xmm0,xmm4
+ psrlq xmm1,6
+ paddq xmm5,xmm7
+ movdqa xmm7,xmm4
+ psrlq xmm0,19
+ psllq xmm7,3
+ pxor xmm1,xmm0
+ psrlq xmm0,42
+ pxor xmm1,xmm7
+ psllq xmm7,42
+ pxor xmm1,xmm0
+ movdqa xmm0,[48+edx]
+ pxor xmm1,xmm7
+ movdqa xmm7,[80+ebp]
+ movq mm1,mm4
+ paddq xmm5,xmm1
+ movq mm7,[edx-48]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [16+esp],mm4
+ paddq xmm7,xmm5
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm0,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [48+esp],mm0
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[40+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[8+esp]
+ paddq mm3,mm7
+ movq mm5,mm0
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm0
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[56+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm0,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm2,mm0
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm2,mm1
+ pxor mm6,mm7
+ movq mm5,[16+esp]
+ paddq mm2,mm6
+ movq mm6,[24+esp]
+ movq mm1,mm4
+ movq mm7,[edx-40]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [8+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm2,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [40+esp],mm2
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[32+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[esp]
+ paddq mm3,mm7
+ movq mm5,mm2
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm2
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[48+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm2,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm0,mm2
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm0,mm1
+ pxor mm6,mm7
+ movq mm5,[8+esp]
+ paddq mm0,mm6
+ movq mm6,[16+esp]
+ movdqa [edx-48],xmm7
+ movdqa xmm1,xmm3
+ movdqa xmm7,xmm0
+db 102,15,58,15,198,8
+ movdqa [32+edx],xmm2
+db 102,15,58,15,202,8
+ movdqa xmm2,xmm0
+ psrlq xmm0,7
+ paddq xmm6,xmm1
+ movdqa xmm1,xmm2
+ psrlq xmm2,1
+ psllq xmm1,56
+ pxor xmm0,xmm2
+ psrlq xmm2,7
+ pxor xmm0,xmm1
+ psllq xmm1,7
+ pxor xmm0,xmm2
+ movdqa xmm2,xmm5
+ pxor xmm0,xmm1
+ movdqa xmm1,xmm5
+ psrlq xmm2,6
+ paddq xmm6,xmm0
+ movdqa xmm0,xmm5
+ psrlq xmm1,19
+ psllq xmm0,3
+ pxor xmm2,xmm1
+ psrlq xmm1,42
+ pxor xmm2,xmm0
+ psllq xmm0,42
+ pxor xmm2,xmm1
+ movdqa xmm1,[edx]
+ pxor xmm2,xmm0
+ movdqa xmm0,[96+ebp]
+ movq mm1,mm4
+ paddq xmm6,xmm2
+ movq mm7,[edx-32]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [esp],mm4
+ paddq xmm0,xmm6
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm0,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [32+esp],mm0
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[24+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[56+esp]
+ paddq mm3,mm7
+ movq mm5,mm0
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm0
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[40+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm0,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm2,mm0
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm2,mm1
+ pxor mm6,mm7
+ movq mm5,[esp]
+ paddq mm2,mm6
+ movq mm6,[8+esp]
+ movq mm1,mm4
+ movq mm7,[edx-24]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [56+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm2,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [24+esp],mm2
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[16+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[48+esp]
+ paddq mm3,mm7
+ movq mm5,mm2
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm2
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[32+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm2,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm0,mm2
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm0,mm1
+ pxor mm6,mm7
+ movq mm5,[56+esp]
+ paddq mm0,mm6
+ movq mm6,[esp]
+ movdqa [edx-32],xmm0
+ movdqa xmm2,xmm4
+ movdqa xmm0,xmm1
+db 102,15,58,15,207,8
+ movdqa [48+edx],xmm3
+db 102,15,58,15,211,8
+ movdqa xmm3,xmm1
+ psrlq xmm1,7
+ paddq xmm7,xmm2
+ movdqa xmm2,xmm3
+ psrlq xmm3,1
+ psllq xmm2,56
+ pxor xmm1,xmm3
+ psrlq xmm3,7
+ pxor xmm1,xmm2
+ psllq xmm2,7
+ pxor xmm1,xmm3
+ movdqa xmm3,xmm6
+ pxor xmm1,xmm2
+ movdqa xmm2,xmm6
+ psrlq xmm3,6
+ paddq xmm7,xmm1
+ movdqa xmm1,xmm6
+ psrlq xmm2,19
+ psllq xmm1,3
+ pxor xmm3,xmm2
+ psrlq xmm2,42
+ pxor xmm3,xmm1
+ psllq xmm1,42
+ pxor xmm3,xmm2
+ movdqa xmm2,[16+edx]
+ pxor xmm3,xmm1
+ movdqa xmm1,[112+ebp]
+ movq mm1,mm4
+ paddq xmm7,xmm3
+ movq mm7,[edx-16]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [48+esp],mm4
+ paddq xmm1,xmm7
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm0,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [16+esp],mm0
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[8+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[40+esp]
+ paddq mm3,mm7
+ movq mm5,mm0
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm0
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[24+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm0,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm2,mm0
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm2,mm1
+ pxor mm6,mm7
+ movq mm5,[48+esp]
+ paddq mm2,mm6
+ movq mm6,[56+esp]
+ movq mm1,mm4
+ movq mm7,[edx-8]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [40+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm2,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [8+esp],mm2
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[32+esp]
+ paddq mm3,mm7
+ movq mm5,mm2
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm2
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[16+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm2,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm0,mm2
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm0,mm1
+ pxor mm6,mm7
+ movq mm5,[40+esp]
+ paddq mm0,mm6
+ movq mm6,[48+esp]
+ movdqa [edx-16],xmm1
+ lea ebp,[128+ebp]
+ dec ecx
+ jnz NEAR L$00800_47_ssse3
+ movdqa xmm1,[ebp]
+ lea ebp,[ebp-640]
+ movdqu xmm0,[ebx]
+db 102,15,56,0,193
+ movdqa xmm3,[ebp]
+ movdqa xmm2,xmm1
+ movdqu xmm1,[16+ebx]
+ paddq xmm3,xmm0
+db 102,15,56,0,202
+ movq mm1,mm4
+ movq mm7,[edx-128]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [32+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm0,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [esp],mm0
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[56+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[24+esp]
+ paddq mm3,mm7
+ movq mm5,mm0
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm0
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[8+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm0,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm2,mm0
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm2,mm1
+ pxor mm6,mm7
+ movq mm5,[32+esp]
+ paddq mm2,mm6
+ movq mm6,[40+esp]
+ movq mm1,mm4
+ movq mm7,[edx-120]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [24+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm2,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [56+esp],mm2
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[48+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[16+esp]
+ paddq mm3,mm7
+ movq mm5,mm2
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm2
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm2,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm0,mm2
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm0,mm1
+ pxor mm6,mm7
+ movq mm5,[24+esp]
+ paddq mm0,mm6
+ movq mm6,[32+esp]
+ movdqa [edx-128],xmm3
+ movdqa xmm4,[16+ebp]
+ movdqa xmm3,xmm2
+ movdqu xmm2,[32+ebx]
+ paddq xmm4,xmm1
+db 102,15,56,0,211
+ movq mm1,mm4
+ movq mm7,[edx-112]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [16+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm0,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [48+esp],mm0
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[40+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[8+esp]
+ paddq mm3,mm7
+ movq mm5,mm0
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm0
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[56+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm0,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm2,mm0
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm2,mm1
+ pxor mm6,mm7
+ movq mm5,[16+esp]
+ paddq mm2,mm6
+ movq mm6,[24+esp]
+ movq mm1,mm4
+ movq mm7,[edx-104]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [8+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm2,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [40+esp],mm2
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[32+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[esp]
+ paddq mm3,mm7
+ movq mm5,mm2
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm2
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[48+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm2,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm0,mm2
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm0,mm1
+ pxor mm6,mm7
+ movq mm5,[8+esp]
+ paddq mm0,mm6
+ movq mm6,[16+esp]
+ movdqa [edx-112],xmm4
+ movdqa xmm5,[32+ebp]
+ movdqa xmm4,xmm3
+ movdqu xmm3,[48+ebx]
+ paddq xmm5,xmm2
+db 102,15,56,0,220
+ movq mm1,mm4
+ movq mm7,[edx-96]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm0,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [32+esp],mm0
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[24+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[56+esp]
+ paddq mm3,mm7
+ movq mm5,mm0
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm0
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[40+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm0,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm2,mm0
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm2,mm1
+ pxor mm6,mm7
+ movq mm5,[esp]
+ paddq mm2,mm6
+ movq mm6,[8+esp]
+ movq mm1,mm4
+ movq mm7,[edx-88]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [56+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm2,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [24+esp],mm2
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[16+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[48+esp]
+ paddq mm3,mm7
+ movq mm5,mm2
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm2
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[32+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm2,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm0,mm2
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm0,mm1
+ pxor mm6,mm7
+ movq mm5,[56+esp]
+ paddq mm0,mm6
+ movq mm6,[esp]
+ movdqa [edx-96],xmm5
+ movdqa xmm6,[48+ebp]
+ movdqa xmm5,xmm4
+ movdqu xmm4,[64+ebx]
+ paddq xmm6,xmm3
+db 102,15,56,0,229
+ movq mm1,mm4
+ movq mm7,[edx-80]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [48+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm0,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [16+esp],mm0
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[8+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[40+esp]
+ paddq mm3,mm7
+ movq mm5,mm0
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm0
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[24+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm0,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm2,mm0
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm2,mm1
+ pxor mm6,mm7
+ movq mm5,[48+esp]
+ paddq mm2,mm6
+ movq mm6,[56+esp]
+ movq mm1,mm4
+ movq mm7,[edx-72]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [40+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm2,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [8+esp],mm2
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[32+esp]
+ paddq mm3,mm7
+ movq mm5,mm2
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm2
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[16+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm2,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm0,mm2
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm0,mm1
+ pxor mm6,mm7
+ movq mm5,[40+esp]
+ paddq mm0,mm6
+ movq mm6,[48+esp]
+ movdqa [edx-80],xmm6
+ movdqa xmm7,[64+ebp]
+ movdqa xmm6,xmm5
+ movdqu xmm5,[80+ebx]
+ paddq xmm7,xmm4
+db 102,15,56,0,238
+ movq mm1,mm4
+ movq mm7,[edx-64]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [32+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm0,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [esp],mm0
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[56+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[24+esp]
+ paddq mm3,mm7
+ movq mm5,mm0
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm0
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[8+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm0,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm2,mm0
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm2,mm1
+ pxor mm6,mm7
+ movq mm5,[32+esp]
+ paddq mm2,mm6
+ movq mm6,[40+esp]
+ movq mm1,mm4
+ movq mm7,[edx-56]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [24+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm2,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [56+esp],mm2
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[48+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[16+esp]
+ paddq mm3,mm7
+ movq mm5,mm2
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm2
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm2,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm0,mm2
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm0,mm1
+ pxor mm6,mm7
+ movq mm5,[24+esp]
+ paddq mm0,mm6
+ movq mm6,[32+esp]
+ movdqa [edx-64],xmm7
+ movdqa [edx],xmm0
+ movdqa xmm0,[80+ebp]
+ movdqa xmm7,xmm6
+ movdqu xmm6,[96+ebx]
+ paddq xmm0,xmm5
+db 102,15,56,0,247
+ movq mm1,mm4
+ movq mm7,[edx-48]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [16+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm0,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [48+esp],mm0
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[40+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[8+esp]
+ paddq mm3,mm7
+ movq mm5,mm0
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm0
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[56+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm0,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm2,mm0
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm2,mm1
+ pxor mm6,mm7
+ movq mm5,[16+esp]
+ paddq mm2,mm6
+ movq mm6,[24+esp]
+ movq mm1,mm4
+ movq mm7,[edx-40]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [8+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm2,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [40+esp],mm2
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[32+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[esp]
+ paddq mm3,mm7
+ movq mm5,mm2
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm2
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[48+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm2,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm0,mm2
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm0,mm1
+ pxor mm6,mm7
+ movq mm5,[8+esp]
+ paddq mm0,mm6
+ movq mm6,[16+esp]
+ movdqa [edx-48],xmm0
+ movdqa [16+edx],xmm1
+ movdqa xmm1,[96+ebp]
+ movdqa xmm0,xmm7
+ movdqu xmm7,[112+ebx]
+ paddq xmm1,xmm6
+db 102,15,56,0,248
+ movq mm1,mm4
+ movq mm7,[edx-32]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm0,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [32+esp],mm0
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[24+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[56+esp]
+ paddq mm3,mm7
+ movq mm5,mm0
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm0
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[40+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm0,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm2,mm0
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm2,mm1
+ pxor mm6,mm7
+ movq mm5,[esp]
+ paddq mm2,mm6
+ movq mm6,[8+esp]
+ movq mm1,mm4
+ movq mm7,[edx-24]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [56+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm2,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [24+esp],mm2
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[16+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[48+esp]
+ paddq mm3,mm7
+ movq mm5,mm2
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm2
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[32+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm2,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm0,mm2
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm0,mm1
+ pxor mm6,mm7
+ movq mm5,[56+esp]
+ paddq mm0,mm6
+ movq mm6,[esp]
+ movdqa [edx-32],xmm1
+ movdqa [32+edx],xmm2
+ movdqa xmm2,[112+ebp]
+ movdqa xmm0,[edx]
+ paddq xmm2,xmm7
+ movq mm1,mm4
+ movq mm7,[edx-16]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [48+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm0,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [16+esp],mm0
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[8+esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[40+esp]
+ paddq mm3,mm7
+ movq mm5,mm0
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm0
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[24+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm0,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm2,mm0
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm2,mm1
+ pxor mm6,mm7
+ movq mm5,[48+esp]
+ paddq mm2,mm6
+ movq mm6,[56+esp]
+ movq mm1,mm4
+ movq mm7,[edx-8]
+ pxor mm5,mm6
+ psrlq mm1,14
+ movq [40+esp],mm4
+ pand mm5,mm4
+ psllq mm4,23
+ paddq mm2,mm3
+ movq mm3,mm1
+ psrlq mm1,4
+ pxor mm5,mm6
+ pxor mm3,mm4
+ psllq mm4,23
+ pxor mm3,mm1
+ movq [8+esp],mm2
+ paddq mm7,mm5
+ pxor mm3,mm4
+ psrlq mm1,23
+ paddq mm7,[esp]
+ pxor mm3,mm1
+ psllq mm4,4
+ pxor mm3,mm4
+ movq mm4,[32+esp]
+ paddq mm3,mm7
+ movq mm5,mm2
+ psrlq mm5,28
+ paddq mm4,mm3
+ movq mm6,mm2
+ movq mm7,mm5
+ psllq mm6,25
+ movq mm1,[16+esp]
+ psrlq mm5,6
+ pxor mm7,mm6
+ psllq mm6,5
+ pxor mm7,mm5
+ pxor mm2,mm1
+ psrlq mm5,5
+ pxor mm7,mm6
+ pand mm0,mm2
+ psllq mm6,6
+ pxor mm7,mm5
+ pxor mm0,mm1
+ pxor mm6,mm7
+ movq mm5,[40+esp]
+ paddq mm0,mm6
+ movq mm6,[48+esp]
+ movdqa [edx-16],xmm2
+ movq mm1,[8+esp]
+ paddq mm0,mm3
+ movq mm3,[24+esp]
+ movq mm7,[56+esp]
+ pxor mm2,mm1
+ paddq mm0,[esi]
+ paddq mm1,[8+esi]
+ paddq mm2,[16+esi]
+ paddq mm3,[24+esi]
+ paddq mm4,[32+esi]
+ paddq mm5,[40+esi]
+ paddq mm6,[48+esi]
+ paddq mm7,[56+esi]
+ movq [esi],mm0
+ movq [8+esi],mm1
+ movq [16+esi],mm2
+ movq [24+esi],mm3
+ movq [32+esi],mm4
+ movq [40+esi],mm5
+ movq [48+esi],mm6
+ movq [56+esi],mm7
+ cmp edi,eax
+ jb NEAR L$007loop_ssse3
+ mov esp,DWORD [76+edx]
+ emms
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+align 16
+L$002loop_x86:
+ mov eax,DWORD [edi]
+ mov ebx,DWORD [4+edi]
+ mov ecx,DWORD [8+edi]
+ mov edx,DWORD [12+edi]
+ bswap eax
+ bswap ebx
+ bswap ecx
+ bswap edx
+ push eax
+ push ebx
+ push ecx
+ push edx
+ mov eax,DWORD [16+edi]
+ mov ebx,DWORD [20+edi]
+ mov ecx,DWORD [24+edi]
+ mov edx,DWORD [28+edi]
+ bswap eax
+ bswap ebx
+ bswap ecx
+ bswap edx
+ push eax
+ push ebx
+ push ecx
+ push edx
+ mov eax,DWORD [32+edi]
+ mov ebx,DWORD [36+edi]
+ mov ecx,DWORD [40+edi]
+ mov edx,DWORD [44+edi]
+ bswap eax
+ bswap ebx
+ bswap ecx
+ bswap edx
+ push eax
+ push ebx
+ push ecx
+ push edx
+ mov eax,DWORD [48+edi]
+ mov ebx,DWORD [52+edi]
+ mov ecx,DWORD [56+edi]
+ mov edx,DWORD [60+edi]
+ bswap eax
+ bswap ebx
+ bswap ecx
+ bswap edx
+ push eax
+ push ebx
+ push ecx
+ push edx
+ mov eax,DWORD [64+edi]
+ mov ebx,DWORD [68+edi]
+ mov ecx,DWORD [72+edi]
+ mov edx,DWORD [76+edi]
+ bswap eax
+ bswap ebx
+ bswap ecx
+ bswap edx
+ push eax
+ push ebx
+ push ecx
+ push edx
+ mov eax,DWORD [80+edi]
+ mov ebx,DWORD [84+edi]
+ mov ecx,DWORD [88+edi]
+ mov edx,DWORD [92+edi]
+ bswap eax
+ bswap ebx
+ bswap ecx
+ bswap edx
+ push eax
+ push ebx
+ push ecx
+ push edx
+ mov eax,DWORD [96+edi]
+ mov ebx,DWORD [100+edi]
+ mov ecx,DWORD [104+edi]
+ mov edx,DWORD [108+edi]
+ bswap eax
+ bswap ebx
+ bswap ecx
+ bswap edx
+ push eax
+ push ebx
+ push ecx
+ push edx
+ mov eax,DWORD [112+edi]
+ mov ebx,DWORD [116+edi]
+ mov ecx,DWORD [120+edi]
+ mov edx,DWORD [124+edi]
+ bswap eax
+ bswap ebx
+ bswap ecx
+ bswap edx
+ push eax
+ push ebx
+ push ecx
+ push edx
+ add edi,128
+ sub esp,72
+ mov DWORD [204+esp],edi
+ lea edi,[8+esp]
+ mov ecx,16
+dd 2784229001
+align 16
+L$00900_15_x86:
+ mov ecx,DWORD [40+esp]
+ mov edx,DWORD [44+esp]
+ mov esi,ecx
+ shr ecx,9
+ mov edi,edx
+ shr edx,9
+ mov ebx,ecx
+ shl esi,14
+ mov eax,edx
+ shl edi,14
+ xor ebx,esi
+ shr ecx,5
+ xor eax,edi
+ shr edx,5
+ xor eax,ecx
+ shl esi,4
+ xor ebx,edx
+ shl edi,4
+ xor ebx,esi
+ shr ecx,4
+ xor eax,edi
+ shr edx,4
+ xor eax,ecx
+ shl esi,5
+ xor ebx,edx
+ shl edi,5
+ xor eax,esi
+ xor ebx,edi
+ mov ecx,DWORD [48+esp]
+ mov edx,DWORD [52+esp]
+ mov esi,DWORD [56+esp]
+ mov edi,DWORD [60+esp]
+ add eax,DWORD [64+esp]
+ adc ebx,DWORD [68+esp]
+ xor ecx,esi
+ xor edx,edi
+ and ecx,DWORD [40+esp]
+ and edx,DWORD [44+esp]
+ add eax,DWORD [192+esp]
+ adc ebx,DWORD [196+esp]
+ xor ecx,esi
+ xor edx,edi
+ mov esi,DWORD [ebp]
+ mov edi,DWORD [4+ebp]
+ add eax,ecx
+ adc ebx,edx
+ mov ecx,DWORD [32+esp]
+ mov edx,DWORD [36+esp]
+ add eax,esi
+ adc ebx,edi
+ mov DWORD [esp],eax
+ mov DWORD [4+esp],ebx
+ add eax,ecx
+ adc ebx,edx
+ mov ecx,DWORD [8+esp]
+ mov edx,DWORD [12+esp]
+ mov DWORD [32+esp],eax
+ mov DWORD [36+esp],ebx
+ mov esi,ecx
+ shr ecx,2
+ mov edi,edx
+ shr edx,2
+ mov ebx,ecx
+ shl esi,4
+ mov eax,edx
+ shl edi,4
+ xor ebx,esi
+ shr ecx,5
+ xor eax,edi
+ shr edx,5
+ xor ebx,ecx
+ shl esi,21
+ xor eax,edx
+ shl edi,21
+ xor eax,esi
+ shr ecx,21
+ xor ebx,edi
+ shr edx,21
+ xor eax,ecx
+ shl esi,5
+ xor ebx,edx
+ shl edi,5
+ xor eax,esi
+ xor ebx,edi
+ mov ecx,DWORD [8+esp]
+ mov edx,DWORD [12+esp]
+ mov esi,DWORD [16+esp]
+ mov edi,DWORD [20+esp]
+ add eax,DWORD [esp]
+ adc ebx,DWORD [4+esp]
+ or ecx,esi
+ or edx,edi
+ and ecx,DWORD [24+esp]
+ and edx,DWORD [28+esp]
+ and esi,DWORD [8+esp]
+ and edi,DWORD [12+esp]
+ or ecx,esi
+ or edx,edi
+ add eax,ecx
+ adc ebx,edx
+ mov DWORD [esp],eax
+ mov DWORD [4+esp],ebx
+ mov dl,BYTE [ebp]
+ sub esp,8
+ lea ebp,[8+ebp]
+ cmp dl,148
+ jne NEAR L$00900_15_x86
+align 16
+L$01016_79_x86:
+ mov ecx,DWORD [312+esp]
+ mov edx,DWORD [316+esp]
+ mov esi,ecx
+ shr ecx,1
+ mov edi,edx
+ shr edx,1
+ mov eax,ecx
+ shl esi,24
+ mov ebx,edx
+ shl edi,24
+ xor ebx,esi
+ shr ecx,6
+ xor eax,edi
+ shr edx,6
+ xor eax,ecx
+ shl esi,7
+ xor ebx,edx
+ shl edi,1
+ xor ebx,esi
+ shr ecx,1
+ xor eax,edi
+ shr edx,1
+ xor eax,ecx
+ shl edi,6
+ xor ebx,edx
+ xor eax,edi
+ mov DWORD [esp],eax
+ mov DWORD [4+esp],ebx
+ mov ecx,DWORD [208+esp]
+ mov edx,DWORD [212+esp]
+ mov esi,ecx
+ shr ecx,6
+ mov edi,edx
+ shr edx,6
+ mov eax,ecx
+ shl esi,3
+ mov ebx,edx
+ shl edi,3
+ xor eax,esi
+ shr ecx,13
+ xor ebx,edi
+ shr edx,13
+ xor eax,ecx
+ shl esi,10
+ xor ebx,edx
+ shl edi,10
+ xor ebx,esi
+ shr ecx,10
+ xor eax,edi
+ shr edx,10
+ xor ebx,ecx
+ shl edi,13
+ xor eax,edx
+ xor eax,edi
+ mov ecx,DWORD [320+esp]
+ mov edx,DWORD [324+esp]
+ add eax,DWORD [esp]
+ adc ebx,DWORD [4+esp]
+ mov esi,DWORD [248+esp]
+ mov edi,DWORD [252+esp]
+ add eax,ecx
+ adc ebx,edx
+ add eax,esi
+ adc ebx,edi
+ mov DWORD [192+esp],eax
+ mov DWORD [196+esp],ebx
+ mov ecx,DWORD [40+esp]
+ mov edx,DWORD [44+esp]
+ mov esi,ecx
+ shr ecx,9
+ mov edi,edx
+ shr edx,9
+ mov ebx,ecx
+ shl esi,14
+ mov eax,edx
+ shl edi,14
+ xor ebx,esi
+ shr ecx,5
+ xor eax,edi
+ shr edx,5
+ xor eax,ecx
+ shl esi,4
+ xor ebx,edx
+ shl edi,4
+ xor ebx,esi
+ shr ecx,4
+ xor eax,edi
+ shr edx,4
+ xor eax,ecx
+ shl esi,5
+ xor ebx,edx
+ shl edi,5
+ xor eax,esi
+ xor ebx,edi
+ mov ecx,DWORD [48+esp]
+ mov edx,DWORD [52+esp]
+ mov esi,DWORD [56+esp]
+ mov edi,DWORD [60+esp]
+ add eax,DWORD [64+esp]
+ adc ebx,DWORD [68+esp]
+ xor ecx,esi
+ xor edx,edi
+ and ecx,DWORD [40+esp]
+ and edx,DWORD [44+esp]
+ add eax,DWORD [192+esp]
+ adc ebx,DWORD [196+esp]
+ xor ecx,esi
+ xor edx,edi
+ mov esi,DWORD [ebp]
+ mov edi,DWORD [4+ebp]
+ add eax,ecx
+ adc ebx,edx
+ mov ecx,DWORD [32+esp]
+ mov edx,DWORD [36+esp]
+ add eax,esi
+ adc ebx,edi
+ mov DWORD [esp],eax
+ mov DWORD [4+esp],ebx
+ add eax,ecx
+ adc ebx,edx
+ mov ecx,DWORD [8+esp]
+ mov edx,DWORD [12+esp]
+ mov DWORD [32+esp],eax
+ mov DWORD [36+esp],ebx
+ mov esi,ecx
+ shr ecx,2
+ mov edi,edx
+ shr edx,2
+ mov ebx,ecx
+ shl esi,4
+ mov eax,edx
+ shl edi,4
+ xor ebx,esi
+ shr ecx,5
+ xor eax,edi
+ shr edx,5
+ xor ebx,ecx
+ shl esi,21
+ xor eax,edx
+ shl edi,21
+ xor eax,esi
+ shr ecx,21
+ xor ebx,edi
+ shr edx,21
+ xor eax,ecx
+ shl esi,5
+ xor ebx,edx
+ shl edi,5
+ xor eax,esi
+ xor ebx,edi
+ mov ecx,DWORD [8+esp]
+ mov edx,DWORD [12+esp]
+ mov esi,DWORD [16+esp]
+ mov edi,DWORD [20+esp]
+ add eax,DWORD [esp]
+ adc ebx,DWORD [4+esp]
+ or ecx,esi
+ or edx,edi
+ and ecx,DWORD [24+esp]
+ and edx,DWORD [28+esp]
+ and esi,DWORD [8+esp]
+ and edi,DWORD [12+esp]
+ or ecx,esi
+ or edx,edi
+ add eax,ecx
+ adc ebx,edx
+ mov DWORD [esp],eax
+ mov DWORD [4+esp],ebx
+ mov dl,BYTE [ebp]
+ sub esp,8
+ lea ebp,[8+ebp]
+ cmp dl,23
+ jne NEAR L$01016_79_x86
+ mov esi,DWORD [840+esp]
+ mov edi,DWORD [844+esp]
+ mov eax,DWORD [esi]
+ mov ebx,DWORD [4+esi]
+ mov ecx,DWORD [8+esi]
+ mov edx,DWORD [12+esi]
+ add eax,DWORD [8+esp]
+ adc ebx,DWORD [12+esp]
+ mov DWORD [esi],eax
+ mov DWORD [4+esi],ebx
+ add ecx,DWORD [16+esp]
+ adc edx,DWORD [20+esp]
+ mov DWORD [8+esi],ecx
+ mov DWORD [12+esi],edx
+ mov eax,DWORD [16+esi]
+ mov ebx,DWORD [20+esi]
+ mov ecx,DWORD [24+esi]
+ mov edx,DWORD [28+esi]
+ add eax,DWORD [24+esp]
+ adc ebx,DWORD [28+esp]
+ mov DWORD [16+esi],eax
+ mov DWORD [20+esi],ebx
+ add ecx,DWORD [32+esp]
+ adc edx,DWORD [36+esp]
+ mov DWORD [24+esi],ecx
+ mov DWORD [28+esi],edx
+ mov eax,DWORD [32+esi]
+ mov ebx,DWORD [36+esi]
+ mov ecx,DWORD [40+esi]
+ mov edx,DWORD [44+esi]
+ add eax,DWORD [40+esp]
+ adc ebx,DWORD [44+esp]
+ mov DWORD [32+esi],eax
+ mov DWORD [36+esi],ebx
+ add ecx,DWORD [48+esp]
+ adc edx,DWORD [52+esp]
+ mov DWORD [40+esi],ecx
+ mov DWORD [44+esi],edx
+ mov eax,DWORD [48+esi]
+ mov ebx,DWORD [52+esi]
+ mov ecx,DWORD [56+esi]
+ mov edx,DWORD [60+esi]
+ add eax,DWORD [56+esp]
+ adc ebx,DWORD [60+esp]
+ mov DWORD [48+esi],eax
+ mov DWORD [52+esi],ebx
+ add ecx,DWORD [64+esp]
+ adc edx,DWORD [68+esp]
+ mov DWORD [56+esi],ecx
+ mov DWORD [60+esi],edx
+ add esp,840
+ sub ebp,640
+ cmp edi,DWORD [8+esp]
+ jb NEAR L$002loop_x86
+ mov esp,DWORD [12+esp]
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+align 64
+L$001K512:
+dd 3609767458,1116352408
+dd 602891725,1899447441
+dd 3964484399,3049323471
+dd 2173295548,3921009573
+dd 4081628472,961987163
+dd 3053834265,1508970993
+dd 2937671579,2453635748
+dd 3664609560,2870763221
+dd 2734883394,3624381080
+dd 1164996542,310598401
+dd 1323610764,607225278
+dd 3590304994,1426881987
+dd 4068182383,1925078388
+dd 991336113,2162078206
+dd 633803317,2614888103
+dd 3479774868,3248222580
+dd 2666613458,3835390401
+dd 944711139,4022224774
+dd 2341262773,264347078
+dd 2007800933,604807628
+dd 1495990901,770255983
+dd 1856431235,1249150122
+dd 3175218132,1555081692
+dd 2198950837,1996064986
+dd 3999719339,2554220882
+dd 766784016,2821834349
+dd 2566594879,2952996808
+dd 3203337956,3210313671
+dd 1034457026,3336571891
+dd 2466948901,3584528711
+dd 3758326383,113926993
+dd 168717936,338241895
+dd 1188179964,666307205
+dd 1546045734,773529912
+dd 1522805485,1294757372
+dd 2643833823,1396182291
+dd 2343527390,1695183700
+dd 1014477480,1986661051
+dd 1206759142,2177026350
+dd 344077627,2456956037
+dd 1290863460,2730485921
+dd 3158454273,2820302411
+dd 3505952657,3259730800
+dd 106217008,3345764771
+dd 3606008344,3516065817
+dd 1432725776,3600352804
+dd 1467031594,4094571909
+dd 851169720,275423344
+dd 3100823752,430227734
+dd 1363258195,506948616
+dd 3750685593,659060556
+dd 3785050280,883997877
+dd 3318307427,958139571
+dd 3812723403,1322822218
+dd 2003034995,1537002063
+dd 3602036899,1747873779
+dd 1575990012,1955562222
+dd 1125592928,2024104815
+dd 2716904306,2227730452
+dd 442776044,2361852424
+dd 593698344,2428436474
+dd 3733110249,2756734187
+dd 2999351573,3204031479
+dd 3815920427,3329325298
+dd 3928383900,3391569614
+dd 566280711,3515267271
+dd 3454069534,3940187606
+dd 4000239992,4118630271
+dd 1914138554,116418474
+dd 2731055270,174292421
+dd 3203993006,289380356
+dd 320620315,460393269
+dd 587496836,685471733
+dd 1086792851,852142971
+dd 365543100,1017036298
+dd 2618297676,1126000580
+dd 3409855158,1288033470
+dd 4234509866,1501505948
+dd 987167468,1607167915
+dd 1246189591,1816402316
+dd 67438087,66051
+dd 202182159,134810123
+db 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
+db 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+db 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+db 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+db 62,0
+segment .bss
+common _OPENSSL_ia32cap_P 16
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/sha512-armv4-linux.S b/gen/bcm/sha512-armv4-linux.S
new file mode 100644
index 0000000..5500686
--- /dev/null
+++ b/gen/bcm/sha512-armv4-linux.S
@@ -0,0 +1,1855 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+@
+@ Licensed under the OpenSSL license (the "License"). You may not use
+@ this file except in compliance with the License. You can obtain a copy
+@ in the file LICENSE in the source distribution or at
+@ https://www.openssl.org/source/license.html
+
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA512 block procedure for ARMv4. September 2007.
+
+@ This code is ~4.5 (four and a half) times faster than code generated
+@ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
+@ Xscale PXA250 core].
+@
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 6% improvement on
+@ Cortex A8 core and ~40 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 7%
+@ improvement on Cortex A8 core and ~38 cycles per byte.
+
+@ March 2011.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process
+@ one byte in 23.3 cycles or ~60% faster than integer-only code.
+
+@ August 2012.
+@
+@ Improve NEON performance by 12% on Snapdragon S4. In absolute
+@ terms it's 22.6 cycles per byte, which is a disappointing result.
+@ Technical writers asserted that the 3-way S4 pipeline can sustain
+@ multiple NEON instructions per cycle, but dual NEON issue could
+@ not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
+@ for further details. On a side note, Cortex-A15 processes one byte in
+@ 16 cycles.
+
+@ Byte order [in]dependence. =========================================
+@
+@ Originally caller was expected to maintain specific *dword* order in
+@ h[0-7], namely with most significant dword at *lower* address, which
+@ was reflected in below two parameters as 0 and 4. Now caller is
+@ expected to maintain native byte order for whole 64-bit values.
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+#else
+# define __ARM_MAX_ARCH__ 7
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+.arch armv7-a
+
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
+#endif
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+# define adrl adr
+#else
+.code 32
+#endif
+
+.type K512,%object
+.align 5
+K512:
+ WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
+ WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
+ WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
+ WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
+ WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
+ WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
+ WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
+ WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
+ WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
+ WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
+ WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
+ WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
+ WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
+ WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
+ WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
+ WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
+ WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
+ WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
+ WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
+ WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
+ WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
+ WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
+ WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
+ WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
+ WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
+ WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
+ WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
+ WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
+ WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
+ WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
+ WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
+ WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
+ WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
+ WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
+ WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
+ WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
+ WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
+ WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
+ WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
+ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
+.size K512,.-K512
+
+.globl sha512_block_data_order_nohw
+.hidden sha512_block_data_order_nohw
+.type sha512_block_data_order_nohw,%function
+sha512_block_data_order_nohw:
+ add r2,r1,r2,lsl#7 @ len to point at the end of inp
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+ adr r14,K512
+ sub sp,sp,#9*8
+
+ ldr r7,[r0,#32+LO]
+ ldr r8,[r0,#32+HI]
+ ldr r9, [r0,#48+LO]
+ ldr r10, [r0,#48+HI]
+ ldr r11, [r0,#56+LO]
+ ldr r12, [r0,#56+HI]
+.Loop:
+ str r9, [sp,#48+0]
+ str r10, [sp,#48+4]
+ str r11, [sp,#56+0]
+ str r12, [sp,#56+4]
+ ldr r5,[r0,#0+LO]
+ ldr r6,[r0,#0+HI]
+ ldr r3,[r0,#8+LO]
+ ldr r4,[r0,#8+HI]
+ ldr r9, [r0,#16+LO]
+ ldr r10, [r0,#16+HI]
+ ldr r11, [r0,#24+LO]
+ ldr r12, [r0,#24+HI]
+ str r3,[sp,#8+0]
+ str r4,[sp,#8+4]
+ str r9, [sp,#16+0]
+ str r10, [sp,#16+4]
+ str r11, [sp,#24+0]
+ str r12, [sp,#24+4]
+ ldr r3,[r0,#40+LO]
+ ldr r4,[r0,#40+HI]
+ str r3,[sp,#40+0]
+ str r4,[sp,#40+4]
+
+.L00_15:
+#if __ARM_ARCH<7
+ ldrb r3,[r1,#7]
+ ldrb r9, [r1,#6]
+ ldrb r10, [r1,#5]
+ ldrb r11, [r1,#4]
+ ldrb r4,[r1,#3]
+ ldrb r12, [r1,#2]
+ orr r3,r3,r9,lsl#8
+ ldrb r9, [r1,#1]
+ orr r3,r3,r10,lsl#16
+ ldrb r10, [r1],#8
+ orr r3,r3,r11,lsl#24
+ orr r4,r4,r12,lsl#8
+ orr r4,r4,r9,lsl#16
+ orr r4,r4,r10,lsl#24
+#else
+ ldr r3,[r1,#4]
+ ldr r4,[r1],#8
+#ifdef __ARMEL__
+ rev r3,r3
+ rev r4,r4
+#endif
+#endif
+ @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
+ @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+ @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+ mov r9,r7,lsr#14
+ str r3,[sp,#64+0]
+ mov r10,r8,lsr#14
+ str r4,[sp,#64+4]
+ eor r9,r9,r8,lsl#18
+ ldr r11,[sp,#56+0] @ h.lo
+ eor r10,r10,r7,lsl#18
+ ldr r12,[sp,#56+4] @ h.hi
+ eor r9,r9,r7,lsr#18
+ eor r10,r10,r8,lsr#18
+ eor r9,r9,r8,lsl#14
+ eor r10,r10,r7,lsl#14
+ eor r9,r9,r8,lsr#9
+ eor r10,r10,r7,lsr#9
+ eor r9,r9,r7,lsl#23
+ eor r10,r10,r8,lsl#23 @ Sigma1(e)
+ adds r3,r3,r9
+ ldr r9,[sp,#40+0] @ f.lo
+ adc r4,r4,r10 @ T += Sigma1(e)
+ ldr r10,[sp,#40+4] @ f.hi
+ adds r3,r3,r11
+ ldr r11,[sp,#48+0] @ g.lo
+ adc r4,r4,r12 @ T += h
+ ldr r12,[sp,#48+4] @ g.hi
+
+ eor r9,r9,r11
+ str r7,[sp,#32+0]
+ eor r10,r10,r12
+ str r8,[sp,#32+4]
+ and r9,r9,r7
+ str r5,[sp,#0+0]
+ and r10,r10,r8
+ str r6,[sp,#0+4]
+ eor r9,r9,r11
+ ldr r11,[r14,#LO] @ K[i].lo
+ eor r10,r10,r12 @ Ch(e,f,g)
+ ldr r12,[r14,#HI] @ K[i].hi
+
+ adds r3,r3,r9
+ ldr r7,[sp,#24+0] @ d.lo
+ adc r4,r4,r10 @ T += Ch(e,f,g)
+ ldr r8,[sp,#24+4] @ d.hi
+ adds r3,r3,r11
+ and r9,r11,#0xff
+ adc r4,r4,r12 @ T += K[i]
+ adds r7,r7,r3
+ ldr r11,[sp,#8+0] @ b.lo
+ adc r8,r8,r4 @ d += T
+ teq r9,#148
+
+ ldr r12,[sp,#16+0] @ c.lo
+#if __ARM_ARCH>=7
+ it eq @ Thumb2 thing, sanity check in ARM
+#endif
+ orreq r14,r14,#1
+ @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+ @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+ @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+ mov r9,r5,lsr#28
+ mov r10,r6,lsr#28
+ eor r9,r9,r6,lsl#4
+ eor r10,r10,r5,lsl#4
+ eor r9,r9,r6,lsr#2
+ eor r10,r10,r5,lsr#2
+ eor r9,r9,r5,lsl#30
+ eor r10,r10,r6,lsl#30
+ eor r9,r9,r6,lsr#7
+ eor r10,r10,r5,lsr#7
+ eor r9,r9,r5,lsl#25
+ eor r10,r10,r6,lsl#25 @ Sigma0(a)
+ adds r3,r3,r9
+ and r9,r5,r11
+ adc r4,r4,r10 @ T += Sigma0(a)
+
+ ldr r10,[sp,#8+4] @ b.hi
+ orr r5,r5,r11
+ ldr r11,[sp,#16+4] @ c.hi
+ and r5,r5,r12
+ and r12,r6,r10
+ orr r6,r6,r10
+ orr r5,r5,r9 @ Maj(a,b,c).lo
+ and r6,r6,r11
+ adds r5,r5,r3
+ orr r6,r6,r12 @ Maj(a,b,c).hi
+ sub sp,sp,#8
+ adc r6,r6,r4 @ h += T
+ tst r14,#1
+ add r14,r14,#8
+ tst r14,#1
+ beq .L00_15
+ ldr r9,[sp,#184+0]
+ ldr r10,[sp,#184+4]
+ bic r14,r14,#1
+.L16_79:
+ @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
+ @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
+ @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
+ mov r3,r9,lsr#1
+ ldr r11,[sp,#80+0]
+ mov r4,r10,lsr#1
+ ldr r12,[sp,#80+4]
+ eor r3,r3,r10,lsl#31
+ eor r4,r4,r9,lsl#31
+ eor r3,r3,r9,lsr#8
+ eor r4,r4,r10,lsr#8
+ eor r3,r3,r10,lsl#24
+ eor r4,r4,r9,lsl#24
+ eor r3,r3,r9,lsr#7
+ eor r4,r4,r10,lsr#7
+ eor r3,r3,r10,lsl#25
+
+ @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
+ @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
+ @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
+ mov r9,r11,lsr#19
+ mov r10,r12,lsr#19
+ eor r9,r9,r12,lsl#13
+ eor r10,r10,r11,lsl#13
+ eor r9,r9,r12,lsr#29
+ eor r10,r10,r11,lsr#29
+ eor r9,r9,r11,lsl#3
+ eor r10,r10,r12,lsl#3
+ eor r9,r9,r11,lsr#6
+ eor r10,r10,r12,lsr#6
+ ldr r11,[sp,#120+0]
+ eor r9,r9,r12,lsl#26
+
+ ldr r12,[sp,#120+4]
+ adds r3,r3,r9
+ ldr r9,[sp,#192+0]
+ adc r4,r4,r10
+
+ ldr r10,[sp,#192+4]
+ adds r3,r3,r11
+ adc r4,r4,r12
+ adds r3,r3,r9
+ adc r4,r4,r10
+ @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
+ @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+ @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+ mov r9,r7,lsr#14
+ str r3,[sp,#64+0]
+ mov r10,r8,lsr#14
+ str r4,[sp,#64+4]
+ eor r9,r9,r8,lsl#18
+ ldr r11,[sp,#56+0] @ h.lo
+ eor r10,r10,r7,lsl#18
+ ldr r12,[sp,#56+4] @ h.hi
+ eor r9,r9,r7,lsr#18
+ eor r10,r10,r8,lsr#18
+ eor r9,r9,r8,lsl#14
+ eor r10,r10,r7,lsl#14
+ eor r9,r9,r8,lsr#9
+ eor r10,r10,r7,lsr#9
+ eor r9,r9,r7,lsl#23
+ eor r10,r10,r8,lsl#23 @ Sigma1(e)
+ adds r3,r3,r9
+ ldr r9,[sp,#40+0] @ f.lo
+ adc r4,r4,r10 @ T += Sigma1(e)
+ ldr r10,[sp,#40+4] @ f.hi
+ adds r3,r3,r11
+ ldr r11,[sp,#48+0] @ g.lo
+ adc r4,r4,r12 @ T += h
+ ldr r12,[sp,#48+4] @ g.hi
+
+ eor r9,r9,r11
+ str r7,[sp,#32+0]
+ eor r10,r10,r12
+ str r8,[sp,#32+4]
+ and r9,r9,r7
+ str r5,[sp,#0+0]
+ and r10,r10,r8
+ str r6,[sp,#0+4]
+ eor r9,r9,r11
+ ldr r11,[r14,#LO] @ K[i].lo
+ eor r10,r10,r12 @ Ch(e,f,g)
+ ldr r12,[r14,#HI] @ K[i].hi
+
+ adds r3,r3,r9
+ ldr r7,[sp,#24+0] @ d.lo
+ adc r4,r4,r10 @ T += Ch(e,f,g)
+ ldr r8,[sp,#24+4] @ d.hi
+ adds r3,r3,r11
+ and r9,r11,#0xff
+ adc r4,r4,r12 @ T += K[i]
+ adds r7,r7,r3
+ ldr r11,[sp,#8+0] @ b.lo
+ adc r8,r8,r4 @ d += T
+ teq r9,#23
+
+ ldr r12,[sp,#16+0] @ c.lo
+#if __ARM_ARCH>=7
+ it eq @ Thumb2 thing, sanity check in ARM
+#endif
+ orreq r14,r14,#1
+ @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+ @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+ @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+ mov r9,r5,lsr#28
+ mov r10,r6,lsr#28
+ eor r9,r9,r6,lsl#4
+ eor r10,r10,r5,lsl#4
+ eor r9,r9,r6,lsr#2
+ eor r10,r10,r5,lsr#2
+ eor r9,r9,r5,lsl#30
+ eor r10,r10,r6,lsl#30
+ eor r9,r9,r6,lsr#7
+ eor r10,r10,r5,lsr#7
+ eor r9,r9,r5,lsl#25
+ eor r10,r10,r6,lsl#25 @ Sigma0(a)
+ adds r3,r3,r9
+ and r9,r5,r11
+ adc r4,r4,r10 @ T += Sigma0(a)
+
+ ldr r10,[sp,#8+4] @ b.hi
+ orr r5,r5,r11
+ ldr r11,[sp,#16+4] @ c.hi
+ and r5,r5,r12
+ and r12,r6,r10
+ orr r6,r6,r10
+ orr r5,r5,r9 @ Maj(a,b,c).lo
+ and r6,r6,r11
+ adds r5,r5,r3
+ orr r6,r6,r12 @ Maj(a,b,c).hi
+ sub sp,sp,#8
+ adc r6,r6,r4 @ h += T
+ tst r14,#1
+ add r14,r14,#8
+#if __ARM_ARCH>=7
+ ittt eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq r9,[sp,#184+0]
+ ldreq r10,[sp,#184+4]
+ beq .L16_79
+ bic r14,r14,#1
+
+ ldr r3,[sp,#8+0]
+ ldr r4,[sp,#8+4]
+ ldr r9, [r0,#0+LO]
+ ldr r10, [r0,#0+HI]
+ ldr r11, [r0,#8+LO]
+ ldr r12, [r0,#8+HI]
+ adds r9,r5,r9
+ str r9, [r0,#0+LO]
+ adc r10,r6,r10
+ str r10, [r0,#0+HI]
+ adds r11,r3,r11
+ str r11, [r0,#8+LO]
+ adc r12,r4,r12
+ str r12, [r0,#8+HI]
+
+ ldr r5,[sp,#16+0]
+ ldr r6,[sp,#16+4]
+ ldr r3,[sp,#24+0]
+ ldr r4,[sp,#24+4]
+ ldr r9, [r0,#16+LO]
+ ldr r10, [r0,#16+HI]
+ ldr r11, [r0,#24+LO]
+ ldr r12, [r0,#24+HI]
+ adds r9,r5,r9
+ str r9, [r0,#16+LO]
+ adc r10,r6,r10
+ str r10, [r0,#16+HI]
+ adds r11,r3,r11
+ str r11, [r0,#24+LO]
+ adc r12,r4,r12
+ str r12, [r0,#24+HI]
+
+ ldr r3,[sp,#40+0]
+ ldr r4,[sp,#40+4]
+ ldr r9, [r0,#32+LO]
+ ldr r10, [r0,#32+HI]
+ ldr r11, [r0,#40+LO]
+ ldr r12, [r0,#40+HI]
+ adds r7,r7,r9
+ str r7,[r0,#32+LO]
+ adc r8,r8,r10
+ str r8,[r0,#32+HI]
+ adds r11,r3,r11
+ str r11, [r0,#40+LO]
+ adc r12,r4,r12
+ str r12, [r0,#40+HI]
+
+ ldr r5,[sp,#48+0]
+ ldr r6,[sp,#48+4]
+ ldr r3,[sp,#56+0]
+ ldr r4,[sp,#56+4]
+ ldr r9, [r0,#48+LO]
+ ldr r10, [r0,#48+HI]
+ ldr r11, [r0,#56+LO]
+ ldr r12, [r0,#56+HI]
+ adds r9,r5,r9
+ str r9, [r0,#48+LO]
+ adc r10,r6,r10
+ str r10, [r0,#48+HI]
+ adds r11,r3,r11
+ str r11, [r0,#56+LO]
+ adc r12,r4,r12
+ str r12, [r0,#56+HI]
+
+ add sp,sp,#640
+ sub r14,r14,#640
+
+ teq r1,r2
+ bne .Loop
+
+ add sp,sp,#8*9 @ destroy frame
+#if __ARM_ARCH>=5
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+#else
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.globl sha512_block_data_order_neon
+.hidden sha512_block_data_order_neon
+.type sha512_block_data_order_neon,%function
+.align 4
+sha512_block_data_order_neon:
+ dmb @ errata #451034 on early Cortex A8
+ add r2,r1,r2,lsl#7 @ len to point at the end of inp
+ adr r3,K512
+ VFP_ABI_PUSH
+ vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context
+.Loop_neon:
+ vshr.u64 d24,d20,#14 @ 0
+#if 0<16
+ vld1.64 {d0},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d20,#18
+#if 0>0
+ vadd.i64 d16,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d20,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vmov d29,d20
+ vsli.64 d26,d20,#23
+#if 0<16 && defined(__ARMEL__)
+ vrev64.8 d0,d0
+#endif
+ veor d25,d24
+ vbsl d29,d21,d22 @ Ch(e,f,g)
+ vshr.u64 d24,d16,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d23
+ vshr.u64 d25,d16,#34
+ vsli.64 d24,d16,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d16,#39
+ vadd.i64 d28,d0
+ vsli.64 d25,d16,#30
+ veor d30,d16,d17
+ vsli.64 d26,d16,#25
+ veor d23,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d18,d17 @ Maj(a,b,c)
+ veor d23,d26 @ Sigma0(a)
+ vadd.i64 d19,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 1
+#if 1<16
+ vld1.64 {d1},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+#if 1>0
+ vadd.i64 d23,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vmov d29,d19
+ vsli.64 d26,d19,#23
+#if 1<16 && defined(__ARMEL__)
+ vrev64.8 d1,d1
+#endif
+ veor d25,d24
+ vbsl d29,d20,d21 @ Ch(e,f,g)
+ vshr.u64 d24,d23,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d22
+ vshr.u64 d25,d23,#34
+ vsli.64 d24,d23,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d23,#39
+ vadd.i64 d28,d1
+ vsli.64 d25,d23,#30
+ veor d30,d23,d16
+ vsli.64 d26,d23,#25
+ veor d22,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d17,d16 @ Maj(a,b,c)
+ veor d22,d26 @ Sigma0(a)
+ vadd.i64 d18,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d22,d30
+ vshr.u64 d24,d18,#14 @ 2
+#if 2<16
+ vld1.64 {d2},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d18,#18
+#if 2>0
+ vadd.i64 d22,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d18,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vmov d29,d18
+ vsli.64 d26,d18,#23
+#if 2<16 && defined(__ARMEL__)
+ vrev64.8 d2,d2
+#endif
+ veor d25,d24
+ vbsl d29,d19,d20 @ Ch(e,f,g)
+ vshr.u64 d24,d22,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d21
+ vshr.u64 d25,d22,#34
+ vsli.64 d24,d22,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d22,#39
+ vadd.i64 d28,d2
+ vsli.64 d25,d22,#30
+ veor d30,d22,d23
+ vsli.64 d26,d22,#25
+ veor d21,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d16,d23 @ Maj(a,b,c)
+ veor d21,d26 @ Sigma0(a)
+ vadd.i64 d17,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 3
+#if 3<16
+ vld1.64 {d3},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+#if 3>0
+ vadd.i64 d21,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vmov d29,d17
+ vsli.64 d26,d17,#23
+#if 3<16 && defined(__ARMEL__)
+ vrev64.8 d3,d3
+#endif
+ veor d25,d24
+ vbsl d29,d18,d19 @ Ch(e,f,g)
+ vshr.u64 d24,d21,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d20
+ vshr.u64 d25,d21,#34
+ vsli.64 d24,d21,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d21,#39
+ vadd.i64 d28,d3
+ vsli.64 d25,d21,#30
+ veor d30,d21,d22
+ vsli.64 d26,d21,#25
+ veor d20,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d23,d22 @ Maj(a,b,c)
+ veor d20,d26 @ Sigma0(a)
+ vadd.i64 d16,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d20,d30
+ vshr.u64 d24,d16,#14 @ 4
+#if 4<16
+ vld1.64 {d4},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d16,#18
+#if 4>0
+ vadd.i64 d20,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d16,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vmov d29,d16
+ vsli.64 d26,d16,#23
+#if 4<16 && defined(__ARMEL__)
+ vrev64.8 d4,d4
+#endif
+ veor d25,d24
+ vbsl d29,d17,d18 @ Ch(e,f,g)
+ vshr.u64 d24,d20,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d19
+ vshr.u64 d25,d20,#34
+ vsli.64 d24,d20,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d20,#39
+ vadd.i64 d28,d4
+ vsli.64 d25,d20,#30
+ veor d30,d20,d21
+ vsli.64 d26,d20,#25
+ veor d19,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d22,d21 @ Maj(a,b,c)
+ veor d19,d26 @ Sigma0(a)
+ vadd.i64 d23,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 5
+#if 5<16
+ vld1.64 {d5},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+#if 5>0
+ vadd.i64 d19,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vmov d29,d23
+ vsli.64 d26,d23,#23
+#if 5<16 && defined(__ARMEL__)
+ vrev64.8 d5,d5
+#endif
+ veor d25,d24
+ vbsl d29,d16,d17 @ Ch(e,f,g)
+ vshr.u64 d24,d19,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d18
+ vshr.u64 d25,d19,#34
+ vsli.64 d24,d19,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d19,#39
+ vadd.i64 d28,d5
+ vsli.64 d25,d19,#30
+ veor d30,d19,d20
+ vsli.64 d26,d19,#25
+ veor d18,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d21,d20 @ Maj(a,b,c)
+ veor d18,d26 @ Sigma0(a)
+ vadd.i64 d22,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d18,d30
+ vshr.u64 d24,d22,#14 @ 6
+#if 6<16
+ vld1.64 {d6},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d22,#18
+#if 6>0
+ vadd.i64 d18,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d22,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vmov d29,d22
+ vsli.64 d26,d22,#23
+#if 6<16 && defined(__ARMEL__)
+ vrev64.8 d6,d6
+#endif
+ veor d25,d24
+ vbsl d29,d23,d16 @ Ch(e,f,g)
+ vshr.u64 d24,d18,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d17
+ vshr.u64 d25,d18,#34
+ vsli.64 d24,d18,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d18,#39
+ vadd.i64 d28,d6
+ vsli.64 d25,d18,#30
+ veor d30,d18,d19
+ vsli.64 d26,d18,#25
+ veor d17,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d20,d19 @ Maj(a,b,c)
+ veor d17,d26 @ Sigma0(a)
+ vadd.i64 d21,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 7
+#if 7<16
+ vld1.64 {d7},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+#if 7>0
+ vadd.i64 d17,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vmov d29,d21
+ vsli.64 d26,d21,#23
+#if 7<16 && defined(__ARMEL__)
+ vrev64.8 d7,d7
+#endif
+ veor d25,d24
+ vbsl d29,d22,d23 @ Ch(e,f,g)
+ vshr.u64 d24,d17,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d16
+ vshr.u64 d25,d17,#34
+ vsli.64 d24,d17,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d17,#39
+ vadd.i64 d28,d7
+ vsli.64 d25,d17,#30
+ veor d30,d17,d18
+ vsli.64 d26,d17,#25
+ veor d16,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d19,d18 @ Maj(a,b,c)
+ veor d16,d26 @ Sigma0(a)
+ vadd.i64 d20,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d16,d30
+ vshr.u64 d24,d20,#14 @ 8
+#if 8<16
+ vld1.64 {d8},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d20,#18
+#if 8>0
+ vadd.i64 d16,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d20,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vmov d29,d20
+ vsli.64 d26,d20,#23
+#if 8<16 && defined(__ARMEL__)
+ vrev64.8 d8,d8
+#endif
+ veor d25,d24
+ vbsl d29,d21,d22 @ Ch(e,f,g)
+ vshr.u64 d24,d16,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d23
+ vshr.u64 d25,d16,#34
+ vsli.64 d24,d16,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d16,#39
+ vadd.i64 d28,d8
+ vsli.64 d25,d16,#30
+ veor d30,d16,d17
+ vsli.64 d26,d16,#25
+ veor d23,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d18,d17 @ Maj(a,b,c)
+ veor d23,d26 @ Sigma0(a)
+ vadd.i64 d19,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 9
+#if 9<16
+ vld1.64 {d9},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+#if 9>0
+ vadd.i64 d23,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vmov d29,d19
+ vsli.64 d26,d19,#23
+#if 9<16 && defined(__ARMEL__)
+ vrev64.8 d9,d9
+#endif
+ veor d25,d24
+ vbsl d29,d20,d21 @ Ch(e,f,g)
+ vshr.u64 d24,d23,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d22
+ vshr.u64 d25,d23,#34
+ vsli.64 d24,d23,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d23,#39
+ vadd.i64 d28,d9
+ vsli.64 d25,d23,#30
+ veor d30,d23,d16
+ vsli.64 d26,d23,#25
+ veor d22,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d17,d16 @ Maj(a,b,c)
+ veor d22,d26 @ Sigma0(a)
+ vadd.i64 d18,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d22,d30
+ vshr.u64 d24,d18,#14 @ 10
+#if 10<16
+ vld1.64 {d10},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d18,#18
+#if 10>0
+ vadd.i64 d22,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d18,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vmov d29,d18
+ vsli.64 d26,d18,#23
+#if 10<16 && defined(__ARMEL__)
+ vrev64.8 d10,d10
+#endif
+ veor d25,d24
+ vbsl d29,d19,d20 @ Ch(e,f,g)
+ vshr.u64 d24,d22,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d21
+ vshr.u64 d25,d22,#34
+ vsli.64 d24,d22,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d22,#39
+ vadd.i64 d28,d10
+ vsli.64 d25,d22,#30
+ veor d30,d22,d23
+ vsli.64 d26,d22,#25
+ veor d21,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d16,d23 @ Maj(a,b,c)
+ veor d21,d26 @ Sigma0(a)
+ vadd.i64 d17,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 11
+#if 11<16
+ vld1.64 {d11},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+#if 11>0
+ vadd.i64 d21,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vmov d29,d17
+ vsli.64 d26,d17,#23
+#if 11<16 && defined(__ARMEL__)
+ vrev64.8 d11,d11
+#endif
+ veor d25,d24
+ vbsl d29,d18,d19 @ Ch(e,f,g)
+ vshr.u64 d24,d21,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d20
+ vshr.u64 d25,d21,#34
+ vsli.64 d24,d21,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d21,#39
+ vadd.i64 d28,d11
+ vsli.64 d25,d21,#30
+ veor d30,d21,d22
+ vsli.64 d26,d21,#25
+ veor d20,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d23,d22 @ Maj(a,b,c)
+ veor d20,d26 @ Sigma0(a)
+ vadd.i64 d16,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d20,d30
+ vshr.u64 d24,d16,#14 @ 12
+#if 12<16
+ vld1.64 {d12},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d16,#18
+#if 12>0
+ vadd.i64 d20,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d16,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vmov d29,d16
+ vsli.64 d26,d16,#23
+#if 12<16 && defined(__ARMEL__)
+ vrev64.8 d12,d12
+#endif
+ veor d25,d24
+ vbsl d29,d17,d18 @ Ch(e,f,g)
+ vshr.u64 d24,d20,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d19
+ vshr.u64 d25,d20,#34
+ vsli.64 d24,d20,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d20,#39
+ vadd.i64 d28,d12
+ vsli.64 d25,d20,#30
+ veor d30,d20,d21
+ vsli.64 d26,d20,#25
+ veor d19,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d22,d21 @ Maj(a,b,c)
+ veor d19,d26 @ Sigma0(a)
+ vadd.i64 d23,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 13
+#if 13<16
+ vld1.64 {d13},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+#if 13>0
+ vadd.i64 d19,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vmov d29,d23
+ vsli.64 d26,d23,#23
+#if 13<16 && defined(__ARMEL__)
+ vrev64.8 d13,d13
+#endif
+ veor d25,d24
+ vbsl d29,d16,d17 @ Ch(e,f,g)
+ vshr.u64 d24,d19,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d18
+ vshr.u64 d25,d19,#34
+ vsli.64 d24,d19,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d19,#39
+ vadd.i64 d28,d13
+ vsli.64 d25,d19,#30
+ veor d30,d19,d20
+ vsli.64 d26,d19,#25
+ veor d18,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d21,d20 @ Maj(a,b,c)
+ veor d18,d26 @ Sigma0(a)
+ vadd.i64 d22,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d18,d30
+ vshr.u64 d24,d22,#14 @ 14
+#if 14<16
+ vld1.64 {d14},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d22,#18
+#if 14>0
+ vadd.i64 d18,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d22,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vmov d29,d22
+ vsli.64 d26,d22,#23
+#if 14<16 && defined(__ARMEL__)
+ vrev64.8 d14,d14
+#endif
+ veor d25,d24
+ vbsl d29,d23,d16 @ Ch(e,f,g)
+ vshr.u64 d24,d18,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d17
+ vshr.u64 d25,d18,#34
+ vsli.64 d24,d18,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d18,#39
+ vadd.i64 d28,d14
+ vsli.64 d25,d18,#30
+ veor d30,d18,d19
+ vsli.64 d26,d18,#25
+ veor d17,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d20,d19 @ Maj(a,b,c)
+ veor d17,d26 @ Sigma0(a)
+ vadd.i64 d21,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 15
+#if 15<16
+ vld1.64 {d15},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+#if 15>0
+ vadd.i64 d17,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vmov d29,d21
+ vsli.64 d26,d21,#23
+#if 15<16 && defined(__ARMEL__)
+ vrev64.8 d15,d15
+#endif
+ veor d25,d24
+ vbsl d29,d22,d23 @ Ch(e,f,g)
+ vshr.u64 d24,d17,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d16
+ vshr.u64 d25,d17,#34
+ vsli.64 d24,d17,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d17,#39
+ vadd.i64 d28,d15
+ vsli.64 d25,d17,#30
+ veor d30,d17,d18
+ vsli.64 d26,d17,#25
+ veor d16,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d19,d18 @ Maj(a,b,c)
+ veor d16,d26 @ Sigma0(a)
+ vadd.i64 d20,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d16,d30
+ mov r12,#4
+.L16_79_neon:
+ subs r12,#1
+ vshr.u64 q12,q7,#19
+ vshr.u64 q13,q7,#61
+ vadd.i64 d16,d30 @ h+=Maj from the past
+ vshr.u64 q15,q7,#6
+ vsli.64 q12,q7,#45
+ vext.8 q14,q0,q1,#8 @ X[i+1]
+ vsli.64 q13,q7,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q0,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q4,q5,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d20,#14 @ from NEON_00_15
+ vadd.i64 q0,q14
+ vshr.u64 d25,d20,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d20,#41 @ from NEON_00_15
+ vadd.i64 q0,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vmov d29,d20
+ vsli.64 d26,d20,#23
+#if 16<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d21,d22 @ Ch(e,f,g)
+ vshr.u64 d24,d16,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d23
+ vshr.u64 d25,d16,#34
+ vsli.64 d24,d16,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d16,#39
+ vadd.i64 d28,d0
+ vsli.64 d25,d16,#30
+ veor d30,d16,d17
+ vsli.64 d26,d16,#25
+ veor d23,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d18,d17 @ Maj(a,b,c)
+ veor d23,d26 @ Sigma0(a)
+ vadd.i64 d19,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 17
+#if 17<16
+ vld1.64 {d1},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+#if 17>0
+ vadd.i64 d23,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vmov d29,d19
+ vsli.64 d26,d19,#23
+#if 17<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d20,d21 @ Ch(e,f,g)
+ vshr.u64 d24,d23,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d22
+ vshr.u64 d25,d23,#34
+ vsli.64 d24,d23,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d23,#39
+ vadd.i64 d28,d1
+ vsli.64 d25,d23,#30
+ veor d30,d23,d16
+ vsli.64 d26,d23,#25
+ veor d22,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d17,d16 @ Maj(a,b,c)
+ veor d22,d26 @ Sigma0(a)
+ vadd.i64 d18,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d22,d30
+ vshr.u64 q12,q0,#19
+ vshr.u64 q13,q0,#61
+ vadd.i64 d22,d30 @ h+=Maj from the past
+ vshr.u64 q15,q0,#6
+ vsli.64 q12,q0,#45
+ vext.8 q14,q1,q2,#8 @ X[i+1]
+ vsli.64 q13,q0,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q1,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q5,q6,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d18,#14 @ from NEON_00_15
+ vadd.i64 q1,q14
+ vshr.u64 d25,d18,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d18,#41 @ from NEON_00_15
+ vadd.i64 q1,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vmov d29,d18
+ vsli.64 d26,d18,#23
+#if 18<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d19,d20 @ Ch(e,f,g)
+ vshr.u64 d24,d22,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d21
+ vshr.u64 d25,d22,#34
+ vsli.64 d24,d22,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d22,#39
+ vadd.i64 d28,d2
+ vsli.64 d25,d22,#30
+ veor d30,d22,d23
+ vsli.64 d26,d22,#25
+ veor d21,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d16,d23 @ Maj(a,b,c)
+ veor d21,d26 @ Sigma0(a)
+ vadd.i64 d17,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 19
+#if 19<16
+ vld1.64 {d3},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+#if 19>0
+ vadd.i64 d21,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vmov d29,d17
+ vsli.64 d26,d17,#23
+#if 19<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d18,d19 @ Ch(e,f,g)
+ vshr.u64 d24,d21,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d20
+ vshr.u64 d25,d21,#34
+ vsli.64 d24,d21,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d21,#39
+ vadd.i64 d28,d3
+ vsli.64 d25,d21,#30
+ veor d30,d21,d22
+ vsli.64 d26,d21,#25
+ veor d20,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d23,d22 @ Maj(a,b,c)
+ veor d20,d26 @ Sigma0(a)
+ vadd.i64 d16,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d20,d30
+ vshr.u64 q12,q1,#19
+ vshr.u64 q13,q1,#61
+ vadd.i64 d20,d30 @ h+=Maj from the past
+ vshr.u64 q15,q1,#6
+ vsli.64 q12,q1,#45
+ vext.8 q14,q2,q3,#8 @ X[i+1]
+ vsli.64 q13,q1,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q2,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q6,q7,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d16,#14 @ from NEON_00_15
+ vadd.i64 q2,q14
+ vshr.u64 d25,d16,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d16,#41 @ from NEON_00_15
+ vadd.i64 q2,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vmov d29,d16
+ vsli.64 d26,d16,#23
+#if 20<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d17,d18 @ Ch(e,f,g)
+ vshr.u64 d24,d20,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d19
+ vshr.u64 d25,d20,#34
+ vsli.64 d24,d20,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d20,#39
+ vadd.i64 d28,d4
+ vsli.64 d25,d20,#30
+ veor d30,d20,d21
+ vsli.64 d26,d20,#25
+ veor d19,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d22,d21 @ Maj(a,b,c)
+ veor d19,d26 @ Sigma0(a)
+ vadd.i64 d23,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 21
+#if 21<16
+ vld1.64 {d5},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+#if 21>0
+ vadd.i64 d19,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vmov d29,d23
+ vsli.64 d26,d23,#23
+#if 21<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d16,d17 @ Ch(e,f,g)
+ vshr.u64 d24,d19,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d18
+ vshr.u64 d25,d19,#34
+ vsli.64 d24,d19,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d19,#39
+ vadd.i64 d28,d5
+ vsli.64 d25,d19,#30
+ veor d30,d19,d20
+ vsli.64 d26,d19,#25
+ veor d18,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d21,d20 @ Maj(a,b,c)
+ veor d18,d26 @ Sigma0(a)
+ vadd.i64 d22,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d18,d30
+ vshr.u64 q12,q2,#19
+ vshr.u64 q13,q2,#61
+ vadd.i64 d18,d30 @ h+=Maj from the past
+ vshr.u64 q15,q2,#6
+ vsli.64 q12,q2,#45
+ vext.8 q14,q3,q4,#8 @ X[i+1]
+ vsli.64 q13,q2,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q3,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q7,q0,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d22,#14 @ from NEON_00_15
+ vadd.i64 q3,q14
+ vshr.u64 d25,d22,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d22,#41 @ from NEON_00_15
+ vadd.i64 q3,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vmov d29,d22
+ vsli.64 d26,d22,#23
+#if 22<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d23,d16 @ Ch(e,f,g)
+ vshr.u64 d24,d18,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d17
+ vshr.u64 d25,d18,#34
+ vsli.64 d24,d18,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d18,#39
+ vadd.i64 d28,d6
+ vsli.64 d25,d18,#30
+ veor d30,d18,d19
+ vsli.64 d26,d18,#25
+ veor d17,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d20,d19 @ Maj(a,b,c)
+ veor d17,d26 @ Sigma0(a)
+ vadd.i64 d21,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 23
+#if 23<16
+ vld1.64 {d7},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+#if 23>0
+ vadd.i64 d17,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vmov d29,d21
+ vsli.64 d26,d21,#23
+#if 23<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d22,d23 @ Ch(e,f,g)
+ vshr.u64 d24,d17,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d16
+ vshr.u64 d25,d17,#34
+ vsli.64 d24,d17,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d17,#39
+ vadd.i64 d28,d7
+ vsli.64 d25,d17,#30
+ veor d30,d17,d18
+ vsli.64 d26,d17,#25
+ veor d16,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d19,d18 @ Maj(a,b,c)
+ veor d16,d26 @ Sigma0(a)
+ vadd.i64 d20,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d16,d30
+ vshr.u64 q12,q3,#19
+ vshr.u64 q13,q3,#61
+ vadd.i64 d16,d30 @ h+=Maj from the past
+ vshr.u64 q15,q3,#6
+ vsli.64 q12,q3,#45
+ vext.8 q14,q4,q5,#8 @ X[i+1]
+ vsli.64 q13,q3,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q4,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q0,q1,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d20,#14 @ from NEON_00_15
+ vadd.i64 q4,q14
+ vshr.u64 d25,d20,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d20,#41 @ from NEON_00_15
+ vadd.i64 q4,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vmov d29,d20
+ vsli.64 d26,d20,#23
+#if 24<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d21,d22 @ Ch(e,f,g)
+ vshr.u64 d24,d16,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d23
+ vshr.u64 d25,d16,#34
+ vsli.64 d24,d16,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d16,#39
+ vadd.i64 d28,d8
+ vsli.64 d25,d16,#30
+ veor d30,d16,d17
+ vsli.64 d26,d16,#25
+ veor d23,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d18,d17 @ Maj(a,b,c)
+ veor d23,d26 @ Sigma0(a)
+ vadd.i64 d19,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 25
+#if 25<16
+ vld1.64 {d9},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+#if 25>0
+ vadd.i64 d23,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vmov d29,d19
+ vsli.64 d26,d19,#23
+#if 25<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d20,d21 @ Ch(e,f,g)
+ vshr.u64 d24,d23,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d22
+ vshr.u64 d25,d23,#34
+ vsli.64 d24,d23,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d23,#39
+ vadd.i64 d28,d9
+ vsli.64 d25,d23,#30
+ veor d30,d23,d16
+ vsli.64 d26,d23,#25
+ veor d22,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d17,d16 @ Maj(a,b,c)
+ veor d22,d26 @ Sigma0(a)
+ vadd.i64 d18,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d22,d30
+ vshr.u64 q12,q4,#19
+ vshr.u64 q13,q4,#61
+ vadd.i64 d22,d30 @ h+=Maj from the past
+ vshr.u64 q15,q4,#6
+ vsli.64 q12,q4,#45
+ vext.8 q14,q5,q6,#8 @ X[i+1]
+ vsli.64 q13,q4,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q5,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q1,q2,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d18,#14 @ from NEON_00_15
+ vadd.i64 q5,q14
+ vshr.u64 d25,d18,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d18,#41 @ from NEON_00_15
+ vadd.i64 q5,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vmov d29,d18
+ vsli.64 d26,d18,#23
+#if 26<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d19,d20 @ Ch(e,f,g)
+ vshr.u64 d24,d22,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d21
+ vshr.u64 d25,d22,#34
+ vsli.64 d24,d22,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d22,#39
+ vadd.i64 d28,d10
+ vsli.64 d25,d22,#30
+ veor d30,d22,d23
+ vsli.64 d26,d22,#25
+ veor d21,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d16,d23 @ Maj(a,b,c)
+ veor d21,d26 @ Sigma0(a)
+ vadd.i64 d17,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 27
+#if 27<16
+ vld1.64 {d11},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+#if 27>0
+ vadd.i64 d21,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vmov d29,d17
+ vsli.64 d26,d17,#23
+#if 27<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d18,d19 @ Ch(e,f,g)
+ vshr.u64 d24,d21,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d20
+ vshr.u64 d25,d21,#34
+ vsli.64 d24,d21,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d21,#39
+ vadd.i64 d28,d11
+ vsli.64 d25,d21,#30
+ veor d30,d21,d22
+ vsli.64 d26,d21,#25
+ veor d20,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d23,d22 @ Maj(a,b,c)
+ veor d20,d26 @ Sigma0(a)
+ vadd.i64 d16,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d20,d30
+ vshr.u64 q12,q5,#19
+ vshr.u64 q13,q5,#61
+ vadd.i64 d20,d30 @ h+=Maj from the past
+ vshr.u64 q15,q5,#6
+ vsli.64 q12,q5,#45
+ vext.8 q14,q6,q7,#8 @ X[i+1]
+ vsli.64 q13,q5,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q6,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q2,q3,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d16,#14 @ from NEON_00_15
+ vadd.i64 q6,q14
+ vshr.u64 d25,d16,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d16,#41 @ from NEON_00_15
+ vadd.i64 q6,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vmov d29,d16
+ vsli.64 d26,d16,#23
+#if 28<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d17,d18 @ Ch(e,f,g)
+ vshr.u64 d24,d20,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d19
+ vshr.u64 d25,d20,#34
+ vsli.64 d24,d20,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d20,#39
+ vadd.i64 d28,d12
+ vsli.64 d25,d20,#30
+ veor d30,d20,d21
+ vsli.64 d26,d20,#25
+ veor d19,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d22,d21 @ Maj(a,b,c)
+ veor d19,d26 @ Sigma0(a)
+ vadd.i64 d23,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 29
+#if 29<16
+ vld1.64 {d13},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+#if 29>0
+ vadd.i64 d19,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vmov d29,d23
+ vsli.64 d26,d23,#23
+#if 29<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d16,d17 @ Ch(e,f,g)
+ vshr.u64 d24,d19,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d18
+ vshr.u64 d25,d19,#34
+ vsli.64 d24,d19,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d19,#39
+ vadd.i64 d28,d13
+ vsli.64 d25,d19,#30
+ veor d30,d19,d20
+ vsli.64 d26,d19,#25
+ veor d18,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d21,d20 @ Maj(a,b,c)
+ veor d18,d26 @ Sigma0(a)
+ vadd.i64 d22,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d18,d30
+ vshr.u64 q12,q6,#19
+ vshr.u64 q13,q6,#61
+ vadd.i64 d18,d30 @ h+=Maj from the past
+ vshr.u64 q15,q6,#6
+ vsli.64 q12,q6,#45
+ vext.8 q14,q7,q0,#8 @ X[i+1]
+ vsli.64 q13,q6,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q7,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q3,q4,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d22,#14 @ from NEON_00_15
+ vadd.i64 q7,q14
+ vshr.u64 d25,d22,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d22,#41 @ from NEON_00_15
+ vadd.i64 q7,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vmov d29,d22
+ vsli.64 d26,d22,#23
+#if 30<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d23,d16 @ Ch(e,f,g)
+ vshr.u64 d24,d18,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d17
+ vshr.u64 d25,d18,#34
+ vsli.64 d24,d18,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d18,#39
+ vadd.i64 d28,d14
+ vsli.64 d25,d18,#30
+ veor d30,d18,d19
+ vsli.64 d26,d18,#25
+ veor d17,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d20,d19 @ Maj(a,b,c)
+ veor d17,d26 @ Sigma0(a)
+ vadd.i64 d21,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 31
+#if 31<16
+ vld1.64 {d15},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+#if 31>0
+ vadd.i64 d17,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vmov d29,d21
+ vsli.64 d26,d21,#23
+#if 31<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d22,d23 @ Ch(e,f,g)
+ vshr.u64 d24,d17,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d16
+ vshr.u64 d25,d17,#34
+ vsli.64 d24,d17,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d17,#39
+ vadd.i64 d28,d15
+ vsli.64 d25,d17,#30
+ veor d30,d17,d18
+ vsli.64 d26,d17,#25
+ veor d16,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d19,d18 @ Maj(a,b,c)
+ veor d16,d26 @ Sigma0(a)
+ vadd.i64 d20,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d16,d30
+ bne .L16_79_neon
+
+ vadd.i64 d16,d30 @ h+=Maj from the past
+ vldmia r0,{d24,d25,d26,d27,d28,d29,d30,d31} @ load context to temp
+ vadd.i64 q8,q12 @ vectorized accumulate
+ vadd.i64 q9,q13
+ vadd.i64 q10,q14
+ vadd.i64 q11,q15
+ vstmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ save context
+ teq r1,r2
+ sub r3,#640 @ rewind K512
+ bne .Loop_neon
+
+ VFP_ABI_POP
+ bx lr @ .word 0xe12fff1e
+.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
+#endif
+.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/bcm/sha512-armv8-apple.S b/gen/bcm/sha512-armv8-apple.S
new file mode 100644
index 0000000..8c98e06
--- /dev/null
+++ b/gen/bcm/sha512-armv8-apple.S
@@ -0,0 +1,1596 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+// SHA256-hw SHA256(*) SHA512
+// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
+// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
+// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
+// Denver 2.01 10.5 (+26%) 6.70 (+8%)
+// X-Gene 20.0 (+100%) 12.8 (+300%(***))
+// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
+// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
+//
+// (*) Software SHA256 results are of lesser relevance, presented
+// mostly for informational purposes.
+// (**) The result is a trade-off: it's possible to improve it by
+// 10% (or by 1 cycle per round), but at the cost of 20% loss
+// on Cortex-A53 (or by 4 cycles per round).
+// (***) Super-impressive coefficients over gcc-generated code are
+// indication of some compiler "pathology", most notably code
+// generated with -mgeneral-regs-only is significantly faster
+// and the gap is only 40-90%.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+#endif
+
+.text
+
+.globl _sha512_block_data_order_nohw
+.private_extern _sha512_block_data_order_nohw
+
+.align 6
+_sha512_block_data_order_nohw:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#4*8
+
+ ldp x20,x21,[x0] // load context
+ ldp x22,x23,[x0,#2*8]
+ ldp x24,x25,[x0,#4*8]
+ add x2,x1,x2,lsl#7 // end of input
+ ldp x26,x27,[x0,#6*8]
+ adrp x30,LK512@PAGE
+ add x30,x30,LK512@PAGEOFF
+ stp x0,x2,[x29,#96]
+
+Loop:
+ ldp x3,x4,[x1],#2*8
+ ldr x19,[x30],#8 // *K++
+ eor x28,x21,x22 // magic seed
+ str x1,[x29,#112]
+#ifndef __AARCH64EB__
+ rev x3,x3 // 0
+#endif
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ eor x6,x24,x24,ror#23
+ and x17,x25,x24
+ bic x19,x26,x24
+ add x27,x27,x3 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x6,ror#18 // Sigma1(e)
+ ror x6,x20,#28
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ eor x17,x20,x20,ror#5
+ add x27,x27,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x23,x23,x27 // d+=h
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x6,x17,ror#34 // Sigma0(a)
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x27,x27,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x4,x4 // 1
+#endif
+ ldp x5,x6,[x1],#2*8
+ add x27,x27,x17 // h+=Sigma0(a)
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ eor x7,x23,x23,ror#23
+ and x17,x24,x23
+ bic x28,x25,x23
+ add x26,x26,x4 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x7,ror#18 // Sigma1(e)
+ ror x7,x27,#28
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ eor x17,x27,x27,ror#5
+ add x26,x26,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x22,x22,x26 // d+=h
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x7,x17,ror#34 // Sigma0(a)
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x26,x26,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x5,x5 // 2
+#endif
+ add x26,x26,x17 // h+=Sigma0(a)
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ eor x8,x22,x22,ror#23
+ and x17,x23,x22
+ bic x19,x24,x22
+ add x25,x25,x5 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x8,ror#18 // Sigma1(e)
+ ror x8,x26,#28
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ eor x17,x26,x26,ror#5
+ add x25,x25,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x21,x21,x25 // d+=h
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x8,x17,ror#34 // Sigma0(a)
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x25,x25,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x6,x6 // 3
+#endif
+ ldp x7,x8,[x1],#2*8
+ add x25,x25,x17 // h+=Sigma0(a)
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ eor x9,x21,x21,ror#23
+ and x17,x22,x21
+ bic x28,x23,x21
+ add x24,x24,x6 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x9,ror#18 // Sigma1(e)
+ ror x9,x25,#28
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ eor x17,x25,x25,ror#5
+ add x24,x24,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x20,x20,x24 // d+=h
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x9,x17,ror#34 // Sigma0(a)
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x24,x24,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x7,x7 // 4
+#endif
+ add x24,x24,x17 // h+=Sigma0(a)
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ eor x10,x20,x20,ror#23
+ and x17,x21,x20
+ bic x19,x22,x20
+ add x23,x23,x7 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x10,ror#18 // Sigma1(e)
+ ror x10,x24,#28
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ eor x17,x24,x24,ror#5
+ add x23,x23,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x27,x27,x23 // d+=h
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x10,x17,ror#34 // Sigma0(a)
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x23,x23,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x8,x8 // 5
+#endif
+ ldp x9,x10,[x1],#2*8
+ add x23,x23,x17 // h+=Sigma0(a)
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ eor x11,x27,x27,ror#23
+ and x17,x20,x27
+ bic x28,x21,x27
+ add x22,x22,x8 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x11,ror#18 // Sigma1(e)
+ ror x11,x23,#28
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ eor x17,x23,x23,ror#5
+ add x22,x22,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x26,x26,x22 // d+=h
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x11,x17,ror#34 // Sigma0(a)
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x22,x22,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x9,x9 // 6
+#endif
+ add x22,x22,x17 // h+=Sigma0(a)
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ eor x12,x26,x26,ror#23
+ and x17,x27,x26
+ bic x19,x20,x26
+ add x21,x21,x9 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x12,ror#18 // Sigma1(e)
+ ror x12,x22,#28
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ eor x17,x22,x22,ror#5
+ add x21,x21,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x25,x25,x21 // d+=h
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x12,x17,ror#34 // Sigma0(a)
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x21,x21,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x10,x10 // 7
+#endif
+ ldp x11,x12,[x1],#2*8
+ add x21,x21,x17 // h+=Sigma0(a)
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ eor x13,x25,x25,ror#23
+ and x17,x26,x25
+ bic x28,x27,x25
+ add x20,x20,x10 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x13,ror#18 // Sigma1(e)
+ ror x13,x21,#28
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ eor x17,x21,x21,ror#5
+ add x20,x20,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x24,x24,x20 // d+=h
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x13,x17,ror#34 // Sigma0(a)
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x20,x20,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x11,x11 // 8
+#endif
+ add x20,x20,x17 // h+=Sigma0(a)
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ eor x14,x24,x24,ror#23
+ and x17,x25,x24
+ bic x19,x26,x24
+ add x27,x27,x11 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x14,ror#18 // Sigma1(e)
+ ror x14,x20,#28
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ eor x17,x20,x20,ror#5
+ add x27,x27,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x23,x23,x27 // d+=h
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x14,x17,ror#34 // Sigma0(a)
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x27,x27,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x12,x12 // 9
+#endif
+ ldp x13,x14,[x1],#2*8
+ add x27,x27,x17 // h+=Sigma0(a)
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ eor x15,x23,x23,ror#23
+ and x17,x24,x23
+ bic x28,x25,x23
+ add x26,x26,x12 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x15,ror#18 // Sigma1(e)
+ ror x15,x27,#28
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ eor x17,x27,x27,ror#5
+ add x26,x26,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x22,x22,x26 // d+=h
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x15,x17,ror#34 // Sigma0(a)
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x26,x26,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x13,x13 // 10
+#endif
+ add x26,x26,x17 // h+=Sigma0(a)
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ eor x0,x22,x22,ror#23
+ and x17,x23,x22
+ bic x19,x24,x22
+ add x25,x25,x13 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x0,ror#18 // Sigma1(e)
+ ror x0,x26,#28
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ eor x17,x26,x26,ror#5
+ add x25,x25,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x21,x21,x25 // d+=h
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x0,x17,ror#34 // Sigma0(a)
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x25,x25,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x14,x14 // 11
+#endif
+ ldp x15,x0,[x1],#2*8
+ add x25,x25,x17 // h+=Sigma0(a)
+ str x6,[sp,#24]
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ eor x6,x21,x21,ror#23
+ and x17,x22,x21
+ bic x28,x23,x21
+ add x24,x24,x14 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x6,ror#18 // Sigma1(e)
+ ror x6,x25,#28
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ eor x17,x25,x25,ror#5
+ add x24,x24,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x20,x20,x24 // d+=h
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x6,x17,ror#34 // Sigma0(a)
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x24,x24,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x15,x15 // 12
+#endif
+ add x24,x24,x17 // h+=Sigma0(a)
+ str x7,[sp,#0]
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ eor x7,x20,x20,ror#23
+ and x17,x21,x20
+ bic x19,x22,x20
+ add x23,x23,x15 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x7,ror#18 // Sigma1(e)
+ ror x7,x24,#28
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ eor x17,x24,x24,ror#5
+ add x23,x23,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x27,x27,x23 // d+=h
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x7,x17,ror#34 // Sigma0(a)
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x23,x23,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x0,x0 // 13
+#endif
+ ldp x1,x2,[x1]
+ add x23,x23,x17 // h+=Sigma0(a)
+ str x8,[sp,#8]
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ eor x8,x27,x27,ror#23
+ and x17,x20,x27
+ bic x28,x21,x27
+ add x22,x22,x0 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x8,ror#18 // Sigma1(e)
+ ror x8,x23,#28
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ eor x17,x23,x23,ror#5
+ add x22,x22,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x26,x26,x22 // d+=h
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x8,x17,ror#34 // Sigma0(a)
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x22,x22,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x1,x1 // 14
+#endif
+ ldr x6,[sp,#24]
+ add x22,x22,x17 // h+=Sigma0(a)
+ str x9,[sp,#16]
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ eor x9,x26,x26,ror#23
+ and x17,x27,x26
+ bic x19,x20,x26
+ add x21,x21,x1 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x9,ror#18 // Sigma1(e)
+ ror x9,x22,#28
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ eor x17,x22,x22,ror#5
+ add x21,x21,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x25,x25,x21 // d+=h
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x9,x17,ror#34 // Sigma0(a)
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x21,x21,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x2,x2 // 15
+#endif
+ ldr x7,[sp,#0]
+ add x21,x21,x17 // h+=Sigma0(a)
+ str x10,[sp,#24]
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ ror x9,x4,#1
+ and x17,x26,x25
+ ror x8,x1,#19
+ bic x28,x27,x25
+ ror x10,x21,#28
+ add x20,x20,x2 // h+=X[i]
+ eor x16,x16,x25,ror#18
+ eor x9,x9,x4,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x25,ror#41 // Sigma1(e)
+ eor x10,x10,x21,ror#34
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x8,x8,x1,ror#61
+ eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
+ add x20,x20,x16 // h+=Sigma1(e)
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x10,x21,ror#39 // Sigma0(a)
+ eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
+ add x3,x3,x12
+ add x24,x24,x20 // d+=h
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x3,x3,x9
+ add x20,x20,x17 // h+=Sigma0(a)
+ add x3,x3,x8
+Loop_16_xx:
+ ldr x8,[sp,#8]
+ str x11,[sp,#0]
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ ror x10,x5,#1
+ and x17,x25,x24
+ ror x9,x2,#19
+ bic x19,x26,x24
+ ror x11,x20,#28
+ add x27,x27,x3 // h+=X[i]
+ eor x16,x16,x24,ror#18
+ eor x10,x10,x5,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x24,ror#41 // Sigma1(e)
+ eor x11,x11,x20,ror#34
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x9,x9,x2,ror#61
+ eor x10,x10,x5,lsr#7 // sigma0(X[i+1])
+ add x27,x27,x16 // h+=Sigma1(e)
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x11,x20,ror#39 // Sigma0(a)
+ eor x9,x9,x2,lsr#6 // sigma1(X[i+14])
+ add x4,x4,x13
+ add x23,x23,x27 // d+=h
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x4,x4,x10
+ add x27,x27,x17 // h+=Sigma0(a)
+ add x4,x4,x9
+ ldr x9,[sp,#16]
+ str x12,[sp,#8]
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ ror x11,x6,#1
+ and x17,x24,x23
+ ror x10,x3,#19
+ bic x28,x25,x23
+ ror x12,x27,#28
+ add x26,x26,x4 // h+=X[i]
+ eor x16,x16,x23,ror#18
+ eor x11,x11,x6,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x23,ror#41 // Sigma1(e)
+ eor x12,x12,x27,ror#34
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x10,x10,x3,ror#61
+ eor x11,x11,x6,lsr#7 // sigma0(X[i+1])
+ add x26,x26,x16 // h+=Sigma1(e)
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x12,x27,ror#39 // Sigma0(a)
+ eor x10,x10,x3,lsr#6 // sigma1(X[i+14])
+ add x5,x5,x14
+ add x22,x22,x26 // d+=h
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x5,x5,x11
+ add x26,x26,x17 // h+=Sigma0(a)
+ add x5,x5,x10
+ ldr x10,[sp,#24]
+ str x13,[sp,#16]
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ ror x12,x7,#1
+ and x17,x23,x22
+ ror x11,x4,#19
+ bic x19,x24,x22
+ ror x13,x26,#28
+ add x25,x25,x5 // h+=X[i]
+ eor x16,x16,x22,ror#18
+ eor x12,x12,x7,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x22,ror#41 // Sigma1(e)
+ eor x13,x13,x26,ror#34
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x11,x11,x4,ror#61
+ eor x12,x12,x7,lsr#7 // sigma0(X[i+1])
+ add x25,x25,x16 // h+=Sigma1(e)
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x13,x26,ror#39 // Sigma0(a)
+ eor x11,x11,x4,lsr#6 // sigma1(X[i+14])
+ add x6,x6,x15
+ add x21,x21,x25 // d+=h
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x6,x6,x12
+ add x25,x25,x17 // h+=Sigma0(a)
+ add x6,x6,x11
+ ldr x11,[sp,#0]
+ str x14,[sp,#24]
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ ror x13,x8,#1
+ and x17,x22,x21
+ ror x12,x5,#19
+ bic x28,x23,x21
+ ror x14,x25,#28
+ add x24,x24,x6 // h+=X[i]
+ eor x16,x16,x21,ror#18
+ eor x13,x13,x8,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x21,ror#41 // Sigma1(e)
+ eor x14,x14,x25,ror#34
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x12,x12,x5,ror#61
+ eor x13,x13,x8,lsr#7 // sigma0(X[i+1])
+ add x24,x24,x16 // h+=Sigma1(e)
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x14,x25,ror#39 // Sigma0(a)
+ eor x12,x12,x5,lsr#6 // sigma1(X[i+14])
+ add x7,x7,x0
+ add x20,x20,x24 // d+=h
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x7,x7,x13
+ add x24,x24,x17 // h+=Sigma0(a)
+ add x7,x7,x12
+ ldr x12,[sp,#8]
+ str x15,[sp,#0]
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ ror x14,x9,#1
+ and x17,x21,x20
+ ror x13,x6,#19
+ bic x19,x22,x20
+ ror x15,x24,#28
+ add x23,x23,x7 // h+=X[i]
+ eor x16,x16,x20,ror#18
+ eor x14,x14,x9,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x20,ror#41 // Sigma1(e)
+ eor x15,x15,x24,ror#34
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x13,x13,x6,ror#61
+ eor x14,x14,x9,lsr#7 // sigma0(X[i+1])
+ add x23,x23,x16 // h+=Sigma1(e)
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x15,x24,ror#39 // Sigma0(a)
+ eor x13,x13,x6,lsr#6 // sigma1(X[i+14])
+ add x8,x8,x1
+ add x27,x27,x23 // d+=h
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x8,x8,x14
+ add x23,x23,x17 // h+=Sigma0(a)
+ add x8,x8,x13
+ ldr x13,[sp,#16]
+ str x0,[sp,#8]
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ ror x15,x10,#1
+ and x17,x20,x27
+ ror x14,x7,#19
+ bic x28,x21,x27
+ ror x0,x23,#28
+ add x22,x22,x8 // h+=X[i]
+ eor x16,x16,x27,ror#18
+ eor x15,x15,x10,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x27,ror#41 // Sigma1(e)
+ eor x0,x0,x23,ror#34
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x14,x14,x7,ror#61
+ eor x15,x15,x10,lsr#7 // sigma0(X[i+1])
+ add x22,x22,x16 // h+=Sigma1(e)
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x0,x23,ror#39 // Sigma0(a)
+ eor x14,x14,x7,lsr#6 // sigma1(X[i+14])
+ add x9,x9,x2
+ add x26,x26,x22 // d+=h
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x9,x9,x15
+ add x22,x22,x17 // h+=Sigma0(a)
+ add x9,x9,x14
+ ldr x14,[sp,#24]
+ str x1,[sp,#16]
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ ror x0,x11,#1
+ and x17,x27,x26
+ ror x15,x8,#19
+ bic x19,x20,x26
+ ror x1,x22,#28
+ add x21,x21,x9 // h+=X[i]
+ eor x16,x16,x26,ror#18
+ eor x0,x0,x11,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x26,ror#41 // Sigma1(e)
+ eor x1,x1,x22,ror#34
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x15,x15,x8,ror#61
+ eor x0,x0,x11,lsr#7 // sigma0(X[i+1])
+ add x21,x21,x16 // h+=Sigma1(e)
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x1,x22,ror#39 // Sigma0(a)
+ eor x15,x15,x8,lsr#6 // sigma1(X[i+14])
+ add x10,x10,x3
+ add x25,x25,x21 // d+=h
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x10,x10,x0
+ add x21,x21,x17 // h+=Sigma0(a)
+ add x10,x10,x15
+ ldr x15,[sp,#0]
+ str x2,[sp,#24]
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ ror x1,x12,#1
+ and x17,x26,x25
+ ror x0,x9,#19
+ bic x28,x27,x25
+ ror x2,x21,#28
+ add x20,x20,x10 // h+=X[i]
+ eor x16,x16,x25,ror#18
+ eor x1,x1,x12,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x25,ror#41 // Sigma1(e)
+ eor x2,x2,x21,ror#34
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x0,x0,x9,ror#61
+ eor x1,x1,x12,lsr#7 // sigma0(X[i+1])
+ add x20,x20,x16 // h+=Sigma1(e)
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x2,x21,ror#39 // Sigma0(a)
+ eor x0,x0,x9,lsr#6 // sigma1(X[i+14])
+ add x11,x11,x4
+ add x24,x24,x20 // d+=h
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x11,x11,x1
+ add x20,x20,x17 // h+=Sigma0(a)
+ add x11,x11,x0
+ ldr x0,[sp,#8]
+ str x3,[sp,#0]
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ ror x2,x13,#1
+ and x17,x25,x24
+ ror x1,x10,#19
+ bic x19,x26,x24
+ ror x3,x20,#28
+ add x27,x27,x11 // h+=X[i]
+ eor x16,x16,x24,ror#18
+ eor x2,x2,x13,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x24,ror#41 // Sigma1(e)
+ eor x3,x3,x20,ror#34
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x1,x1,x10,ror#61
+ eor x2,x2,x13,lsr#7 // sigma0(X[i+1])
+ add x27,x27,x16 // h+=Sigma1(e)
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x3,x20,ror#39 // Sigma0(a)
+ eor x1,x1,x10,lsr#6 // sigma1(X[i+14])
+ add x12,x12,x5
+ add x23,x23,x27 // d+=h
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x12,x12,x2
+ add x27,x27,x17 // h+=Sigma0(a)
+ add x12,x12,x1
+ ldr x1,[sp,#16]
+ str x4,[sp,#8]
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ ror x3,x14,#1
+ and x17,x24,x23
+ ror x2,x11,#19
+ bic x28,x25,x23
+ ror x4,x27,#28
+ add x26,x26,x12 // h+=X[i]
+ eor x16,x16,x23,ror#18
+ eor x3,x3,x14,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x23,ror#41 // Sigma1(e)
+ eor x4,x4,x27,ror#34
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x2,x2,x11,ror#61
+ eor x3,x3,x14,lsr#7 // sigma0(X[i+1])
+ add x26,x26,x16 // h+=Sigma1(e)
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x4,x27,ror#39 // Sigma0(a)
+ eor x2,x2,x11,lsr#6 // sigma1(X[i+14])
+ add x13,x13,x6
+ add x22,x22,x26 // d+=h
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x13,x13,x3
+ add x26,x26,x17 // h+=Sigma0(a)
+ add x13,x13,x2
+ ldr x2,[sp,#24]
+ str x5,[sp,#16]
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ ror x4,x15,#1
+ and x17,x23,x22
+ ror x3,x12,#19
+ bic x19,x24,x22
+ ror x5,x26,#28
+ add x25,x25,x13 // h+=X[i]
+ eor x16,x16,x22,ror#18
+ eor x4,x4,x15,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x22,ror#41 // Sigma1(e)
+ eor x5,x5,x26,ror#34
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x3,x3,x12,ror#61
+ eor x4,x4,x15,lsr#7 // sigma0(X[i+1])
+ add x25,x25,x16 // h+=Sigma1(e)
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x5,x26,ror#39 // Sigma0(a)
+ eor x3,x3,x12,lsr#6 // sigma1(X[i+14])
+ add x14,x14,x7
+ add x21,x21,x25 // d+=h
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x14,x14,x4
+ add x25,x25,x17 // h+=Sigma0(a)
+ add x14,x14,x3
+ ldr x3,[sp,#0]
+ str x6,[sp,#24]
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ ror x5,x0,#1
+ and x17,x22,x21
+ ror x4,x13,#19
+ bic x28,x23,x21
+ ror x6,x25,#28
+ add x24,x24,x14 // h+=X[i]
+ eor x16,x16,x21,ror#18
+ eor x5,x5,x0,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x21,ror#41 // Sigma1(e)
+ eor x6,x6,x25,ror#34
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x4,x4,x13,ror#61
+ eor x5,x5,x0,lsr#7 // sigma0(X[i+1])
+ add x24,x24,x16 // h+=Sigma1(e)
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x6,x25,ror#39 // Sigma0(a)
+ eor x4,x4,x13,lsr#6 // sigma1(X[i+14])
+ add x15,x15,x8
+ add x20,x20,x24 // d+=h
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x15,x15,x5
+ add x24,x24,x17 // h+=Sigma0(a)
+ add x15,x15,x4
+ ldr x4,[sp,#8]
+ str x7,[sp,#0]
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ ror x6,x1,#1
+ and x17,x21,x20
+ ror x5,x14,#19
+ bic x19,x22,x20
+ ror x7,x24,#28
+ add x23,x23,x15 // h+=X[i]
+ eor x16,x16,x20,ror#18
+ eor x6,x6,x1,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x20,ror#41 // Sigma1(e)
+ eor x7,x7,x24,ror#34
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x5,x5,x14,ror#61
+ eor x6,x6,x1,lsr#7 // sigma0(X[i+1])
+ add x23,x23,x16 // h+=Sigma1(e)
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x7,x24,ror#39 // Sigma0(a)
+ eor x5,x5,x14,lsr#6 // sigma1(X[i+14])
+ add x0,x0,x9
+ add x27,x27,x23 // d+=h
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x0,x0,x6
+ add x23,x23,x17 // h+=Sigma0(a)
+ add x0,x0,x5
+ ldr x5,[sp,#16]
+ str x8,[sp,#8]
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ ror x7,x2,#1
+ and x17,x20,x27
+ ror x6,x15,#19
+ bic x28,x21,x27
+ ror x8,x23,#28
+ add x22,x22,x0 // h+=X[i]
+ eor x16,x16,x27,ror#18
+ eor x7,x7,x2,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x27,ror#41 // Sigma1(e)
+ eor x8,x8,x23,ror#34
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x6,x6,x15,ror#61
+ eor x7,x7,x2,lsr#7 // sigma0(X[i+1])
+ add x22,x22,x16 // h+=Sigma1(e)
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x8,x23,ror#39 // Sigma0(a)
+ eor x6,x6,x15,lsr#6 // sigma1(X[i+14])
+ add x1,x1,x10
+ add x26,x26,x22 // d+=h
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x1,x1,x7
+ add x22,x22,x17 // h+=Sigma0(a)
+ add x1,x1,x6
+ ldr x6,[sp,#24]
+ str x9,[sp,#16]
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ ror x8,x3,#1
+ and x17,x27,x26
+ ror x7,x0,#19
+ bic x19,x20,x26
+ ror x9,x22,#28
+ add x21,x21,x1 // h+=X[i]
+ eor x16,x16,x26,ror#18
+ eor x8,x8,x3,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x26,ror#41 // Sigma1(e)
+ eor x9,x9,x22,ror#34
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x7,x7,x0,ror#61
+ eor x8,x8,x3,lsr#7 // sigma0(X[i+1])
+ add x21,x21,x16 // h+=Sigma1(e)
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x9,x22,ror#39 // Sigma0(a)
+ eor x7,x7,x0,lsr#6 // sigma1(X[i+14])
+ add x2,x2,x11
+ add x25,x25,x21 // d+=h
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x2,x2,x8
+ add x21,x21,x17 // h+=Sigma0(a)
+ add x2,x2,x7
+ ldr x7,[sp,#0]
+ str x10,[sp,#24]
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ ror x9,x4,#1
+ and x17,x26,x25
+ ror x8,x1,#19
+ bic x28,x27,x25
+ ror x10,x21,#28
+ add x20,x20,x2 // h+=X[i]
+ eor x16,x16,x25,ror#18
+ eor x9,x9,x4,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x25,ror#41 // Sigma1(e)
+ eor x10,x10,x21,ror#34
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x8,x8,x1,ror#61
+ eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
+ add x20,x20,x16 // h+=Sigma1(e)
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x10,x21,ror#39 // Sigma0(a)
+ eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
+ add x3,x3,x12
+ add x24,x24,x20 // d+=h
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x3,x3,x9
+ add x20,x20,x17 // h+=Sigma0(a)
+ add x3,x3,x8
+ cbnz x19,Loop_16_xx
+
+ ldp x0,x2,[x29,#96]
+ ldr x1,[x29,#112]
+ sub x30,x30,#648 // rewind
+
+ ldp x3,x4,[x0]
+ ldp x5,x6,[x0,#2*8]
+ add x1,x1,#14*8 // advance input pointer
+ ldp x7,x8,[x0,#4*8]
+ add x20,x20,x3
+ ldp x9,x10,[x0,#6*8]
+ add x21,x21,x4
+ add x22,x22,x5
+ add x23,x23,x6
+ stp x20,x21,[x0]
+ add x24,x24,x7
+ add x25,x25,x8
+ stp x22,x23,[x0,#2*8]
+ add x26,x26,x9
+ add x27,x27,x10
+ cmp x1,x2
+ stp x24,x25,[x0,#4*8]
+ stp x26,x27,[x0,#6*8]
+ b.ne Loop
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#4*8
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.section __TEXT,__const
+.align 6
+
+LK512:
+.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad 0x3956c25bf348b538,0x59f111f1b605d019
+.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad 0xd807aa98a3030242,0x12835b0145706fbe
+.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad 0x06ca6351e003826f,0x142929670a0e6e70
+.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad 0x81c2c92e47edaee6,0x92722c851482353b
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad 0xd192e819d6ef5218,0xd69906245565a910
+.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad 0x90befffa23631e28,0xa4506cebde82bde9
+.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad 0xca273eceea26619c,0xd186b8c721c0c207
+.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad 0x113f9804bef90dae,0x1b710b35131c471b
+.quad 0x28db77f523047d84,0x32caab7b40c72493
+.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad 0 // terminator
+
+.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+.text
+#ifndef __KERNEL__
+.globl _sha512_block_data_order_hw
+.private_extern _sha512_block_data_order_hw
+
+.align 6
+_sha512_block_data_order_hw:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+ ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context
+ adrp x3,LK512@PAGE
+ add x3,x3,LK512@PAGEOFF
+
+ rev64 v16.16b,v16.16b
+ rev64 v17.16b,v17.16b
+ rev64 v18.16b,v18.16b
+ rev64 v19.16b,v19.16b
+ rev64 v20.16b,v20.16b
+ rev64 v21.16b,v21.16b
+ rev64 v22.16b,v22.16b
+ rev64 v23.16b,v23.16b
+ b Loop_hw
+
+.align 4
+Loop_hw:
+ ld1 {v24.2d},[x3],#16
+ subs x2,x2,#1
+ sub x4,x1,#128
+ orr v26.16b,v0.16b,v0.16b // offload
+ orr v27.16b,v1.16b,v1.16b
+ orr v28.16b,v2.16b,v2.16b
+ orr v29.16b,v3.16b,v3.16b
+ csel x1,x1,x4,ne // conditional rewind
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v16.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+ rev64 v16.16b,v16.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ ld1 {v24.2d},[x3],#16
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v17.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+ rev64 v17.16b,v17.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v18.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+ rev64 v18.16b,v18.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ ld1 {v24.2d},[x3],#16
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v19.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+ rev64 v19.16b,v19.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v20.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+ rev64 v20.16b,v20.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ ld1 {v24.2d},[x3],#16
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v21.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+ rev64 v21.16b,v21.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v22.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+ rev64 v22.16b,v22.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ sub x3,x3,#80*8 // rewind
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v23.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+ rev64 v23.16b,v23.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v0.2d,v0.2d,v26.2d // accumulate
+ add v1.2d,v1.2d,v27.2d
+ add v2.2d,v2.2d,v28.2d
+ add v3.2d,v3.2d,v29.2d
+
+ cbnz x2,Loop_hw
+
+ st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context
+
+ ldr x29,[sp],#16
+ ret
+
+#endif
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/bcm/sha512-armv8-linux.S b/gen/bcm/sha512-armv8-linux.S
new file mode 100644
index 0000000..fd15987
--- /dev/null
+++ b/gen/bcm/sha512-armv8-linux.S
@@ -0,0 +1,1596 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+// SHA256-hw SHA256(*) SHA512
+// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
+// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
+// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
+// Denver 2.01 10.5 (+26%) 6.70 (+8%)
+// X-Gene 20.0 (+100%) 12.8 (+300%(***))
+// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
+// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
+//
+// (*) Software SHA256 results are of lesser relevance, presented
+// mostly for informational purposes.
+// (**) The result is a trade-off: it's possible to improve it by
+// 10% (or by 1 cycle per round), but at the cost of 20% loss
+// on Cortex-A53 (or by 4 cycles per round).
+// (***) Super-impressive coefficients over gcc-generated code are
+// indication of some compiler "pathology", most notably code
+// generated with -mgeneral-regs-only is significantly faster
+// and the gap is only 40-90%.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+#endif
+
+.text
+
+.globl sha512_block_data_order_nohw
+.hidden sha512_block_data_order_nohw
+.type sha512_block_data_order_nohw,%function
+.align 6
+sha512_block_data_order_nohw:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#4*8
+
+ ldp x20,x21,[x0] // load context
+ ldp x22,x23,[x0,#2*8]
+ ldp x24,x25,[x0,#4*8]
+ add x2,x1,x2,lsl#7 // end of input
+ ldp x26,x27,[x0,#6*8]
+ adrp x30,.LK512
+ add x30,x30,:lo12:.LK512
+ stp x0,x2,[x29,#96]
+
+.Loop:
+ ldp x3,x4,[x1],#2*8
+ ldr x19,[x30],#8 // *K++
+ eor x28,x21,x22 // magic seed
+ str x1,[x29,#112]
+#ifndef __AARCH64EB__
+ rev x3,x3 // 0
+#endif
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ eor x6,x24,x24,ror#23
+ and x17,x25,x24
+ bic x19,x26,x24
+ add x27,x27,x3 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x6,ror#18 // Sigma1(e)
+ ror x6,x20,#28
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ eor x17,x20,x20,ror#5
+ add x27,x27,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x23,x23,x27 // d+=h
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x6,x17,ror#34 // Sigma0(a)
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x27,x27,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x4,x4 // 1
+#endif
+ ldp x5,x6,[x1],#2*8
+ add x27,x27,x17 // h+=Sigma0(a)
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ eor x7,x23,x23,ror#23
+ and x17,x24,x23
+ bic x28,x25,x23
+ add x26,x26,x4 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x7,ror#18 // Sigma1(e)
+ ror x7,x27,#28
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ eor x17,x27,x27,ror#5
+ add x26,x26,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x22,x22,x26 // d+=h
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x7,x17,ror#34 // Sigma0(a)
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x26,x26,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x5,x5 // 2
+#endif
+ add x26,x26,x17 // h+=Sigma0(a)
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ eor x8,x22,x22,ror#23
+ and x17,x23,x22
+ bic x19,x24,x22
+ add x25,x25,x5 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x8,ror#18 // Sigma1(e)
+ ror x8,x26,#28
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ eor x17,x26,x26,ror#5
+ add x25,x25,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x21,x21,x25 // d+=h
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x8,x17,ror#34 // Sigma0(a)
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x25,x25,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x6,x6 // 3
+#endif
+ ldp x7,x8,[x1],#2*8
+ add x25,x25,x17 // h+=Sigma0(a)
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ eor x9,x21,x21,ror#23
+ and x17,x22,x21
+ bic x28,x23,x21
+ add x24,x24,x6 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x9,ror#18 // Sigma1(e)
+ ror x9,x25,#28
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ eor x17,x25,x25,ror#5
+ add x24,x24,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x20,x20,x24 // d+=h
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x9,x17,ror#34 // Sigma0(a)
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x24,x24,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x7,x7 // 4
+#endif
+ add x24,x24,x17 // h+=Sigma0(a)
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ eor x10,x20,x20,ror#23
+ and x17,x21,x20
+ bic x19,x22,x20
+ add x23,x23,x7 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x10,ror#18 // Sigma1(e)
+ ror x10,x24,#28
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ eor x17,x24,x24,ror#5
+ add x23,x23,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x27,x27,x23 // d+=h
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x10,x17,ror#34 // Sigma0(a)
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x23,x23,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x8,x8 // 5
+#endif
+ ldp x9,x10,[x1],#2*8
+ add x23,x23,x17 // h+=Sigma0(a)
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ eor x11,x27,x27,ror#23
+ and x17,x20,x27
+ bic x28,x21,x27
+ add x22,x22,x8 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x11,ror#18 // Sigma1(e)
+ ror x11,x23,#28
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ eor x17,x23,x23,ror#5
+ add x22,x22,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x26,x26,x22 // d+=h
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x11,x17,ror#34 // Sigma0(a)
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x22,x22,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x9,x9 // 6
+#endif
+ add x22,x22,x17 // h+=Sigma0(a)
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ eor x12,x26,x26,ror#23
+ and x17,x27,x26
+ bic x19,x20,x26
+ add x21,x21,x9 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x12,ror#18 // Sigma1(e)
+ ror x12,x22,#28
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ eor x17,x22,x22,ror#5
+ add x21,x21,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x25,x25,x21 // d+=h
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x12,x17,ror#34 // Sigma0(a)
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x21,x21,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x10,x10 // 7
+#endif
+ ldp x11,x12,[x1],#2*8
+ add x21,x21,x17 // h+=Sigma0(a)
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ eor x13,x25,x25,ror#23
+ and x17,x26,x25
+ bic x28,x27,x25
+ add x20,x20,x10 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x13,ror#18 // Sigma1(e)
+ ror x13,x21,#28
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ eor x17,x21,x21,ror#5
+ add x20,x20,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x24,x24,x20 // d+=h
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x13,x17,ror#34 // Sigma0(a)
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x20,x20,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x11,x11 // 8
+#endif
+ add x20,x20,x17 // h+=Sigma0(a)
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ eor x14,x24,x24,ror#23
+ and x17,x25,x24
+ bic x19,x26,x24
+ add x27,x27,x11 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x14,ror#18 // Sigma1(e)
+ ror x14,x20,#28
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ eor x17,x20,x20,ror#5
+ add x27,x27,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x23,x23,x27 // d+=h
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x14,x17,ror#34 // Sigma0(a)
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x27,x27,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x12,x12 // 9
+#endif
+ ldp x13,x14,[x1],#2*8
+ add x27,x27,x17 // h+=Sigma0(a)
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ eor x15,x23,x23,ror#23
+ and x17,x24,x23
+ bic x28,x25,x23
+ add x26,x26,x12 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x15,ror#18 // Sigma1(e)
+ ror x15,x27,#28
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ eor x17,x27,x27,ror#5
+ add x26,x26,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x22,x22,x26 // d+=h
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x15,x17,ror#34 // Sigma0(a)
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x26,x26,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x13,x13 // 10
+#endif
+ add x26,x26,x17 // h+=Sigma0(a)
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ eor x0,x22,x22,ror#23
+ and x17,x23,x22
+ bic x19,x24,x22
+ add x25,x25,x13 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x0,ror#18 // Sigma1(e)
+ ror x0,x26,#28
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ eor x17,x26,x26,ror#5
+ add x25,x25,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x21,x21,x25 // d+=h
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x0,x17,ror#34 // Sigma0(a)
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x25,x25,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x14,x14 // 11
+#endif
+ ldp x15,x0,[x1],#2*8
+ add x25,x25,x17 // h+=Sigma0(a)
+ str x6,[sp,#24]
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ eor x6,x21,x21,ror#23
+ and x17,x22,x21
+ bic x28,x23,x21
+ add x24,x24,x14 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x6,ror#18 // Sigma1(e)
+ ror x6,x25,#28
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ eor x17,x25,x25,ror#5
+ add x24,x24,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x20,x20,x24 // d+=h
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x6,x17,ror#34 // Sigma0(a)
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x24,x24,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x15,x15 // 12
+#endif
+ add x24,x24,x17 // h+=Sigma0(a)
+ str x7,[sp,#0]
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ eor x7,x20,x20,ror#23
+ and x17,x21,x20
+ bic x19,x22,x20
+ add x23,x23,x15 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x7,ror#18 // Sigma1(e)
+ ror x7,x24,#28
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ eor x17,x24,x24,ror#5
+ add x23,x23,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x27,x27,x23 // d+=h
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x7,x17,ror#34 // Sigma0(a)
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x23,x23,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x0,x0 // 13
+#endif
+ ldp x1,x2,[x1]
+ add x23,x23,x17 // h+=Sigma0(a)
+ str x8,[sp,#8]
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ eor x8,x27,x27,ror#23
+ and x17,x20,x27
+ bic x28,x21,x27
+ add x22,x22,x0 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x8,ror#18 // Sigma1(e)
+ ror x8,x23,#28
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ eor x17,x23,x23,ror#5
+ add x22,x22,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x26,x26,x22 // d+=h
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x8,x17,ror#34 // Sigma0(a)
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x22,x22,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x1,x1 // 14
+#endif
+ ldr x6,[sp,#24]
+ add x22,x22,x17 // h+=Sigma0(a)
+ str x9,[sp,#16]
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ eor x9,x26,x26,ror#23
+ and x17,x27,x26
+ bic x19,x20,x26
+ add x21,x21,x1 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x9,ror#18 // Sigma1(e)
+ ror x9,x22,#28
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ eor x17,x22,x22,ror#5
+ add x21,x21,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x25,x25,x21 // d+=h
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x9,x17,ror#34 // Sigma0(a)
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x21,x21,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x2,x2 // 15
+#endif
+ ldr x7,[sp,#0]
+ add x21,x21,x17 // h+=Sigma0(a)
+ str x10,[sp,#24]
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ ror x9,x4,#1
+ and x17,x26,x25
+ ror x8,x1,#19
+ bic x28,x27,x25
+ ror x10,x21,#28
+ add x20,x20,x2 // h+=X[i]
+ eor x16,x16,x25,ror#18
+ eor x9,x9,x4,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x25,ror#41 // Sigma1(e)
+ eor x10,x10,x21,ror#34
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x8,x8,x1,ror#61
+ eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
+ add x20,x20,x16 // h+=Sigma1(e)
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x10,x21,ror#39 // Sigma0(a)
+ eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
+ add x3,x3,x12
+ add x24,x24,x20 // d+=h
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x3,x3,x9
+ add x20,x20,x17 // h+=Sigma0(a)
+ add x3,x3,x8
+.Loop_16_xx:
+ ldr x8,[sp,#8]
+ str x11,[sp,#0]
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ ror x10,x5,#1
+ and x17,x25,x24
+ ror x9,x2,#19
+ bic x19,x26,x24
+ ror x11,x20,#28
+ add x27,x27,x3 // h+=X[i]
+ eor x16,x16,x24,ror#18
+ eor x10,x10,x5,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x24,ror#41 // Sigma1(e)
+ eor x11,x11,x20,ror#34
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x9,x9,x2,ror#61
+ eor x10,x10,x5,lsr#7 // sigma0(X[i+1])
+ add x27,x27,x16 // h+=Sigma1(e)
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x11,x20,ror#39 // Sigma0(a)
+ eor x9,x9,x2,lsr#6 // sigma1(X[i+14])
+ add x4,x4,x13
+ add x23,x23,x27 // d+=h
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x4,x4,x10
+ add x27,x27,x17 // h+=Sigma0(a)
+ add x4,x4,x9
+ ldr x9,[sp,#16]
+ str x12,[sp,#8]
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ ror x11,x6,#1
+ and x17,x24,x23
+ ror x10,x3,#19
+ bic x28,x25,x23
+ ror x12,x27,#28
+ add x26,x26,x4 // h+=X[i]
+ eor x16,x16,x23,ror#18
+ eor x11,x11,x6,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x23,ror#41 // Sigma1(e)
+ eor x12,x12,x27,ror#34
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x10,x10,x3,ror#61
+ eor x11,x11,x6,lsr#7 // sigma0(X[i+1])
+ add x26,x26,x16 // h+=Sigma1(e)
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x12,x27,ror#39 // Sigma0(a)
+ eor x10,x10,x3,lsr#6 // sigma1(X[i+14])
+ add x5,x5,x14
+ add x22,x22,x26 // d+=h
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x5,x5,x11
+ add x26,x26,x17 // h+=Sigma0(a)
+ add x5,x5,x10
+ ldr x10,[sp,#24]
+ str x13,[sp,#16]
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ ror x12,x7,#1
+ and x17,x23,x22
+ ror x11,x4,#19
+ bic x19,x24,x22
+ ror x13,x26,#28
+ add x25,x25,x5 // h+=X[i]
+ eor x16,x16,x22,ror#18
+ eor x12,x12,x7,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x22,ror#41 // Sigma1(e)
+ eor x13,x13,x26,ror#34
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x11,x11,x4,ror#61
+ eor x12,x12,x7,lsr#7 // sigma0(X[i+1])
+ add x25,x25,x16 // h+=Sigma1(e)
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x13,x26,ror#39 // Sigma0(a)
+ eor x11,x11,x4,lsr#6 // sigma1(X[i+14])
+ add x6,x6,x15
+ add x21,x21,x25 // d+=h
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x6,x6,x12
+ add x25,x25,x17 // h+=Sigma0(a)
+ add x6,x6,x11
+ ldr x11,[sp,#0]
+ str x14,[sp,#24]
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ ror x13,x8,#1
+ and x17,x22,x21
+ ror x12,x5,#19
+ bic x28,x23,x21
+ ror x14,x25,#28
+ add x24,x24,x6 // h+=X[i]
+ eor x16,x16,x21,ror#18
+ eor x13,x13,x8,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x21,ror#41 // Sigma1(e)
+ eor x14,x14,x25,ror#34
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x12,x12,x5,ror#61
+ eor x13,x13,x8,lsr#7 // sigma0(X[i+1])
+ add x24,x24,x16 // h+=Sigma1(e)
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x14,x25,ror#39 // Sigma0(a)
+ eor x12,x12,x5,lsr#6 // sigma1(X[i+14])
+ add x7,x7,x0
+ add x20,x20,x24 // d+=h
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x7,x7,x13
+ add x24,x24,x17 // h+=Sigma0(a)
+ add x7,x7,x12
+ ldr x12,[sp,#8]
+ str x15,[sp,#0]
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ ror x14,x9,#1
+ and x17,x21,x20
+ ror x13,x6,#19
+ bic x19,x22,x20
+ ror x15,x24,#28
+ add x23,x23,x7 // h+=X[i]
+ eor x16,x16,x20,ror#18
+ eor x14,x14,x9,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x20,ror#41 // Sigma1(e)
+ eor x15,x15,x24,ror#34
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x13,x13,x6,ror#61
+ eor x14,x14,x9,lsr#7 // sigma0(X[i+1])
+ add x23,x23,x16 // h+=Sigma1(e)
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x15,x24,ror#39 // Sigma0(a)
+ eor x13,x13,x6,lsr#6 // sigma1(X[i+14])
+ add x8,x8,x1
+ add x27,x27,x23 // d+=h
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x8,x8,x14
+ add x23,x23,x17 // h+=Sigma0(a)
+ add x8,x8,x13
+ ldr x13,[sp,#16]
+ str x0,[sp,#8]
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ ror x15,x10,#1
+ and x17,x20,x27
+ ror x14,x7,#19
+ bic x28,x21,x27
+ ror x0,x23,#28
+ add x22,x22,x8 // h+=X[i]
+ eor x16,x16,x27,ror#18
+ eor x15,x15,x10,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x27,ror#41 // Sigma1(e)
+ eor x0,x0,x23,ror#34
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x14,x14,x7,ror#61
+ eor x15,x15,x10,lsr#7 // sigma0(X[i+1])
+ add x22,x22,x16 // h+=Sigma1(e)
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x0,x23,ror#39 // Sigma0(a)
+ eor x14,x14,x7,lsr#6 // sigma1(X[i+14])
+ add x9,x9,x2
+ add x26,x26,x22 // d+=h
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x9,x9,x15
+ add x22,x22,x17 // h+=Sigma0(a)
+ add x9,x9,x14
+ ldr x14,[sp,#24]
+ str x1,[sp,#16]
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ ror x0,x11,#1
+ and x17,x27,x26
+ ror x15,x8,#19
+ bic x19,x20,x26
+ ror x1,x22,#28
+ add x21,x21,x9 // h+=X[i]
+ eor x16,x16,x26,ror#18
+ eor x0,x0,x11,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x26,ror#41 // Sigma1(e)
+ eor x1,x1,x22,ror#34
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x15,x15,x8,ror#61
+ eor x0,x0,x11,lsr#7 // sigma0(X[i+1])
+ add x21,x21,x16 // h+=Sigma1(e)
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x1,x22,ror#39 // Sigma0(a)
+ eor x15,x15,x8,lsr#6 // sigma1(X[i+14])
+ add x10,x10,x3
+ add x25,x25,x21 // d+=h
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x10,x10,x0
+ add x21,x21,x17 // h+=Sigma0(a)
+ add x10,x10,x15
+ ldr x15,[sp,#0]
+ str x2,[sp,#24]
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ ror x1,x12,#1
+ and x17,x26,x25
+ ror x0,x9,#19
+ bic x28,x27,x25
+ ror x2,x21,#28
+ add x20,x20,x10 // h+=X[i]
+ eor x16,x16,x25,ror#18
+ eor x1,x1,x12,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x25,ror#41 // Sigma1(e)
+ eor x2,x2,x21,ror#34
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x0,x0,x9,ror#61
+ eor x1,x1,x12,lsr#7 // sigma0(X[i+1])
+ add x20,x20,x16 // h+=Sigma1(e)
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x2,x21,ror#39 // Sigma0(a)
+ eor x0,x0,x9,lsr#6 // sigma1(X[i+14])
+ add x11,x11,x4
+ add x24,x24,x20 // d+=h
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x11,x11,x1
+ add x20,x20,x17 // h+=Sigma0(a)
+ add x11,x11,x0
+ ldr x0,[sp,#8]
+ str x3,[sp,#0]
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ ror x2,x13,#1
+ and x17,x25,x24
+ ror x1,x10,#19
+ bic x19,x26,x24
+ ror x3,x20,#28
+ add x27,x27,x11 // h+=X[i]
+ eor x16,x16,x24,ror#18
+ eor x2,x2,x13,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x24,ror#41 // Sigma1(e)
+ eor x3,x3,x20,ror#34
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x1,x1,x10,ror#61
+ eor x2,x2,x13,lsr#7 // sigma0(X[i+1])
+ add x27,x27,x16 // h+=Sigma1(e)
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x3,x20,ror#39 // Sigma0(a)
+ eor x1,x1,x10,lsr#6 // sigma1(X[i+14])
+ add x12,x12,x5
+ add x23,x23,x27 // d+=h
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x12,x12,x2
+ add x27,x27,x17 // h+=Sigma0(a)
+ add x12,x12,x1
+ ldr x1,[sp,#16]
+ str x4,[sp,#8]
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ ror x3,x14,#1
+ and x17,x24,x23
+ ror x2,x11,#19
+ bic x28,x25,x23
+ ror x4,x27,#28
+ add x26,x26,x12 // h+=X[i]
+ eor x16,x16,x23,ror#18
+ eor x3,x3,x14,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x23,ror#41 // Sigma1(e)
+ eor x4,x4,x27,ror#34
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x2,x2,x11,ror#61
+ eor x3,x3,x14,lsr#7 // sigma0(X[i+1])
+ add x26,x26,x16 // h+=Sigma1(e)
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x4,x27,ror#39 // Sigma0(a)
+ eor x2,x2,x11,lsr#6 // sigma1(X[i+14])
+ add x13,x13,x6
+ add x22,x22,x26 // d+=h
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x13,x13,x3
+ add x26,x26,x17 // h+=Sigma0(a)
+ add x13,x13,x2
+ ldr x2,[sp,#24]
+ str x5,[sp,#16]
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ ror x4,x15,#1
+ and x17,x23,x22
+ ror x3,x12,#19
+ bic x19,x24,x22
+ ror x5,x26,#28
+ add x25,x25,x13 // h+=X[i]
+ eor x16,x16,x22,ror#18
+ eor x4,x4,x15,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x22,ror#41 // Sigma1(e)
+ eor x5,x5,x26,ror#34
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x3,x3,x12,ror#61
+ eor x4,x4,x15,lsr#7 // sigma0(X[i+1])
+ add x25,x25,x16 // h+=Sigma1(e)
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x5,x26,ror#39 // Sigma0(a)
+ eor x3,x3,x12,lsr#6 // sigma1(X[i+14])
+ add x14,x14,x7
+ add x21,x21,x25 // d+=h
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x14,x14,x4
+ add x25,x25,x17 // h+=Sigma0(a)
+ add x14,x14,x3
+ ldr x3,[sp,#0]
+ str x6,[sp,#24]
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ ror x5,x0,#1
+ and x17,x22,x21
+ ror x4,x13,#19
+ bic x28,x23,x21
+ ror x6,x25,#28
+ add x24,x24,x14 // h+=X[i]
+ eor x16,x16,x21,ror#18
+ eor x5,x5,x0,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x21,ror#41 // Sigma1(e)
+ eor x6,x6,x25,ror#34
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x4,x4,x13,ror#61
+ eor x5,x5,x0,lsr#7 // sigma0(X[i+1])
+ add x24,x24,x16 // h+=Sigma1(e)
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x6,x25,ror#39 // Sigma0(a)
+ eor x4,x4,x13,lsr#6 // sigma1(X[i+14])
+ add x15,x15,x8
+ add x20,x20,x24 // d+=h
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x15,x15,x5
+ add x24,x24,x17 // h+=Sigma0(a)
+ add x15,x15,x4
+ ldr x4,[sp,#8]
+ str x7,[sp,#0]
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ ror x6,x1,#1
+ and x17,x21,x20
+ ror x5,x14,#19
+ bic x19,x22,x20
+ ror x7,x24,#28
+ add x23,x23,x15 // h+=X[i]
+ eor x16,x16,x20,ror#18
+ eor x6,x6,x1,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x20,ror#41 // Sigma1(e)
+ eor x7,x7,x24,ror#34
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x5,x5,x14,ror#61
+ eor x6,x6,x1,lsr#7 // sigma0(X[i+1])
+ add x23,x23,x16 // h+=Sigma1(e)
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x7,x24,ror#39 // Sigma0(a)
+ eor x5,x5,x14,lsr#6 // sigma1(X[i+14])
+ add x0,x0,x9
+ add x27,x27,x23 // d+=h
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x0,x0,x6
+ add x23,x23,x17 // h+=Sigma0(a)
+ add x0,x0,x5
+ ldr x5,[sp,#16]
+ str x8,[sp,#8]
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ ror x7,x2,#1
+ and x17,x20,x27
+ ror x6,x15,#19
+ bic x28,x21,x27
+ ror x8,x23,#28
+ add x22,x22,x0 // h+=X[i]
+ eor x16,x16,x27,ror#18
+ eor x7,x7,x2,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x27,ror#41 // Sigma1(e)
+ eor x8,x8,x23,ror#34
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x6,x6,x15,ror#61
+ eor x7,x7,x2,lsr#7 // sigma0(X[i+1])
+ add x22,x22,x16 // h+=Sigma1(e)
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x8,x23,ror#39 // Sigma0(a)
+ eor x6,x6,x15,lsr#6 // sigma1(X[i+14])
+ add x1,x1,x10
+ add x26,x26,x22 // d+=h
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x1,x1,x7
+ add x22,x22,x17 // h+=Sigma0(a)
+ add x1,x1,x6
+ ldr x6,[sp,#24]
+ str x9,[sp,#16]
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ ror x8,x3,#1
+ and x17,x27,x26
+ ror x7,x0,#19
+ bic x19,x20,x26
+ ror x9,x22,#28
+ add x21,x21,x1 // h+=X[i]
+ eor x16,x16,x26,ror#18
+ eor x8,x8,x3,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x26,ror#41 // Sigma1(e)
+ eor x9,x9,x22,ror#34
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x7,x7,x0,ror#61
+ eor x8,x8,x3,lsr#7 // sigma0(X[i+1])
+ add x21,x21,x16 // h+=Sigma1(e)
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x9,x22,ror#39 // Sigma0(a)
+ eor x7,x7,x0,lsr#6 // sigma1(X[i+14])
+ add x2,x2,x11
+ add x25,x25,x21 // d+=h
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x2,x2,x8
+ add x21,x21,x17 // h+=Sigma0(a)
+ add x2,x2,x7
+ ldr x7,[sp,#0]
+ str x10,[sp,#24]
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ ror x9,x4,#1
+ and x17,x26,x25
+ ror x8,x1,#19
+ bic x28,x27,x25
+ ror x10,x21,#28
+ add x20,x20,x2 // h+=X[i]
+ eor x16,x16,x25,ror#18
+ eor x9,x9,x4,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x25,ror#41 // Sigma1(e)
+ eor x10,x10,x21,ror#34
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x8,x8,x1,ror#61
+ eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
+ add x20,x20,x16 // h+=Sigma1(e)
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x10,x21,ror#39 // Sigma0(a)
+ eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
+ add x3,x3,x12
+ add x24,x24,x20 // d+=h
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x3,x3,x9
+ add x20,x20,x17 // h+=Sigma0(a)
+ add x3,x3,x8
+ cbnz x19,.Loop_16_xx
+
+ ldp x0,x2,[x29,#96]
+ ldr x1,[x29,#112]
+ sub x30,x30,#648 // rewind
+
+ ldp x3,x4,[x0]
+ ldp x5,x6,[x0,#2*8]
+ add x1,x1,#14*8 // advance input pointer
+ ldp x7,x8,[x0,#4*8]
+ add x20,x20,x3
+ ldp x9,x10,[x0,#6*8]
+ add x21,x21,x4
+ add x22,x22,x5
+ add x23,x23,x6
+ stp x20,x21,[x0]
+ add x24,x24,x7
+ add x25,x25,x8
+ stp x22,x23,[x0,#2*8]
+ add x26,x26,x9
+ add x27,x27,x10
+ cmp x1,x2
+ stp x24,x25,[x0,#4*8]
+ stp x26,x27,[x0,#6*8]
+ b.ne .Loop
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#4*8
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw
+
+.section .rodata
+.align 6
+.type .LK512,%object
+.LK512:
+.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad 0x3956c25bf348b538,0x59f111f1b605d019
+.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad 0xd807aa98a3030242,0x12835b0145706fbe
+.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad 0x06ca6351e003826f,0x142929670a0e6e70
+.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad 0x81c2c92e47edaee6,0x92722c851482353b
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad 0xd192e819d6ef5218,0xd69906245565a910
+.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad 0x90befffa23631e28,0xa4506cebde82bde9
+.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad 0xca273eceea26619c,0xd186b8c721c0c207
+.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad 0x113f9804bef90dae,0x1b710b35131c471b
+.quad 0x28db77f523047d84,0x32caab7b40c72493
+.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad 0 // terminator
+.size .LK512,.-.LK512
+.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+.text
+#ifndef __KERNEL__
+.globl sha512_block_data_order_hw
+.hidden sha512_block_data_order_hw
+.type sha512_block_data_order_hw,%function
+.align 6
+sha512_block_data_order_hw:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+ ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context
+ adrp x3,.LK512
+ add x3,x3,:lo12:.LK512
+
+ rev64 v16.16b,v16.16b
+ rev64 v17.16b,v17.16b
+ rev64 v18.16b,v18.16b
+ rev64 v19.16b,v19.16b
+ rev64 v20.16b,v20.16b
+ rev64 v21.16b,v21.16b
+ rev64 v22.16b,v22.16b
+ rev64 v23.16b,v23.16b
+ b .Loop_hw
+
+.align 4
+.Loop_hw:
+ ld1 {v24.2d},[x3],#16
+ subs x2,x2,#1
+ sub x4,x1,#128
+ orr v26.16b,v0.16b,v0.16b // offload
+ orr v27.16b,v1.16b,v1.16b
+ orr v28.16b,v2.16b,v2.16b
+ orr v29.16b,v3.16b,v3.16b
+ csel x1,x1,x4,ne // conditional rewind
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v16.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+ rev64 v16.16b,v16.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ ld1 {v24.2d},[x3],#16
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v17.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+ rev64 v17.16b,v17.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v18.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+ rev64 v18.16b,v18.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ ld1 {v24.2d},[x3],#16
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v19.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+ rev64 v19.16b,v19.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v20.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+ rev64 v20.16b,v20.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ ld1 {v24.2d},[x3],#16
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v21.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+ rev64 v21.16b,v21.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v22.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+ rev64 v22.16b,v22.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ sub x3,x3,#80*8 // rewind
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v23.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+ rev64 v23.16b,v23.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v0.2d,v0.2d,v26.2d // accumulate
+ add v1.2d,v1.2d,v27.2d
+ add v2.2d,v2.2d,v28.2d
+ add v3.2d,v3.2d,v29.2d
+
+ cbnz x2,.Loop_hw
+
+ st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context
+
+ ldr x29,[sp],#16
+ ret
+.size sha512_block_data_order_hw,.-sha512_block_data_order_hw
+#endif
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/sha512-armv8-win.S b/gen/bcm/sha512-armv8-win.S
new file mode 100644
index 0000000..220f489
--- /dev/null
+++ b/gen/bcm/sha512-armv8-win.S
@@ -0,0 +1,1600 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+// SHA256-hw SHA256(*) SHA512
+// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
+// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
+// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
+// Denver 2.01 10.5 (+26%) 6.70 (+8%)
+// X-Gene 20.0 (+100%) 12.8 (+300%(***))
+// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
+// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
+//
+// (*) Software SHA256 results are of lesser relevance, presented
+// mostly for informational purposes.
+// (**) The result is a trade-off: it's possible to improve it by
+// 10% (or by 1 cycle per round), but at the cost of 20% loss
+// on Cortex-A53 (or by 4 cycles per round).
+// (***) Super-impressive coefficients over gcc-generated code are
+// indication of some compiler "pathology", most notably code
+// generated with -mgeneral-regs-only is significantly faster
+// and the gap is only 40-90%.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+#endif
+
+.text
+
+.globl sha512_block_data_order_nohw
+
+.def sha512_block_data_order_nohw
+ .type 32
+.endef
+.align 6
+sha512_block_data_order_nohw:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#4*8
+
+ ldp x20,x21,[x0] // load context
+ ldp x22,x23,[x0,#2*8]
+ ldp x24,x25,[x0,#4*8]
+ add x2,x1,x2,lsl#7 // end of input
+ ldp x26,x27,[x0,#6*8]
+ adrp x30,LK512
+ add x30,x30,:lo12:LK512
+ stp x0,x2,[x29,#96]
+
+Loop:
+ ldp x3,x4,[x1],#2*8
+ ldr x19,[x30],#8 // *K++
+ eor x28,x21,x22 // magic seed
+ str x1,[x29,#112]
+#ifndef __AARCH64EB__
+ rev x3,x3 // 0
+#endif
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ eor x6,x24,x24,ror#23
+ and x17,x25,x24
+ bic x19,x26,x24
+ add x27,x27,x3 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x6,ror#18 // Sigma1(e)
+ ror x6,x20,#28
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ eor x17,x20,x20,ror#5
+ add x27,x27,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x23,x23,x27 // d+=h
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x6,x17,ror#34 // Sigma0(a)
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x27,x27,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x4,x4 // 1
+#endif
+ ldp x5,x6,[x1],#2*8
+ add x27,x27,x17 // h+=Sigma0(a)
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ eor x7,x23,x23,ror#23
+ and x17,x24,x23
+ bic x28,x25,x23
+ add x26,x26,x4 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x7,ror#18 // Sigma1(e)
+ ror x7,x27,#28
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ eor x17,x27,x27,ror#5
+ add x26,x26,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x22,x22,x26 // d+=h
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x7,x17,ror#34 // Sigma0(a)
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x26,x26,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x5,x5 // 2
+#endif
+ add x26,x26,x17 // h+=Sigma0(a)
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ eor x8,x22,x22,ror#23
+ and x17,x23,x22
+ bic x19,x24,x22
+ add x25,x25,x5 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x8,ror#18 // Sigma1(e)
+ ror x8,x26,#28
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ eor x17,x26,x26,ror#5
+ add x25,x25,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x21,x21,x25 // d+=h
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x8,x17,ror#34 // Sigma0(a)
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x25,x25,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x6,x6 // 3
+#endif
+ ldp x7,x8,[x1],#2*8
+ add x25,x25,x17 // h+=Sigma0(a)
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ eor x9,x21,x21,ror#23
+ and x17,x22,x21
+ bic x28,x23,x21
+ add x24,x24,x6 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x9,ror#18 // Sigma1(e)
+ ror x9,x25,#28
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ eor x17,x25,x25,ror#5
+ add x24,x24,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x20,x20,x24 // d+=h
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x9,x17,ror#34 // Sigma0(a)
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x24,x24,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x7,x7 // 4
+#endif
+ add x24,x24,x17 // h+=Sigma0(a)
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ eor x10,x20,x20,ror#23
+ and x17,x21,x20
+ bic x19,x22,x20
+ add x23,x23,x7 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x10,ror#18 // Sigma1(e)
+ ror x10,x24,#28
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ eor x17,x24,x24,ror#5
+ add x23,x23,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x27,x27,x23 // d+=h
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x10,x17,ror#34 // Sigma0(a)
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x23,x23,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x8,x8 // 5
+#endif
+ ldp x9,x10,[x1],#2*8
+ add x23,x23,x17 // h+=Sigma0(a)
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ eor x11,x27,x27,ror#23
+ and x17,x20,x27
+ bic x28,x21,x27
+ add x22,x22,x8 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x11,ror#18 // Sigma1(e)
+ ror x11,x23,#28
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ eor x17,x23,x23,ror#5
+ add x22,x22,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x26,x26,x22 // d+=h
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x11,x17,ror#34 // Sigma0(a)
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x22,x22,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x9,x9 // 6
+#endif
+ add x22,x22,x17 // h+=Sigma0(a)
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ eor x12,x26,x26,ror#23
+ and x17,x27,x26
+ bic x19,x20,x26
+ add x21,x21,x9 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x12,ror#18 // Sigma1(e)
+ ror x12,x22,#28
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ eor x17,x22,x22,ror#5
+ add x21,x21,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x25,x25,x21 // d+=h
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x12,x17,ror#34 // Sigma0(a)
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x21,x21,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x10,x10 // 7
+#endif
+ ldp x11,x12,[x1],#2*8
+ add x21,x21,x17 // h+=Sigma0(a)
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ eor x13,x25,x25,ror#23
+ and x17,x26,x25
+ bic x28,x27,x25
+ add x20,x20,x10 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x13,ror#18 // Sigma1(e)
+ ror x13,x21,#28
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ eor x17,x21,x21,ror#5
+ add x20,x20,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x24,x24,x20 // d+=h
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x13,x17,ror#34 // Sigma0(a)
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x20,x20,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x11,x11 // 8
+#endif
+ add x20,x20,x17 // h+=Sigma0(a)
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ eor x14,x24,x24,ror#23
+ and x17,x25,x24
+ bic x19,x26,x24
+ add x27,x27,x11 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x14,ror#18 // Sigma1(e)
+ ror x14,x20,#28
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ eor x17,x20,x20,ror#5
+ add x27,x27,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x23,x23,x27 // d+=h
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x14,x17,ror#34 // Sigma0(a)
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x27,x27,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x12,x12 // 9
+#endif
+ ldp x13,x14,[x1],#2*8
+ add x27,x27,x17 // h+=Sigma0(a)
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ eor x15,x23,x23,ror#23
+ and x17,x24,x23
+ bic x28,x25,x23
+ add x26,x26,x12 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x15,ror#18 // Sigma1(e)
+ ror x15,x27,#28
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ eor x17,x27,x27,ror#5
+ add x26,x26,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x22,x22,x26 // d+=h
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x15,x17,ror#34 // Sigma0(a)
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x26,x26,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x13,x13 // 10
+#endif
+ add x26,x26,x17 // h+=Sigma0(a)
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ eor x0,x22,x22,ror#23
+ and x17,x23,x22
+ bic x19,x24,x22
+ add x25,x25,x13 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x0,ror#18 // Sigma1(e)
+ ror x0,x26,#28
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ eor x17,x26,x26,ror#5
+ add x25,x25,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x21,x21,x25 // d+=h
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x0,x17,ror#34 // Sigma0(a)
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x25,x25,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x14,x14 // 11
+#endif
+ ldp x15,x0,[x1],#2*8
+ add x25,x25,x17 // h+=Sigma0(a)
+ str x6,[sp,#24]
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ eor x6,x21,x21,ror#23
+ and x17,x22,x21
+ bic x28,x23,x21
+ add x24,x24,x14 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x6,ror#18 // Sigma1(e)
+ ror x6,x25,#28
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ eor x17,x25,x25,ror#5
+ add x24,x24,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x20,x20,x24 // d+=h
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x6,x17,ror#34 // Sigma0(a)
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x24,x24,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x15,x15 // 12
+#endif
+ add x24,x24,x17 // h+=Sigma0(a)
+ str x7,[sp,#0]
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ eor x7,x20,x20,ror#23
+ and x17,x21,x20
+ bic x19,x22,x20
+ add x23,x23,x15 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x7,ror#18 // Sigma1(e)
+ ror x7,x24,#28
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ eor x17,x24,x24,ror#5
+ add x23,x23,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x27,x27,x23 // d+=h
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x7,x17,ror#34 // Sigma0(a)
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x23,x23,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x0,x0 // 13
+#endif
+ ldp x1,x2,[x1]
+ add x23,x23,x17 // h+=Sigma0(a)
+ str x8,[sp,#8]
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ eor x8,x27,x27,ror#23
+ and x17,x20,x27
+ bic x28,x21,x27
+ add x22,x22,x0 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x8,ror#18 // Sigma1(e)
+ ror x8,x23,#28
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ eor x17,x23,x23,ror#5
+ add x22,x22,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x26,x26,x22 // d+=h
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x8,x17,ror#34 // Sigma0(a)
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x22,x22,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x1,x1 // 14
+#endif
+ ldr x6,[sp,#24]
+ add x22,x22,x17 // h+=Sigma0(a)
+ str x9,[sp,#16]
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ eor x9,x26,x26,ror#23
+ and x17,x27,x26
+ bic x19,x20,x26
+ add x21,x21,x1 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x9,ror#18 // Sigma1(e)
+ ror x9,x22,#28
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ eor x17,x22,x22,ror#5
+ add x21,x21,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x25,x25,x21 // d+=h
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x9,x17,ror#34 // Sigma0(a)
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x21,x21,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x2,x2 // 15
+#endif
+ ldr x7,[sp,#0]
+ add x21,x21,x17 // h+=Sigma0(a)
+ str x10,[sp,#24]
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ ror x9,x4,#1
+ and x17,x26,x25
+ ror x8,x1,#19
+ bic x28,x27,x25
+ ror x10,x21,#28
+ add x20,x20,x2 // h+=X[i]
+ eor x16,x16,x25,ror#18
+ eor x9,x9,x4,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x25,ror#41 // Sigma1(e)
+ eor x10,x10,x21,ror#34
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x8,x8,x1,ror#61
+ eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
+ add x20,x20,x16 // h+=Sigma1(e)
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x10,x21,ror#39 // Sigma0(a)
+ eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
+ add x3,x3,x12
+ add x24,x24,x20 // d+=h
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x3,x3,x9
+ add x20,x20,x17 // h+=Sigma0(a)
+ add x3,x3,x8
+Loop_16_xx:
+ ldr x8,[sp,#8]
+ str x11,[sp,#0]
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ ror x10,x5,#1
+ and x17,x25,x24
+ ror x9,x2,#19
+ bic x19,x26,x24
+ ror x11,x20,#28
+ add x27,x27,x3 // h+=X[i]
+ eor x16,x16,x24,ror#18
+ eor x10,x10,x5,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x24,ror#41 // Sigma1(e)
+ eor x11,x11,x20,ror#34
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x9,x9,x2,ror#61
+ eor x10,x10,x5,lsr#7 // sigma0(X[i+1])
+ add x27,x27,x16 // h+=Sigma1(e)
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x11,x20,ror#39 // Sigma0(a)
+ eor x9,x9,x2,lsr#6 // sigma1(X[i+14])
+ add x4,x4,x13
+ add x23,x23,x27 // d+=h
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x4,x4,x10
+ add x27,x27,x17 // h+=Sigma0(a)
+ add x4,x4,x9
+ ldr x9,[sp,#16]
+ str x12,[sp,#8]
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ ror x11,x6,#1
+ and x17,x24,x23
+ ror x10,x3,#19
+ bic x28,x25,x23
+ ror x12,x27,#28
+ add x26,x26,x4 // h+=X[i]
+ eor x16,x16,x23,ror#18
+ eor x11,x11,x6,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x23,ror#41 // Sigma1(e)
+ eor x12,x12,x27,ror#34
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x10,x10,x3,ror#61
+ eor x11,x11,x6,lsr#7 // sigma0(X[i+1])
+ add x26,x26,x16 // h+=Sigma1(e)
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x12,x27,ror#39 // Sigma0(a)
+ eor x10,x10,x3,lsr#6 // sigma1(X[i+14])
+ add x5,x5,x14
+ add x22,x22,x26 // d+=h
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x5,x5,x11
+ add x26,x26,x17 // h+=Sigma0(a)
+ add x5,x5,x10
+ ldr x10,[sp,#24]
+ str x13,[sp,#16]
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ ror x12,x7,#1
+ and x17,x23,x22
+ ror x11,x4,#19
+ bic x19,x24,x22
+ ror x13,x26,#28
+ add x25,x25,x5 // h+=X[i]
+ eor x16,x16,x22,ror#18
+ eor x12,x12,x7,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x22,ror#41 // Sigma1(e)
+ eor x13,x13,x26,ror#34
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x11,x11,x4,ror#61
+ eor x12,x12,x7,lsr#7 // sigma0(X[i+1])
+ add x25,x25,x16 // h+=Sigma1(e)
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x13,x26,ror#39 // Sigma0(a)
+ eor x11,x11,x4,lsr#6 // sigma1(X[i+14])
+ add x6,x6,x15
+ add x21,x21,x25 // d+=h
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x6,x6,x12
+ add x25,x25,x17 // h+=Sigma0(a)
+ add x6,x6,x11
+ ldr x11,[sp,#0]
+ str x14,[sp,#24]
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ ror x13,x8,#1
+ and x17,x22,x21
+ ror x12,x5,#19
+ bic x28,x23,x21
+ ror x14,x25,#28
+ add x24,x24,x6 // h+=X[i]
+ eor x16,x16,x21,ror#18
+ eor x13,x13,x8,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x21,ror#41 // Sigma1(e)
+ eor x14,x14,x25,ror#34
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x12,x12,x5,ror#61
+ eor x13,x13,x8,lsr#7 // sigma0(X[i+1])
+ add x24,x24,x16 // h+=Sigma1(e)
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x14,x25,ror#39 // Sigma0(a)
+ eor x12,x12,x5,lsr#6 // sigma1(X[i+14])
+ add x7,x7,x0
+ add x20,x20,x24 // d+=h
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x7,x7,x13
+ add x24,x24,x17 // h+=Sigma0(a)
+ add x7,x7,x12
+ ldr x12,[sp,#8]
+ str x15,[sp,#0]
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ ror x14,x9,#1
+ and x17,x21,x20
+ ror x13,x6,#19
+ bic x19,x22,x20
+ ror x15,x24,#28
+ add x23,x23,x7 // h+=X[i]
+ eor x16,x16,x20,ror#18
+ eor x14,x14,x9,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x20,ror#41 // Sigma1(e)
+ eor x15,x15,x24,ror#34
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x13,x13,x6,ror#61
+ eor x14,x14,x9,lsr#7 // sigma0(X[i+1])
+ add x23,x23,x16 // h+=Sigma1(e)
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x15,x24,ror#39 // Sigma0(a)
+ eor x13,x13,x6,lsr#6 // sigma1(X[i+14])
+ add x8,x8,x1
+ add x27,x27,x23 // d+=h
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x8,x8,x14
+ add x23,x23,x17 // h+=Sigma0(a)
+ add x8,x8,x13
+ ldr x13,[sp,#16]
+ str x0,[sp,#8]
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ ror x15,x10,#1
+ and x17,x20,x27
+ ror x14,x7,#19
+ bic x28,x21,x27
+ ror x0,x23,#28
+ add x22,x22,x8 // h+=X[i]
+ eor x16,x16,x27,ror#18
+ eor x15,x15,x10,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x27,ror#41 // Sigma1(e)
+ eor x0,x0,x23,ror#34
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x14,x14,x7,ror#61
+ eor x15,x15,x10,lsr#7 // sigma0(X[i+1])
+ add x22,x22,x16 // h+=Sigma1(e)
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x0,x23,ror#39 // Sigma0(a)
+ eor x14,x14,x7,lsr#6 // sigma1(X[i+14])
+ add x9,x9,x2
+ add x26,x26,x22 // d+=h
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x9,x9,x15
+ add x22,x22,x17 // h+=Sigma0(a)
+ add x9,x9,x14
+ ldr x14,[sp,#24]
+ str x1,[sp,#16]
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ ror x0,x11,#1
+ and x17,x27,x26
+ ror x15,x8,#19
+ bic x19,x20,x26
+ ror x1,x22,#28
+ add x21,x21,x9 // h+=X[i]
+ eor x16,x16,x26,ror#18
+ eor x0,x0,x11,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x26,ror#41 // Sigma1(e)
+ eor x1,x1,x22,ror#34
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x15,x15,x8,ror#61
+ eor x0,x0,x11,lsr#7 // sigma0(X[i+1])
+ add x21,x21,x16 // h+=Sigma1(e)
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x1,x22,ror#39 // Sigma0(a)
+ eor x15,x15,x8,lsr#6 // sigma1(X[i+14])
+ add x10,x10,x3
+ add x25,x25,x21 // d+=h
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x10,x10,x0
+ add x21,x21,x17 // h+=Sigma0(a)
+ add x10,x10,x15
+ ldr x15,[sp,#0]
+ str x2,[sp,#24]
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ ror x1,x12,#1
+ and x17,x26,x25
+ ror x0,x9,#19
+ bic x28,x27,x25
+ ror x2,x21,#28
+ add x20,x20,x10 // h+=X[i]
+ eor x16,x16,x25,ror#18
+ eor x1,x1,x12,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x25,ror#41 // Sigma1(e)
+ eor x2,x2,x21,ror#34
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x0,x0,x9,ror#61
+ eor x1,x1,x12,lsr#7 // sigma0(X[i+1])
+ add x20,x20,x16 // h+=Sigma1(e)
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x2,x21,ror#39 // Sigma0(a)
+ eor x0,x0,x9,lsr#6 // sigma1(X[i+14])
+ add x11,x11,x4
+ add x24,x24,x20 // d+=h
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x11,x11,x1
+ add x20,x20,x17 // h+=Sigma0(a)
+ add x11,x11,x0
+ ldr x0,[sp,#8]
+ str x3,[sp,#0]
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ ror x2,x13,#1
+ and x17,x25,x24
+ ror x1,x10,#19
+ bic x19,x26,x24
+ ror x3,x20,#28
+ add x27,x27,x11 // h+=X[i]
+ eor x16,x16,x24,ror#18
+ eor x2,x2,x13,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x24,ror#41 // Sigma1(e)
+ eor x3,x3,x20,ror#34
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x1,x1,x10,ror#61
+ eor x2,x2,x13,lsr#7 // sigma0(X[i+1])
+ add x27,x27,x16 // h+=Sigma1(e)
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x3,x20,ror#39 // Sigma0(a)
+ eor x1,x1,x10,lsr#6 // sigma1(X[i+14])
+ add x12,x12,x5
+ add x23,x23,x27 // d+=h
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x12,x12,x2
+ add x27,x27,x17 // h+=Sigma0(a)
+ add x12,x12,x1
+ ldr x1,[sp,#16]
+ str x4,[sp,#8]
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ ror x3,x14,#1
+ and x17,x24,x23
+ ror x2,x11,#19
+ bic x28,x25,x23
+ ror x4,x27,#28
+ add x26,x26,x12 // h+=X[i]
+ eor x16,x16,x23,ror#18
+ eor x3,x3,x14,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x23,ror#41 // Sigma1(e)
+ eor x4,x4,x27,ror#34
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x2,x2,x11,ror#61
+ eor x3,x3,x14,lsr#7 // sigma0(X[i+1])
+ add x26,x26,x16 // h+=Sigma1(e)
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x4,x27,ror#39 // Sigma0(a)
+ eor x2,x2,x11,lsr#6 // sigma1(X[i+14])
+ add x13,x13,x6
+ add x22,x22,x26 // d+=h
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x13,x13,x3
+ add x26,x26,x17 // h+=Sigma0(a)
+ add x13,x13,x2
+ ldr x2,[sp,#24]
+ str x5,[sp,#16]
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ ror x4,x15,#1
+ and x17,x23,x22
+ ror x3,x12,#19
+ bic x19,x24,x22
+ ror x5,x26,#28
+ add x25,x25,x13 // h+=X[i]
+ eor x16,x16,x22,ror#18
+ eor x4,x4,x15,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x22,ror#41 // Sigma1(e)
+ eor x5,x5,x26,ror#34
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x3,x3,x12,ror#61
+ eor x4,x4,x15,lsr#7 // sigma0(X[i+1])
+ add x25,x25,x16 // h+=Sigma1(e)
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x5,x26,ror#39 // Sigma0(a)
+ eor x3,x3,x12,lsr#6 // sigma1(X[i+14])
+ add x14,x14,x7
+ add x21,x21,x25 // d+=h
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x14,x14,x4
+ add x25,x25,x17 // h+=Sigma0(a)
+ add x14,x14,x3
+ ldr x3,[sp,#0]
+ str x6,[sp,#24]
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ ror x5,x0,#1
+ and x17,x22,x21
+ ror x4,x13,#19
+ bic x28,x23,x21
+ ror x6,x25,#28
+ add x24,x24,x14 // h+=X[i]
+ eor x16,x16,x21,ror#18
+ eor x5,x5,x0,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x21,ror#41 // Sigma1(e)
+ eor x6,x6,x25,ror#34
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x4,x4,x13,ror#61
+ eor x5,x5,x0,lsr#7 // sigma0(X[i+1])
+ add x24,x24,x16 // h+=Sigma1(e)
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x6,x25,ror#39 // Sigma0(a)
+ eor x4,x4,x13,lsr#6 // sigma1(X[i+14])
+ add x15,x15,x8
+ add x20,x20,x24 // d+=h
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x15,x15,x5
+ add x24,x24,x17 // h+=Sigma0(a)
+ add x15,x15,x4
+ ldr x4,[sp,#8]
+ str x7,[sp,#0]
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ ror x6,x1,#1
+ and x17,x21,x20
+ ror x5,x14,#19
+ bic x19,x22,x20
+ ror x7,x24,#28
+ add x23,x23,x15 // h+=X[i]
+ eor x16,x16,x20,ror#18
+ eor x6,x6,x1,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x20,ror#41 // Sigma1(e)
+ eor x7,x7,x24,ror#34
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x5,x5,x14,ror#61
+ eor x6,x6,x1,lsr#7 // sigma0(X[i+1])
+ add x23,x23,x16 // h+=Sigma1(e)
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x7,x24,ror#39 // Sigma0(a)
+ eor x5,x5,x14,lsr#6 // sigma1(X[i+14])
+ add x0,x0,x9
+ add x27,x27,x23 // d+=h
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x0,x0,x6
+ add x23,x23,x17 // h+=Sigma0(a)
+ add x0,x0,x5
+ ldr x5,[sp,#16]
+ str x8,[sp,#8]
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ ror x7,x2,#1
+ and x17,x20,x27
+ ror x6,x15,#19
+ bic x28,x21,x27
+ ror x8,x23,#28
+ add x22,x22,x0 // h+=X[i]
+ eor x16,x16,x27,ror#18
+ eor x7,x7,x2,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x27,ror#41 // Sigma1(e)
+ eor x8,x8,x23,ror#34
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x6,x6,x15,ror#61
+ eor x7,x7,x2,lsr#7 // sigma0(X[i+1])
+ add x22,x22,x16 // h+=Sigma1(e)
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x8,x23,ror#39 // Sigma0(a)
+ eor x6,x6,x15,lsr#6 // sigma1(X[i+14])
+ add x1,x1,x10
+ add x26,x26,x22 // d+=h
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x1,x1,x7
+ add x22,x22,x17 // h+=Sigma0(a)
+ add x1,x1,x6
+ ldr x6,[sp,#24]
+ str x9,[sp,#16]
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ ror x8,x3,#1
+ and x17,x27,x26
+ ror x7,x0,#19
+ bic x19,x20,x26
+ ror x9,x22,#28
+ add x21,x21,x1 // h+=X[i]
+ eor x16,x16,x26,ror#18
+ eor x8,x8,x3,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x26,ror#41 // Sigma1(e)
+ eor x9,x9,x22,ror#34
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x7,x7,x0,ror#61
+ eor x8,x8,x3,lsr#7 // sigma0(X[i+1])
+ add x21,x21,x16 // h+=Sigma1(e)
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x9,x22,ror#39 // Sigma0(a)
+ eor x7,x7,x0,lsr#6 // sigma1(X[i+14])
+ add x2,x2,x11
+ add x25,x25,x21 // d+=h
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x2,x2,x8
+ add x21,x21,x17 // h+=Sigma0(a)
+ add x2,x2,x7
+ ldr x7,[sp,#0]
+ str x10,[sp,#24]
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ ror x9,x4,#1
+ and x17,x26,x25
+ ror x8,x1,#19
+ bic x28,x27,x25
+ ror x10,x21,#28
+ add x20,x20,x2 // h+=X[i]
+ eor x16,x16,x25,ror#18
+ eor x9,x9,x4,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x25,ror#41 // Sigma1(e)
+ eor x10,x10,x21,ror#34
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x8,x8,x1,ror#61
+ eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
+ add x20,x20,x16 // h+=Sigma1(e)
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x10,x21,ror#39 // Sigma0(a)
+ eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
+ add x3,x3,x12
+ add x24,x24,x20 // d+=h
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x3,x3,x9
+ add x20,x20,x17 // h+=Sigma0(a)
+ add x3,x3,x8
+ cbnz x19,Loop_16_xx
+
+ ldp x0,x2,[x29,#96]
+ ldr x1,[x29,#112]
+ sub x30,x30,#648 // rewind
+
+ ldp x3,x4,[x0]
+ ldp x5,x6,[x0,#2*8]
+ add x1,x1,#14*8 // advance input pointer
+ ldp x7,x8,[x0,#4*8]
+ add x20,x20,x3
+ ldp x9,x10,[x0,#6*8]
+ add x21,x21,x4
+ add x22,x22,x5
+ add x23,x23,x6
+ stp x20,x21,[x0]
+ add x24,x24,x7
+ add x25,x25,x8
+ stp x22,x23,[x0,#2*8]
+ add x26,x26,x9
+ add x27,x27,x10
+ cmp x1,x2
+ stp x24,x25,[x0,#4*8]
+ stp x26,x27,[x0,#6*8]
+ b.ne Loop
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#4*8
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.section .rodata
+.align 6
+
+LK512:
+.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad 0x3956c25bf348b538,0x59f111f1b605d019
+.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad 0xd807aa98a3030242,0x12835b0145706fbe
+.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad 0x06ca6351e003826f,0x142929670a0e6e70
+.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad 0x81c2c92e47edaee6,0x92722c851482353b
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad 0xd192e819d6ef5218,0xd69906245565a910
+.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad 0x90befffa23631e28,0xa4506cebde82bde9
+.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad 0xca273eceea26619c,0xd186b8c721c0c207
+.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad 0x113f9804bef90dae,0x1b710b35131c471b
+.quad 0x28db77f523047d84,0x32caab7b40c72493
+.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad 0 // terminator
+
+.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+.text
+#ifndef __KERNEL__
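+// The routine below uses the Armv8 SHA-512 crypto extension (sha512h,
+// sha512h2, sha512su0, sha512su1). The instructions are emitted as raw
+// .long encodings, with the mnemonic in a trailing comment, presumably so
+// that the file assembles even with toolchains that predate the extension.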
+.globl sha512_block_data_order_hw
+
+.def sha512_block_data_order_hw
+ .type 32
+.endef
+.align 6
+sha512_block_data_order_hw:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+ ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context
+ adrp x3,LK512
+ add x3,x3,:lo12:LK512
+
+ rev64 v16.16b,v16.16b
+ rev64 v17.16b,v17.16b
+ rev64 v18.16b,v18.16b
+ rev64 v19.16b,v19.16b
+ rev64 v20.16b,v20.16b
+ rev64 v21.16b,v21.16b
+ rev64 v22.16b,v22.16b
+ rev64 v23.16b,v23.16b
+ b Loop_hw
+
+.align 4
+Loop_hw:
+ ld1 {v24.2d},[x3],#16
+ subs x2,x2,#1
+ sub x4,x1,#128
+ orr v26.16b,v0.16b,v0.16b // offload
+ orr v27.16b,v1.16b,v1.16b
+ orr v28.16b,v2.16b,v2.16b
+ orr v29.16b,v3.16b,v3.16b
+ csel x1,x1,x4,ne // conditional rewind
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v16.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+ rev64 v16.16b,v16.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ ld1 {v24.2d},[x3],#16
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v17.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+ rev64 v17.16b,v17.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v18.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+ rev64 v18.16b,v18.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ ld1 {v24.2d},[x3],#16
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v19.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+ rev64 v19.16b,v19.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v20.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+ rev64 v20.16b,v20.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ ld1 {v24.2d},[x3],#16
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v21.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+ rev64 v21.16b,v21.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v22.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+ rev64 v22.16b,v22.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ sub x3,x3,#80*8 // rewind
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v23.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+ rev64 v23.16b,v23.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v0.2d,v0.2d,v26.2d // accumulate
+ add v1.2d,v1.2d,v27.2d
+ add v2.2d,v2.2d,v28.2d
+ add v3.2d,v3.2d,v29.2d
+
+ cbnz x2,Loop_hw
+
+ st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context
+
+ ldr x29,[sp],#16
+ ret
+
+#endif
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/sha512-x86_64-apple.S b/gen/bcm/sha512-x86_64-apple.S
new file mode 100644
index 0000000..58f27a4
--- /dev/null
+++ b/gen/bcm/sha512-x86_64-apple.S
@@ -0,0 +1,2978 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text
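+
+// The _nohw routine below is the generic x86-64 implementation of the
+// SHA-512 block function: the sixteen 64-bit message-schedule words are kept
+// in the 128-byte area at the bottom of the stack frame, the round constants
+// are read from the K512 table via %rbp, and the eight working variables
+// live in %rax, %rbx, %rcx, %rdx and %r8-%r11.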
+
+.globl _sha512_block_data_order_nohw
+.private_extern _sha512_block_data_order_nohw
+
+.p2align 4
+_sha512_block_data_order_nohw:
+
+_CET_ENDBR
+ movq %rsp,%rax
+
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ shlq $4,%rdx
+ subq $128+32,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ andq $-64,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %rax,152(%rsp)
+
+L$prologue:
+
+ movq 0(%rdi),%rax
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%rcx
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp L$loop
+
+.p2align 4
+L$loop:
+ movq %rbx,%rdi
+ leaq K512(%rip),%rbp
+ xorq %rcx,%rdi
+ movq 0(%rsi),%r12
+ movq %r8,%r13
+ movq %rax,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r9,%r15
+
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+
+ movq %r12,0(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%r11
+ movq 8(%rsi),%r12
+ movq %rdx,%r13
+ movq %r11,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r8,%rdi
+
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+
+ movq %r12,8(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%r10
+ movq 16(%rsi),%r12
+ movq %rcx,%r13
+ movq %r10,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rdx,%r15
+
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+
+ movq %r12,16(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%r9
+ movq 24(%rsi),%r12
+ movq %rbx,%r13
+ movq %r9,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rcx,%rdi
+
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+
+ movq %r12,24(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%r8
+ movq 32(%rsi),%r12
+ movq %rax,%r13
+ movq %r8,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rbx,%r15
+
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+
+ movq %r12,32(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%rdx
+ movq 40(%rsi),%r12
+ movq %r11,%r13
+ movq %rdx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rax,%rdi
+
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+
+ movq %r12,40(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%rcx
+ movq 48(%rsi),%r12
+ movq %r10,%r13
+ movq %rcx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r11,%r15
+
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+
+ movq %r12,48(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%rbx
+ movq 56(%rsi),%r12
+ movq %r9,%r13
+ movq %rbx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r10,%rdi
+
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+
+ movq %r12,56(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%rax
+ movq 64(%rsi),%r12
+ movq %r8,%r13
+ movq %rax,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r9,%r15
+
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+
+ movq %r12,64(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%r11
+ movq 72(%rsi),%r12
+ movq %rdx,%r13
+ movq %r11,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r8,%rdi
+
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+
+ movq %r12,72(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%r10
+ movq 80(%rsi),%r12
+ movq %rcx,%r13
+ movq %r10,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rdx,%r15
+
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+
+ movq %r12,80(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%r9
+ movq 88(%rsi),%r12
+ movq %rbx,%r13
+ movq %r9,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rcx,%rdi
+
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+
+ movq %r12,88(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%r8
+ movq 96(%rsi),%r12
+ movq %rax,%r13
+ movq %r8,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rbx,%r15
+
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+
+ movq %r12,96(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%rdx
+ movq 104(%rsi),%r12
+ movq %r11,%r13
+ movq %rdx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rax,%rdi
+
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+
+ movq %r12,104(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%rcx
+ movq 112(%rsi),%r12
+ movq %r10,%r13
+ movq %rcx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r11,%r15
+
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+
+ movq %r12,112(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%rbx
+ movq 120(%rsi),%r12
+ movq %r9,%r13
+ movq %rbx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r10,%rdi
+
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+
+ movq %r12,120(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+
+ leaq 24(%rbp),%rbp
+ jmp L$rounds_16_xx
+.p2align 4
+L$rounds_16_xx:
+ movq 8(%rsp),%r13
+ movq 112(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rax
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 72(%rsp),%r12
+
+ addq 0(%rsp),%r12
+ movq %r8,%r13
+ addq %r15,%r12
+ movq %rax,%r14
+ rorq $23,%r13
+ movq %r9,%r15
+
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+
+ movq %r12,0(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+
+ leaq 8(%rbp),%rbp
+ movq 16(%rsp),%r13
+ movq 120(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r11
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 80(%rsp),%r12
+
+ addq 8(%rsp),%r12
+ movq %rdx,%r13
+ addq %rdi,%r12
+ movq %r11,%r14
+ rorq $23,%r13
+ movq %r8,%rdi
+
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+
+ movq %r12,8(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+
+ leaq 24(%rbp),%rbp
+ movq 24(%rsp),%r13
+ movq 0(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r10
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 88(%rsp),%r12
+
+ addq 16(%rsp),%r12
+ movq %rcx,%r13
+ addq %r15,%r12
+ movq %r10,%r14
+ rorq $23,%r13
+ movq %rdx,%r15
+
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+
+ movq %r12,16(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+
+ leaq 8(%rbp),%rbp
+ movq 32(%rsp),%r13
+ movq 8(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r9
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 96(%rsp),%r12
+
+ addq 24(%rsp),%r12
+ movq %rbx,%r13
+ addq %rdi,%r12
+ movq %r9,%r14
+ rorq $23,%r13
+ movq %rcx,%rdi
+
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+
+ movq %r12,24(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+
+ leaq 24(%rbp),%rbp
+ movq 40(%rsp),%r13
+ movq 16(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r8
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 104(%rsp),%r12
+
+ addq 32(%rsp),%r12
+ movq %rax,%r13
+ addq %r15,%r12
+ movq %r8,%r14
+ rorq $23,%r13
+ movq %rbx,%r15
+
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+
+ movq %r12,32(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+
+ leaq 8(%rbp),%rbp
+ movq 48(%rsp),%r13
+ movq 24(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rdx
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 112(%rsp),%r12
+
+ addq 40(%rsp),%r12
+ movq %r11,%r13
+ addq %rdi,%r12
+ movq %rdx,%r14
+ rorq $23,%r13
+ movq %rax,%rdi
+
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+
+ movq %r12,40(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+
+ leaq 24(%rbp),%rbp
+ movq 56(%rsp),%r13
+ movq 32(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rcx
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 120(%rsp),%r12
+
+ addq 48(%rsp),%r12
+ movq %r10,%r13
+ addq %r15,%r12
+ movq %rcx,%r14
+ rorq $23,%r13
+ movq %r11,%r15
+
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+
+ movq %r12,48(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+
+ leaq 8(%rbp),%rbp
+ movq 64(%rsp),%r13
+ movq 40(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rbx
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 0(%rsp),%r12
+
+ addq 56(%rsp),%r12
+ movq %r9,%r13
+ addq %rdi,%r12
+ movq %rbx,%r14
+ rorq $23,%r13
+ movq %r10,%rdi
+
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+
+ movq %r12,56(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+
+ leaq 24(%rbp),%rbp
+ movq 72(%rsp),%r13
+ movq 48(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rax
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 8(%rsp),%r12
+
+ addq 64(%rsp),%r12
+ movq %r8,%r13
+ addq %r15,%r12
+ movq %rax,%r14
+ rorq $23,%r13
+ movq %r9,%r15
+
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+
+ movq %r12,64(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+
+ leaq 8(%rbp),%rbp
+ movq 80(%rsp),%r13
+ movq 56(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r11
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 16(%rsp),%r12
+
+ addq 72(%rsp),%r12
+ movq %rdx,%r13
+ addq %rdi,%r12
+ movq %r11,%r14
+ rorq $23,%r13
+ movq %r8,%rdi
+
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+
+ movq %r12,72(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+
+ leaq 24(%rbp),%rbp
+ movq 88(%rsp),%r13
+ movq 64(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r10
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 24(%rsp),%r12
+
+ addq 80(%rsp),%r12
+ movq %rcx,%r13
+ addq %r15,%r12
+ movq %r10,%r14
+ rorq $23,%r13
+ movq %rdx,%r15
+
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+
+ movq %r12,80(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+
+ leaq 8(%rbp),%rbp
+ movq 96(%rsp),%r13
+ movq 72(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r9
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 32(%rsp),%r12
+
+ addq 88(%rsp),%r12
+ movq %rbx,%r13
+ addq %rdi,%r12
+ movq %r9,%r14
+ rorq $23,%r13
+ movq %rcx,%rdi
+
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+
+ movq %r12,88(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+
+ leaq 24(%rbp),%rbp
+ movq 104(%rsp),%r13
+ movq 80(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r8
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 40(%rsp),%r12
+
+ addq 96(%rsp),%r12
+ movq %rax,%r13
+ addq %r15,%r12
+ movq %r8,%r14
+ rorq $23,%r13
+ movq %rbx,%r15
+
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+
+ movq %r12,96(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+
+ leaq 8(%rbp),%rbp
+ movq 112(%rsp),%r13
+ movq 88(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rdx
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 48(%rsp),%r12
+
+ addq 104(%rsp),%r12
+ movq %r11,%r13
+ addq %rdi,%r12
+ movq %rdx,%r14
+ rorq $23,%r13
+ movq %rax,%rdi
+
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+
+ movq %r12,104(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+
+ leaq 24(%rbp),%rbp
+ movq 120(%rsp),%r13
+ movq 96(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rcx
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 56(%rsp),%r12
+
+ addq 112(%rsp),%r12
+ movq %r10,%r13
+ addq %r15,%r12
+ movq %rcx,%r14
+ rorq $23,%r13
+ movq %r11,%r15
+
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+
+ movq %r12,112(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+
+ leaq 8(%rbp),%rbp
+ movq 0(%rsp),%r13
+ movq 104(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rbx
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 64(%rsp),%r12
+
+ addq 120(%rsp),%r12
+ movq %r9,%r13
+ addq %rdi,%r12
+ movq %rbx,%r14
+ rorq $23,%r13
+ movq %r10,%rdi
+
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+
+ movq %r12,120(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+
+ leaq 24(%rbp),%rbp
+ cmpb $0,7(%rbp)
+ jnz L$rounds_16_xx
+
+ movq 128+0(%rsp),%rdi
+ addq %r14,%rax
+ leaq 128(%rsi),%rsi
+
+ addq 0(%rdi),%rax
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ cmpq 128+16(%rsp),%rsi
+
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+ jb L$loop
+
+ movq 152(%rsp),%rsi
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
+L$epilogue:
+ ret
+
+
+.section __DATA,__const
+.p2align 6
+
+K512:
+.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad 0x3956c25bf348b538,0x59f111f1b605d019
+.quad 0x3956c25bf348b538,0x59f111f1b605d019
+.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad 0xd807aa98a3030242,0x12835b0145706fbe
+.quad 0xd807aa98a3030242,0x12835b0145706fbe
+.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad 0x06ca6351e003826f,0x142929670a0e6e70
+.quad 0x06ca6351e003826f,0x142929670a0e6e70
+.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad 0x81c2c92e47edaee6,0x92722c851482353b
+.quad 0x81c2c92e47edaee6,0x92722c851482353b
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad 0xd192e819d6ef5218,0xd69906245565a910
+.quad 0xd192e819d6ef5218,0xd69906245565a910
+.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad 0x90befffa23631e28,0xa4506cebde82bde9
+.quad 0x90befffa23631e28,0xa4506cebde82bde9
+.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad 0xca273eceea26619c,0xd186b8c721c0c207
+.quad 0xca273eceea26619c,0xd186b8c721c0c207
+.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad 0x113f9804bef90dae,0x1b710b35131c471b
+.quad 0x113f9804bef90dae,0x1b710b35131c471b
+.quad 0x28db77f523047d84,0x32caab7b40c72493
+.quad 0x28db77f523047d84,0x32caab7b40c72493
+.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+.quad 0x0001020304050607,0x08090a0b0c0d0e0f
+.quad 0x0001020304050607,0x08090a0b0c0d0e0f
+.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.text
+.globl _sha512_block_data_order_avx
+.private_extern _sha512_block_data_order_avx
+
+.p2align 6
+_sha512_block_data_order_avx:
+
+_CET_ENDBR
+ movq %rsp,%rax
+
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ shlq $4,%rdx
+ subq $160,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ andq $-64,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %rax,152(%rsp)
+
+L$prologue_avx:
+
+ vzeroupper
+ movq 0(%rdi),%rax
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%rcx
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp L$loop_avx
+.p2align 4
+L$loop_avx:
+ vmovdqa K512+1280(%rip),%xmm11
+ vmovdqu 0(%rsi),%xmm0
+ leaq K512+128(%rip),%rbp
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vpshufb %xmm11,%xmm0,%xmm0
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm11,%xmm1,%xmm1
+ vmovdqu 64(%rsi),%xmm4
+ vpshufb %xmm11,%xmm2,%xmm2
+ vmovdqu 80(%rsi),%xmm5
+ vpshufb %xmm11,%xmm3,%xmm3
+ vmovdqu 96(%rsi),%xmm6
+ vpshufb %xmm11,%xmm4,%xmm4
+ vmovdqu 112(%rsi),%xmm7
+ vpshufb %xmm11,%xmm5,%xmm5
+ vpaddq -128(%rbp),%xmm0,%xmm8
+ vpshufb %xmm11,%xmm6,%xmm6
+ vpaddq -96(%rbp),%xmm1,%xmm9
+ vpshufb %xmm11,%xmm7,%xmm7
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ vpaddq -32(%rbp),%xmm3,%xmm11
+ vmovdqa %xmm8,0(%rsp)
+ vpaddq 0(%rbp),%xmm4,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ vpaddq 32(%rbp),%xmm5,%xmm9
+ vmovdqa %xmm10,32(%rsp)
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ vmovdqa %xmm11,48(%rsp)
+ vpaddq 96(%rbp),%xmm7,%xmm11
+ vmovdqa %xmm8,64(%rsp)
+ movq %rax,%r14
+ vmovdqa %xmm9,80(%rsp)
+ movq %rbx,%rdi
+ vmovdqa %xmm10,96(%rsp)
+ xorq %rcx,%rdi
+ vmovdqa %xmm11,112(%rsp)
+ movq %r8,%r13
+ jmp L$avx_00_47
+
+.p2align 4
+L$avx_00_47:
+ addq $256,%rbp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm4,%xmm5,%xmm11
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpaddq %xmm11,%xmm0,%xmm0
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r8,%r12
+ xorq %r8,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rbx,%r15
+ addq %r12,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm7,%xmm11
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rdx,%r13
+ addq %r11,%r14
+ vpsllq $3,%xmm7,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ vpaddq %xmm8,%xmm0,%xmm0
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm7,%xmm9
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rax,%rdi
+ addq %r12,%r10
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm0,%xmm0
+ xorq %r11,%r14
+ addq %r13,%r10
+ vpaddq -128(%rbp),%xmm0,%xmm10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,0(%rsp)
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm5,%xmm6,%xmm11
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpaddq %xmm11,%xmm1,%xmm1
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r11,%r15
+ addq %r12,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm0,%xmm11
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rbx,%r13
+ addq %r9,%r14
+ vpsllq $3,%xmm0,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ vpaddq %xmm8,%xmm1,%xmm1
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm0,%xmm9
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r10,%rdi
+ addq %r12,%r8
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm1,%xmm1
+ xorq %r9,%r14
+ addq %r13,%r8
+ vpaddq -96(%rbp),%xmm1,%xmm10
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,16(%rsp)
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm6,%xmm7,%xmm11
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpaddq %xmm11,%xmm2,%xmm2
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rax,%r12
+ xorq %rax,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r9,%r15
+ addq %r12,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm1,%xmm11
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r11,%r13
+ addq %rdx,%r14
+ vpsllq $3,%xmm1,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ vpaddq %xmm8,%xmm2,%xmm2
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm1,%xmm9
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm2,%xmm2
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,32(%rsp)
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm7,%xmm0,%xmm11
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpaddq %xmm11,%xmm3,%xmm3
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r10,%r12
+ xorq %r10,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm2,%xmm11
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r9,%r13
+ addq %rbx,%r14
+ vpsllq $3,%xmm2,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ vpaddq %xmm8,%xmm3,%xmm3
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm2,%xmm9
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm3,%xmm3
+ xorq %rbx,%r14
+ addq %r13,%rax
+ vpaddq -32(%rbp),%xmm3,%xmm10
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,48(%rsp)
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm0,%xmm1,%xmm11
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpaddq %xmm11,%xmm4,%xmm4
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r8,%r12
+ xorq %r8,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rbx,%r15
+ addq %r12,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm3,%xmm11
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rdx,%r13
+ addq %r11,%r14
+ vpsllq $3,%xmm3,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ vpaddq %xmm8,%xmm4,%xmm4
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm3,%xmm9
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rax,%rdi
+ addq %r12,%r10
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm4,%xmm4
+ xorq %r11,%r14
+ addq %r13,%r10
+ vpaddq 0(%rbp),%xmm4,%xmm10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,64(%rsp)
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm1,%xmm2,%xmm11
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpaddq %xmm11,%xmm5,%xmm5
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r11,%r15
+ addq %r12,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm4,%xmm11
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rbx,%r13
+ addq %r9,%r14
+ vpsllq $3,%xmm4,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ vpaddq %xmm8,%xmm5,%xmm5
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm4,%xmm9
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r10,%rdi
+ addq %r12,%r8
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm5,%xmm5
+ xorq %r9,%r14
+ addq %r13,%r8
+ vpaddq 32(%rbp),%xmm5,%xmm10
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,80(%rsp)
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm2,%xmm3,%xmm11
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpaddq %xmm11,%xmm6,%xmm6
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rax,%r12
+ xorq %rax,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r9,%r15
+ addq %r12,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm5,%xmm11
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r11,%r13
+ addq %rdx,%r14
+ vpsllq $3,%xmm5,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ vpaddq %xmm8,%xmm6,%xmm6
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm5,%xmm9
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm6,%xmm6
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,96(%rsp)
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm3,%xmm4,%xmm11
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpaddq %xmm11,%xmm7,%xmm7
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r10,%r12
+ xorq %r10,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm6,%xmm11
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r9,%r13
+ addq %rbx,%r14
+ vpsllq $3,%xmm6,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ vpaddq %xmm8,%xmm7,%xmm7
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm6,%xmm9
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm7,%xmm7
+ xorq %rbx,%r14
+ addq %r13,%rax
+ vpaddq 96(%rbp),%xmm7,%xmm10
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,112(%rsp)
+ cmpb $0,135(%rbp)
+ jne L$avx_00_47
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ movq 128+0(%rsp),%rdi
+ movq %r14,%rax
+
+ addq 0(%rdi),%rax
+ leaq 128(%rsi),%rsi
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ cmpq 128+16(%rsp),%rsi
+
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+ jb L$loop_avx
+
+ movq 152(%rsp),%rsi
+
+ vzeroupper
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
+L$epilogue_avx:
+ ret
+
+
+#endif
diff --git a/gen/bcm/sha512-x86_64-linux.S b/gen/bcm/sha512-x86_64-linux.S
new file mode 100644
index 0000000..bbef943
--- /dev/null
+++ b/gen/bcm/sha512-x86_64-linux.S
@@ -0,0 +1,2978 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text
+
+.globl sha512_block_data_order_nohw
+.hidden sha512_block_data_order_nohw
+.type sha512_block_data_order_nohw,@function
+.align 16
+sha512_block_data_order_nohw:
+.cfi_startproc
+_CET_ENDBR
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $128+32,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ andq $-64,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %rax,152(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue:
+
+ movq 0(%rdi),%rax
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%rcx
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp .Lloop
+
+.align 16
+.Lloop:
+ movq %rbx,%rdi
+ leaq K512(%rip),%rbp
+ xorq %rcx,%rdi
+ movq 0(%rsi),%r12
+ movq %r8,%r13
+ movq %rax,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r9,%r15
+
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+
+ movq %r12,0(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%r11
+ movq 8(%rsi),%r12
+ movq %rdx,%r13
+ movq %r11,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r8,%rdi
+
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+
+ movq %r12,8(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%r10
+ movq 16(%rsi),%r12
+ movq %rcx,%r13
+ movq %r10,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rdx,%r15
+
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+
+ movq %r12,16(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%r9
+ movq 24(%rsi),%r12
+ movq %rbx,%r13
+ movq %r9,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rcx,%rdi
+
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+
+ movq %r12,24(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%r8
+ movq 32(%rsi),%r12
+ movq %rax,%r13
+ movq %r8,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rbx,%r15
+
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+
+ movq %r12,32(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%rdx
+ movq 40(%rsi),%r12
+ movq %r11,%r13
+ movq %rdx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rax,%rdi
+
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+
+ movq %r12,40(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%rcx
+ movq 48(%rsi),%r12
+ movq %r10,%r13
+ movq %rcx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r11,%r15
+
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+
+ movq %r12,48(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%rbx
+ movq 56(%rsi),%r12
+ movq %r9,%r13
+ movq %rbx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r10,%rdi
+
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+
+ movq %r12,56(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%rax
+ movq 64(%rsi),%r12
+ movq %r8,%r13
+ movq %rax,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r9,%r15
+
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+
+ movq %r12,64(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%r11
+ movq 72(%rsi),%r12
+ movq %rdx,%r13
+ movq %r11,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r8,%rdi
+
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+
+ movq %r12,72(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%r10
+ movq 80(%rsi),%r12
+ movq %rcx,%r13
+ movq %r10,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rdx,%r15
+
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+
+ movq %r12,80(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%r9
+ movq 88(%rsi),%r12
+ movq %rbx,%r13
+ movq %r9,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rcx,%rdi
+
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+
+ movq %r12,88(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%r8
+ movq 96(%rsi),%r12
+ movq %rax,%r13
+ movq %r8,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rbx,%r15
+
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+
+ movq %r12,96(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%rdx
+ movq 104(%rsi),%r12
+ movq %r11,%r13
+ movq %rdx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rax,%rdi
+
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+
+ movq %r12,104(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%rcx
+ movq 112(%rsi),%r12
+ movq %r10,%r13
+ movq %rcx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r11,%r15
+
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+
+ movq %r12,112(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%rbx
+ movq 120(%rsi),%r12
+ movq %r9,%r13
+ movq %rbx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r10,%rdi
+
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+
+ movq %r12,120(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+
+ leaq 24(%rbp),%rbp
+ jmp .Lrounds_16_xx
+.align 16
+.Lrounds_16_xx:
+ movq 8(%rsp),%r13
+ movq 112(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rax
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 72(%rsp),%r12
+
+ addq 0(%rsp),%r12
+ movq %r8,%r13
+ addq %r15,%r12
+ movq %rax,%r14
+ rorq $23,%r13
+ movq %r9,%r15
+
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+
+ movq %r12,0(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+
+ leaq 8(%rbp),%rbp
+ movq 16(%rsp),%r13
+ movq 120(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r11
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 80(%rsp),%r12
+
+ addq 8(%rsp),%r12
+ movq %rdx,%r13
+ addq %rdi,%r12
+ movq %r11,%r14
+ rorq $23,%r13
+ movq %r8,%rdi
+
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+
+ movq %r12,8(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+
+ leaq 24(%rbp),%rbp
+ movq 24(%rsp),%r13
+ movq 0(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r10
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 88(%rsp),%r12
+
+ addq 16(%rsp),%r12
+ movq %rcx,%r13
+ addq %r15,%r12
+ movq %r10,%r14
+ rorq $23,%r13
+ movq %rdx,%r15
+
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+
+ movq %r12,16(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+
+ leaq 8(%rbp),%rbp
+ movq 32(%rsp),%r13
+ movq 8(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r9
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 96(%rsp),%r12
+
+ addq 24(%rsp),%r12
+ movq %rbx,%r13
+ addq %rdi,%r12
+ movq %r9,%r14
+ rorq $23,%r13
+ movq %rcx,%rdi
+
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+
+ movq %r12,24(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+
+ leaq 24(%rbp),%rbp
+ movq 40(%rsp),%r13
+ movq 16(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r8
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 104(%rsp),%r12
+
+ addq 32(%rsp),%r12
+ movq %rax,%r13
+ addq %r15,%r12
+ movq %r8,%r14
+ rorq $23,%r13
+ movq %rbx,%r15
+
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+
+ movq %r12,32(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+
+ leaq 8(%rbp),%rbp
+ movq 48(%rsp),%r13
+ movq 24(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rdx
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 112(%rsp),%r12
+
+ addq 40(%rsp),%r12
+ movq %r11,%r13
+ addq %rdi,%r12
+ movq %rdx,%r14
+ rorq $23,%r13
+ movq %rax,%rdi
+
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+
+ movq %r12,40(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+
+ leaq 24(%rbp),%rbp
+ movq 56(%rsp),%r13
+ movq 32(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rcx
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 120(%rsp),%r12
+
+ addq 48(%rsp),%r12
+ movq %r10,%r13
+ addq %r15,%r12
+ movq %rcx,%r14
+ rorq $23,%r13
+ movq %r11,%r15
+
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+
+ movq %r12,48(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+
+ leaq 8(%rbp),%rbp
+ movq 64(%rsp),%r13
+ movq 40(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rbx
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 0(%rsp),%r12
+
+ addq 56(%rsp),%r12
+ movq %r9,%r13
+ addq %rdi,%r12
+ movq %rbx,%r14
+ rorq $23,%r13
+ movq %r10,%rdi
+
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+
+ movq %r12,56(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+
+ leaq 24(%rbp),%rbp
+ movq 72(%rsp),%r13
+ movq 48(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rax
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 8(%rsp),%r12
+
+ addq 64(%rsp),%r12
+ movq %r8,%r13
+ addq %r15,%r12
+ movq %rax,%r14
+ rorq $23,%r13
+ movq %r9,%r15
+
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+
+ movq %r12,64(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+
+ leaq 8(%rbp),%rbp
+ movq 80(%rsp),%r13
+ movq 56(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r11
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 16(%rsp),%r12
+
+ addq 72(%rsp),%r12
+ movq %rdx,%r13
+ addq %rdi,%r12
+ movq %r11,%r14
+ rorq $23,%r13
+ movq %r8,%rdi
+
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+
+ movq %r12,72(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+
+ leaq 24(%rbp),%rbp
+ movq 88(%rsp),%r13
+ movq 64(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r10
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 24(%rsp),%r12
+
+ addq 80(%rsp),%r12
+ movq %rcx,%r13
+ addq %r15,%r12
+ movq %r10,%r14
+ rorq $23,%r13
+ movq %rdx,%r15
+
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+
+ movq %r12,80(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+
+ leaq 8(%rbp),%rbp
+ movq 96(%rsp),%r13
+ movq 72(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r9
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 32(%rsp),%r12
+
+ addq 88(%rsp),%r12
+ movq %rbx,%r13
+ addq %rdi,%r12
+ movq %r9,%r14
+ rorq $23,%r13
+ movq %rcx,%rdi
+
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+
+ movq %r12,88(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+
+ leaq 24(%rbp),%rbp
+ movq 104(%rsp),%r13
+ movq 80(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r8
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 40(%rsp),%r12
+
+ addq 96(%rsp),%r12
+ movq %rax,%r13
+ addq %r15,%r12
+ movq %r8,%r14
+ rorq $23,%r13
+ movq %rbx,%r15
+
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+
+ movq %r12,96(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+
+ leaq 8(%rbp),%rbp
+ movq 112(%rsp),%r13
+ movq 88(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rdx
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 48(%rsp),%r12
+
+ addq 104(%rsp),%r12
+ movq %r11,%r13
+ addq %rdi,%r12
+ movq %rdx,%r14
+ rorq $23,%r13
+ movq %rax,%rdi
+
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+
+ movq %r12,104(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+
+ leaq 24(%rbp),%rbp
+ movq 120(%rsp),%r13
+ movq 96(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rcx
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 56(%rsp),%r12
+
+ addq 112(%rsp),%r12
+ movq %r10,%r13
+ addq %r15,%r12
+ movq %rcx,%r14
+ rorq $23,%r13
+ movq %r11,%r15
+
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+
+ movq %r12,112(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+
+ leaq 8(%rbp),%rbp
+ movq 0(%rsp),%r13
+ movq 104(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rbx
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 64(%rsp),%r12
+
+ addq 120(%rsp),%r12
+ movq %r9,%r13
+ addq %rdi,%r12
+ movq %rbx,%r14
+ rorq $23,%r13
+ movq %r10,%rdi
+
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+
+ movq %r12,120(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+
+ leaq 24(%rbp),%rbp
+ cmpb $0,7(%rbp)
+ jnz .Lrounds_16_xx
+
+ movq 128+0(%rsp),%rdi
+ addq %r14,%rax
+ leaq 128(%rsi),%rsi
+
+ addq 0(%rdi),%rax
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ cmpq 128+16(%rsp),%rsi
+
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+ jb .Lloop
+
+ movq 152(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue:
+ ret
+.cfi_endproc
+.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw
+.section .rodata
+.align 64
+.type K512,@object
+K512:
+.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad 0x3956c25bf348b538,0x59f111f1b605d019
+.quad 0x3956c25bf348b538,0x59f111f1b605d019
+.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad 0xd807aa98a3030242,0x12835b0145706fbe
+.quad 0xd807aa98a3030242,0x12835b0145706fbe
+.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad 0x06ca6351e003826f,0x142929670a0e6e70
+.quad 0x06ca6351e003826f,0x142929670a0e6e70
+.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad 0x81c2c92e47edaee6,0x92722c851482353b
+.quad 0x81c2c92e47edaee6,0x92722c851482353b
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad 0xd192e819d6ef5218,0xd69906245565a910
+.quad 0xd192e819d6ef5218,0xd69906245565a910
+.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad 0x90befffa23631e28,0xa4506cebde82bde9
+.quad 0x90befffa23631e28,0xa4506cebde82bde9
+.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad 0xca273eceea26619c,0xd186b8c721c0c207
+.quad 0xca273eceea26619c,0xd186b8c721c0c207
+.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad 0x113f9804bef90dae,0x1b710b35131c471b
+.quad 0x113f9804bef90dae,0x1b710b35131c471b
+.quad 0x28db77f523047d84,0x32caab7b40c72493
+.quad 0x28db77f523047d84,0x32caab7b40c72493
+.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+.quad 0x0001020304050607,0x08090a0b0c0d0e0f
+.quad 0x0001020304050607,0x08090a0b0c0d0e0f
+.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.text
+.globl sha512_block_data_order_avx
+.hidden sha512_block_data_order_avx
+.type sha512_block_data_order_avx,@function
+.align 64
+sha512_block_data_order_avx:
+.cfi_startproc
+_CET_ENDBR
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $160,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ andq $-64,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %rax,152(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue_avx:
+
+ vzeroupper
+ movq 0(%rdi),%rax
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%rcx
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp .Lloop_avx
+.align 16
+.Lloop_avx:
+ vmovdqa K512+1280(%rip),%xmm11
+ vmovdqu 0(%rsi),%xmm0
+ leaq K512+128(%rip),%rbp
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vpshufb %xmm11,%xmm0,%xmm0
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm11,%xmm1,%xmm1
+ vmovdqu 64(%rsi),%xmm4
+ vpshufb %xmm11,%xmm2,%xmm2
+ vmovdqu 80(%rsi),%xmm5
+ vpshufb %xmm11,%xmm3,%xmm3
+ vmovdqu 96(%rsi),%xmm6
+ vpshufb %xmm11,%xmm4,%xmm4
+ vmovdqu 112(%rsi),%xmm7
+ vpshufb %xmm11,%xmm5,%xmm5
+ vpaddq -128(%rbp),%xmm0,%xmm8
+ vpshufb %xmm11,%xmm6,%xmm6
+ vpaddq -96(%rbp),%xmm1,%xmm9
+ vpshufb %xmm11,%xmm7,%xmm7
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ vpaddq -32(%rbp),%xmm3,%xmm11
+ vmovdqa %xmm8,0(%rsp)
+ vpaddq 0(%rbp),%xmm4,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ vpaddq 32(%rbp),%xmm5,%xmm9
+ vmovdqa %xmm10,32(%rsp)
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ vmovdqa %xmm11,48(%rsp)
+ vpaddq 96(%rbp),%xmm7,%xmm11
+ vmovdqa %xmm8,64(%rsp)
+ movq %rax,%r14
+ vmovdqa %xmm9,80(%rsp)
+ movq %rbx,%rdi
+ vmovdqa %xmm10,96(%rsp)
+ xorq %rcx,%rdi
+ vmovdqa %xmm11,112(%rsp)
+ movq %r8,%r13
+ jmp .Lavx_00_47
+
+.align 16
+.Lavx_00_47:
+ addq $256,%rbp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm4,%xmm5,%xmm11
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpaddq %xmm11,%xmm0,%xmm0
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r8,%r12
+ xorq %r8,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rbx,%r15
+ addq %r12,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm7,%xmm11
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rdx,%r13
+ addq %r11,%r14
+ vpsllq $3,%xmm7,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ vpaddq %xmm8,%xmm0,%xmm0
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm7,%xmm9
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rax,%rdi
+ addq %r12,%r10
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm0,%xmm0
+ xorq %r11,%r14
+ addq %r13,%r10
+ vpaddq -128(%rbp),%xmm0,%xmm10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,0(%rsp)
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm5,%xmm6,%xmm11
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpaddq %xmm11,%xmm1,%xmm1
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r11,%r15
+ addq %r12,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm0,%xmm11
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rbx,%r13
+ addq %r9,%r14
+ vpsllq $3,%xmm0,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ vpaddq %xmm8,%xmm1,%xmm1
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm0,%xmm9
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r10,%rdi
+ addq %r12,%r8
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm1,%xmm1
+ xorq %r9,%r14
+ addq %r13,%r8
+ vpaddq -96(%rbp),%xmm1,%xmm10
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,16(%rsp)
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm6,%xmm7,%xmm11
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpaddq %xmm11,%xmm2,%xmm2
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rax,%r12
+ xorq %rax,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r9,%r15
+ addq %r12,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm1,%xmm11
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r11,%r13
+ addq %rdx,%r14
+ vpsllq $3,%xmm1,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ vpaddq %xmm8,%xmm2,%xmm2
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm1,%xmm9
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm2,%xmm2
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,32(%rsp)
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm7,%xmm0,%xmm11
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpaddq %xmm11,%xmm3,%xmm3
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r10,%r12
+ xorq %r10,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm2,%xmm11
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r9,%r13
+ addq %rbx,%r14
+ vpsllq $3,%xmm2,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ vpaddq %xmm8,%xmm3,%xmm3
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm2,%xmm9
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm3,%xmm3
+ xorq %rbx,%r14
+ addq %r13,%rax
+ vpaddq -32(%rbp),%xmm3,%xmm10
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,48(%rsp)
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm0,%xmm1,%xmm11
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpaddq %xmm11,%xmm4,%xmm4
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r8,%r12
+ xorq %r8,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rbx,%r15
+ addq %r12,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm3,%xmm11
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rdx,%r13
+ addq %r11,%r14
+ vpsllq $3,%xmm3,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ vpaddq %xmm8,%xmm4,%xmm4
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm3,%xmm9
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rax,%rdi
+ addq %r12,%r10
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm4,%xmm4
+ xorq %r11,%r14
+ addq %r13,%r10
+ vpaddq 0(%rbp),%xmm4,%xmm10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,64(%rsp)
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm1,%xmm2,%xmm11
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpaddq %xmm11,%xmm5,%xmm5
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r11,%r15
+ addq %r12,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm4,%xmm11
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rbx,%r13
+ addq %r9,%r14
+ vpsllq $3,%xmm4,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ vpaddq %xmm8,%xmm5,%xmm5
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm4,%xmm9
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r10,%rdi
+ addq %r12,%r8
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm5,%xmm5
+ xorq %r9,%r14
+ addq %r13,%r8
+ vpaddq 32(%rbp),%xmm5,%xmm10
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,80(%rsp)
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm2,%xmm3,%xmm11
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpaddq %xmm11,%xmm6,%xmm6
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rax,%r12
+ xorq %rax,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r9,%r15
+ addq %r12,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm5,%xmm11
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r11,%r13
+ addq %rdx,%r14
+ vpsllq $3,%xmm5,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ vpaddq %xmm8,%xmm6,%xmm6
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm5,%xmm9
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm6,%xmm6
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,96(%rsp)
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm3,%xmm4,%xmm11
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpaddq %xmm11,%xmm7,%xmm7
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r10,%r12
+ xorq %r10,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm6,%xmm11
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r9,%r13
+ addq %rbx,%r14
+ vpsllq $3,%xmm6,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ vpaddq %xmm8,%xmm7,%xmm7
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm6,%xmm9
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm7,%xmm7
+ xorq %rbx,%r14
+ addq %r13,%rax
+ vpaddq 96(%rbp),%xmm7,%xmm10
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,112(%rsp)
+ cmpb $0,135(%rbp)
+ jne .Lavx_00_47
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ movq 128+0(%rsp),%rdi
+ movq %r14,%rax
+
+ addq 0(%rdi),%rax
+ leaq 128(%rsi),%rsi
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ cmpq 128+16(%rsp),%rsi
+
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+ jb .Lloop_avx
+
+ movq 152(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ vzeroupper
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ ret
+.cfi_endproc
+.size sha512_block_data_order_avx,.-sha512_block_data_order_avx
+#endif
diff --git a/gen/bcm/sha512-x86_64-win.asm b/gen/bcm/sha512-x86_64-win.asm
new file mode 100644
index 0000000..3b02e03
--- /dev/null
+++ b/gen/bcm/sha512-x86_64-win.asm
@@ -0,0 +1,3140 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .text code align=64
+
+
+global sha512_block_data_order_nohw
+
+ALIGN 16
+sha512_block_data_order_nohw:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha512_block_data_order_nohw:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ mov rax,rsp
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ shl rdx,4
+ sub rsp,16*8+4*8
+ lea rdx,[rdx*8+rsi]
+ and rsp,-64
+ mov QWORD[((128+0))+rsp],rdi
+ mov QWORD[((128+8))+rsp],rsi
+ mov QWORD[((128+16))+rsp],rdx
+ mov QWORD[152+rsp],rax
+
+$L$prologue:
+
+ mov rax,QWORD[rdi]
+ mov rbx,QWORD[8+rdi]
+ mov rcx,QWORD[16+rdi]
+ mov rdx,QWORD[24+rdi]
+ mov r8,QWORD[32+rdi]
+ mov r9,QWORD[40+rdi]
+ mov r10,QWORD[48+rdi]
+ mov r11,QWORD[56+rdi]
+ jmp NEAR $L$loop
+
+ALIGN 16
+$L$loop:
+ mov rdi,rbx
+ lea rbp,[K512]
+ xor rdi,rcx
+ mov r12,QWORD[rsi]
+ mov r13,r8
+ mov r14,rax
+ bswap r12
+ ror r13,23
+ mov r15,r9
+
+ xor r13,r8
+ ror r14,5
+ xor r15,r10
+
+ mov QWORD[rsp],r12
+ xor r14,rax
+ and r15,r8
+
+ ror r13,4
+ add r12,r11
+ xor r15,r10
+
+ ror r14,6
+ xor r13,r8
+ add r12,r15
+
+ mov r15,rax
+ add r12,QWORD[rbp]
+ xor r14,rax
+
+ xor r15,rbx
+ ror r13,14
+ mov r11,rbx
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor r11,rdi
+ add rdx,r12
+ add r11,r12
+
+ lea rbp,[8+rbp]
+ add r11,r14
+ mov r12,QWORD[8+rsi]
+ mov r13,rdx
+ mov r14,r11
+ bswap r12
+ ror r13,23
+ mov rdi,r8
+
+ xor r13,rdx
+ ror r14,5
+ xor rdi,r9
+
+ mov QWORD[8+rsp],r12
+ xor r14,r11
+ and rdi,rdx
+
+ ror r13,4
+ add r12,r10
+ xor rdi,r9
+
+ ror r14,6
+ xor r13,rdx
+ add r12,rdi
+
+ mov rdi,r11
+ add r12,QWORD[rbp]
+ xor r14,r11
+
+ xor rdi,rax
+ ror r13,14
+ mov r10,rax
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor r10,r15
+ add rcx,r12
+ add r10,r12
+
+ lea rbp,[24+rbp]
+ add r10,r14
+ mov r12,QWORD[16+rsi]
+ mov r13,rcx
+ mov r14,r10
+ bswap r12
+ ror r13,23
+ mov r15,rdx
+
+ xor r13,rcx
+ ror r14,5
+ xor r15,r8
+
+ mov QWORD[16+rsp],r12
+ xor r14,r10
+ and r15,rcx
+
+ ror r13,4
+ add r12,r9
+ xor r15,r8
+
+ ror r14,6
+ xor r13,rcx
+ add r12,r15
+
+ mov r15,r10
+ add r12,QWORD[rbp]
+ xor r14,r10
+
+ xor r15,r11
+ ror r13,14
+ mov r9,r11
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor r9,rdi
+ add rbx,r12
+ add r9,r12
+
+ lea rbp,[8+rbp]
+ add r9,r14
+ mov r12,QWORD[24+rsi]
+ mov r13,rbx
+ mov r14,r9
+ bswap r12
+ ror r13,23
+ mov rdi,rcx
+
+ xor r13,rbx
+ ror r14,5
+ xor rdi,rdx
+
+ mov QWORD[24+rsp],r12
+ xor r14,r9
+ and rdi,rbx
+
+ ror r13,4
+ add r12,r8
+ xor rdi,rdx
+
+ ror r14,6
+ xor r13,rbx
+ add r12,rdi
+
+ mov rdi,r9
+ add r12,QWORD[rbp]
+ xor r14,r9
+
+ xor rdi,r10
+ ror r13,14
+ mov r8,r10
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor r8,r15
+ add rax,r12
+ add r8,r12
+
+ lea rbp,[24+rbp]
+ add r8,r14
+ mov r12,QWORD[32+rsi]
+ mov r13,rax
+ mov r14,r8
+ bswap r12
+ ror r13,23
+ mov r15,rbx
+
+ xor r13,rax
+ ror r14,5
+ xor r15,rcx
+
+ mov QWORD[32+rsp],r12
+ xor r14,r8
+ and r15,rax
+
+ ror r13,4
+ add r12,rdx
+ xor r15,rcx
+
+ ror r14,6
+ xor r13,rax
+ add r12,r15
+
+ mov r15,r8
+ add r12,QWORD[rbp]
+ xor r14,r8
+
+ xor r15,r9
+ ror r13,14
+ mov rdx,r9
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor rdx,rdi
+ add r11,r12
+ add rdx,r12
+
+ lea rbp,[8+rbp]
+ add rdx,r14
+ mov r12,QWORD[40+rsi]
+ mov r13,r11
+ mov r14,rdx
+ bswap r12
+ ror r13,23
+ mov rdi,rax
+
+ xor r13,r11
+ ror r14,5
+ xor rdi,rbx
+
+ mov QWORD[40+rsp],r12
+ xor r14,rdx
+ and rdi,r11
+
+ ror r13,4
+ add r12,rcx
+ xor rdi,rbx
+
+ ror r14,6
+ xor r13,r11
+ add r12,rdi
+
+ mov rdi,rdx
+ add r12,QWORD[rbp]
+ xor r14,rdx
+
+ xor rdi,r8
+ ror r13,14
+ mov rcx,r8
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor rcx,r15
+ add r10,r12
+ add rcx,r12
+
+ lea rbp,[24+rbp]
+ add rcx,r14
+ mov r12,QWORD[48+rsi]
+ mov r13,r10
+ mov r14,rcx
+ bswap r12
+ ror r13,23
+ mov r15,r11
+
+ xor r13,r10
+ ror r14,5
+ xor r15,rax
+
+ mov QWORD[48+rsp],r12
+ xor r14,rcx
+ and r15,r10
+
+ ror r13,4
+ add r12,rbx
+ xor r15,rax
+
+ ror r14,6
+ xor r13,r10
+ add r12,r15
+
+ mov r15,rcx
+ add r12,QWORD[rbp]
+ xor r14,rcx
+
+ xor r15,rdx
+ ror r13,14
+ mov rbx,rdx
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor rbx,rdi
+ add r9,r12
+ add rbx,r12
+
+ lea rbp,[8+rbp]
+ add rbx,r14
+ mov r12,QWORD[56+rsi]
+ mov r13,r9
+ mov r14,rbx
+ bswap r12
+ ror r13,23
+ mov rdi,r10
+
+ xor r13,r9
+ ror r14,5
+ xor rdi,r11
+
+ mov QWORD[56+rsp],r12
+ xor r14,rbx
+ and rdi,r9
+
+ ror r13,4
+ add r12,rax
+ xor rdi,r11
+
+ ror r14,6
+ xor r13,r9
+ add r12,rdi
+
+ mov rdi,rbx
+ add r12,QWORD[rbp]
+ xor r14,rbx
+
+ xor rdi,rcx
+ ror r13,14
+ mov rax,rcx
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor rax,r15
+ add r8,r12
+ add rax,r12
+
+ lea rbp,[24+rbp]
+ add rax,r14
+ mov r12,QWORD[64+rsi]
+ mov r13,r8
+ mov r14,rax
+ bswap r12
+ ror r13,23
+ mov r15,r9
+
+ xor r13,r8
+ ror r14,5
+ xor r15,r10
+
+ mov QWORD[64+rsp],r12
+ xor r14,rax
+ and r15,r8
+
+ ror r13,4
+ add r12,r11
+ xor r15,r10
+
+ ror r14,6
+ xor r13,r8
+ add r12,r15
+
+ mov r15,rax
+ add r12,QWORD[rbp]
+ xor r14,rax
+
+ xor r15,rbx
+ ror r13,14
+ mov r11,rbx
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor r11,rdi
+ add rdx,r12
+ add r11,r12
+
+ lea rbp,[8+rbp]
+ add r11,r14
+ mov r12,QWORD[72+rsi]
+ mov r13,rdx
+ mov r14,r11
+ bswap r12
+ ror r13,23
+ mov rdi,r8
+
+ xor r13,rdx
+ ror r14,5
+ xor rdi,r9
+
+ mov QWORD[72+rsp],r12
+ xor r14,r11
+ and rdi,rdx
+
+ ror r13,4
+ add r12,r10
+ xor rdi,r9
+
+ ror r14,6
+ xor r13,rdx
+ add r12,rdi
+
+ mov rdi,r11
+ add r12,QWORD[rbp]
+ xor r14,r11
+
+ xor rdi,rax
+ ror r13,14
+ mov r10,rax
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor r10,r15
+ add rcx,r12
+ add r10,r12
+
+ lea rbp,[24+rbp]
+ add r10,r14
+ mov r12,QWORD[80+rsi]
+ mov r13,rcx
+ mov r14,r10
+ bswap r12
+ ror r13,23
+ mov r15,rdx
+
+ xor r13,rcx
+ ror r14,5
+ xor r15,r8
+
+ mov QWORD[80+rsp],r12
+ xor r14,r10
+ and r15,rcx
+
+ ror r13,4
+ add r12,r9
+ xor r15,r8
+
+ ror r14,6
+ xor r13,rcx
+ add r12,r15
+
+ mov r15,r10
+ add r12,QWORD[rbp]
+ xor r14,r10
+
+ xor r15,r11
+ ror r13,14
+ mov r9,r11
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor r9,rdi
+ add rbx,r12
+ add r9,r12
+
+ lea rbp,[8+rbp]
+ add r9,r14
+ mov r12,QWORD[88+rsi]
+ mov r13,rbx
+ mov r14,r9
+ bswap r12
+ ror r13,23
+ mov rdi,rcx
+
+ xor r13,rbx
+ ror r14,5
+ xor rdi,rdx
+
+ mov QWORD[88+rsp],r12
+ xor r14,r9
+ and rdi,rbx
+
+ ror r13,4
+ add r12,r8
+ xor rdi,rdx
+
+ ror r14,6
+ xor r13,rbx
+ add r12,rdi
+
+ mov rdi,r9
+ add r12,QWORD[rbp]
+ xor r14,r9
+
+ xor rdi,r10
+ ror r13,14
+ mov r8,r10
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor r8,r15
+ add rax,r12
+ add r8,r12
+
+ lea rbp,[24+rbp]
+ add r8,r14
+ mov r12,QWORD[96+rsi]
+ mov r13,rax
+ mov r14,r8
+ bswap r12
+ ror r13,23
+ mov r15,rbx
+
+ xor r13,rax
+ ror r14,5
+ xor r15,rcx
+
+ mov QWORD[96+rsp],r12
+ xor r14,r8
+ and r15,rax
+
+ ror r13,4
+ add r12,rdx
+ xor r15,rcx
+
+ ror r14,6
+ xor r13,rax
+ add r12,r15
+
+ mov r15,r8
+ add r12,QWORD[rbp]
+ xor r14,r8
+
+ xor r15,r9
+ ror r13,14
+ mov rdx,r9
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor rdx,rdi
+ add r11,r12
+ add rdx,r12
+
+ lea rbp,[8+rbp]
+ add rdx,r14
+ mov r12,QWORD[104+rsi]
+ mov r13,r11
+ mov r14,rdx
+ bswap r12
+ ror r13,23
+ mov rdi,rax
+
+ xor r13,r11
+ ror r14,5
+ xor rdi,rbx
+
+ mov QWORD[104+rsp],r12
+ xor r14,rdx
+ and rdi,r11
+
+ ror r13,4
+ add r12,rcx
+ xor rdi,rbx
+
+ ror r14,6
+ xor r13,r11
+ add r12,rdi
+
+ mov rdi,rdx
+ add r12,QWORD[rbp]
+ xor r14,rdx
+
+ xor rdi,r8
+ ror r13,14
+ mov rcx,r8
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor rcx,r15
+ add r10,r12
+ add rcx,r12
+
+ lea rbp,[24+rbp]
+ add rcx,r14
+ mov r12,QWORD[112+rsi]
+ mov r13,r10
+ mov r14,rcx
+ bswap r12
+ ror r13,23
+ mov r15,r11
+
+ xor r13,r10
+ ror r14,5
+ xor r15,rax
+
+ mov QWORD[112+rsp],r12
+ xor r14,rcx
+ and r15,r10
+
+ ror r13,4
+ add r12,rbx
+ xor r15,rax
+
+ ror r14,6
+ xor r13,r10
+ add r12,r15
+
+ mov r15,rcx
+ add r12,QWORD[rbp]
+ xor r14,rcx
+
+ xor r15,rdx
+ ror r13,14
+ mov rbx,rdx
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor rbx,rdi
+ add r9,r12
+ add rbx,r12
+
+ lea rbp,[8+rbp]
+ add rbx,r14
+ mov r12,QWORD[120+rsi]
+ mov r13,r9
+ mov r14,rbx
+ bswap r12
+ ror r13,23
+ mov rdi,r10
+
+ xor r13,r9
+ ror r14,5
+ xor rdi,r11
+
+ mov QWORD[120+rsp],r12
+ xor r14,rbx
+ and rdi,r9
+
+ ror r13,4
+ add r12,rax
+ xor rdi,r11
+
+ ror r14,6
+ xor r13,r9
+ add r12,rdi
+
+ mov rdi,rbx
+ add r12,QWORD[rbp]
+ xor r14,rbx
+
+ xor rdi,rcx
+ ror r13,14
+ mov rax,rcx
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor rax,r15
+ add r8,r12
+ add rax,r12
+
+ lea rbp,[24+rbp]
+ jmp NEAR $L$rounds_16_xx
+ALIGN 16
+$L$rounds_16_xx:
+ mov r13,QWORD[8+rsp]
+ mov r15,QWORD[112+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add rax,r14
+ mov r14,r15
+ ror r15,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor r15,r14
+ shr r14,6
+
+ ror r15,19
+ xor r12,r13
+ xor r15,r14
+ add r12,QWORD[72+rsp]
+
+ add r12,QWORD[rsp]
+ mov r13,r8
+ add r12,r15
+ mov r14,rax
+ ror r13,23
+ mov r15,r9
+
+ xor r13,r8
+ ror r14,5
+ xor r15,r10
+
+ mov QWORD[rsp],r12
+ xor r14,rax
+ and r15,r8
+
+ ror r13,4
+ add r12,r11
+ xor r15,r10
+
+ ror r14,6
+ xor r13,r8
+ add r12,r15
+
+ mov r15,rax
+ add r12,QWORD[rbp]
+ xor r14,rax
+
+ xor r15,rbx
+ ror r13,14
+ mov r11,rbx
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor r11,rdi
+ add rdx,r12
+ add r11,r12
+
+ lea rbp,[8+rbp]
+ mov r13,QWORD[16+rsp]
+ mov rdi,QWORD[120+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add r11,r14
+ mov r14,rdi
+ ror rdi,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor rdi,r14
+ shr r14,6
+
+ ror rdi,19
+ xor r12,r13
+ xor rdi,r14
+ add r12,QWORD[80+rsp]
+
+ add r12,QWORD[8+rsp]
+ mov r13,rdx
+ add r12,rdi
+ mov r14,r11
+ ror r13,23
+ mov rdi,r8
+
+ xor r13,rdx
+ ror r14,5
+ xor rdi,r9
+
+ mov QWORD[8+rsp],r12
+ xor r14,r11
+ and rdi,rdx
+
+ ror r13,4
+ add r12,r10
+ xor rdi,r9
+
+ ror r14,6
+ xor r13,rdx
+ add r12,rdi
+
+ mov rdi,r11
+ add r12,QWORD[rbp]
+ xor r14,r11
+
+ xor rdi,rax
+ ror r13,14
+ mov r10,rax
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor r10,r15
+ add rcx,r12
+ add r10,r12
+
+ lea rbp,[24+rbp]
+ mov r13,QWORD[24+rsp]
+ mov r15,QWORD[rsp]
+
+ mov r12,r13
+ ror r13,7
+ add r10,r14
+ mov r14,r15
+ ror r15,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor r15,r14
+ shr r14,6
+
+ ror r15,19
+ xor r12,r13
+ xor r15,r14
+ add r12,QWORD[88+rsp]
+
+ add r12,QWORD[16+rsp]
+ mov r13,rcx
+ add r12,r15
+ mov r14,r10
+ ror r13,23
+ mov r15,rdx
+
+ xor r13,rcx
+ ror r14,5
+ xor r15,r8
+
+ mov QWORD[16+rsp],r12
+ xor r14,r10
+ and r15,rcx
+
+ ror r13,4
+ add r12,r9
+ xor r15,r8
+
+ ror r14,6
+ xor r13,rcx
+ add r12,r15
+
+ mov r15,r10
+ add r12,QWORD[rbp]
+ xor r14,r10
+
+ xor r15,r11
+ ror r13,14
+ mov r9,r11
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor r9,rdi
+ add rbx,r12
+ add r9,r12
+
+ lea rbp,[8+rbp]
+ mov r13,QWORD[32+rsp]
+ mov rdi,QWORD[8+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add r9,r14
+ mov r14,rdi
+ ror rdi,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor rdi,r14
+ shr r14,6
+
+ ror rdi,19
+ xor r12,r13
+ xor rdi,r14
+ add r12,QWORD[96+rsp]
+
+ add r12,QWORD[24+rsp]
+ mov r13,rbx
+ add r12,rdi
+ mov r14,r9
+ ror r13,23
+ mov rdi,rcx
+
+ xor r13,rbx
+ ror r14,5
+ xor rdi,rdx
+
+ mov QWORD[24+rsp],r12
+ xor r14,r9
+ and rdi,rbx
+
+ ror r13,4
+ add r12,r8
+ xor rdi,rdx
+
+ ror r14,6
+ xor r13,rbx
+ add r12,rdi
+
+ mov rdi,r9
+ add r12,QWORD[rbp]
+ xor r14,r9
+
+ xor rdi,r10
+ ror r13,14
+ mov r8,r10
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor r8,r15
+ add rax,r12
+ add r8,r12
+
+ lea rbp,[24+rbp]
+ mov r13,QWORD[40+rsp]
+ mov r15,QWORD[16+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add r8,r14
+ mov r14,r15
+ ror r15,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor r15,r14
+ shr r14,6
+
+ ror r15,19
+ xor r12,r13
+ xor r15,r14
+ add r12,QWORD[104+rsp]
+
+ add r12,QWORD[32+rsp]
+ mov r13,rax
+ add r12,r15
+ mov r14,r8
+ ror r13,23
+ mov r15,rbx
+
+ xor r13,rax
+ ror r14,5
+ xor r15,rcx
+
+ mov QWORD[32+rsp],r12
+ xor r14,r8
+ and r15,rax
+
+ ror r13,4
+ add r12,rdx
+ xor r15,rcx
+
+ ror r14,6
+ xor r13,rax
+ add r12,r15
+
+ mov r15,r8
+ add r12,QWORD[rbp]
+ xor r14,r8
+
+ xor r15,r9
+ ror r13,14
+ mov rdx,r9
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor rdx,rdi
+ add r11,r12
+ add rdx,r12
+
+ lea rbp,[8+rbp]
+ mov r13,QWORD[48+rsp]
+ mov rdi,QWORD[24+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add rdx,r14
+ mov r14,rdi
+ ror rdi,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor rdi,r14
+ shr r14,6
+
+ ror rdi,19
+ xor r12,r13
+ xor rdi,r14
+ add r12,QWORD[112+rsp]
+
+ add r12,QWORD[40+rsp]
+ mov r13,r11
+ add r12,rdi
+ mov r14,rdx
+ ror r13,23
+ mov rdi,rax
+
+ xor r13,r11
+ ror r14,5
+ xor rdi,rbx
+
+ mov QWORD[40+rsp],r12
+ xor r14,rdx
+ and rdi,r11
+
+ ror r13,4
+ add r12,rcx
+ xor rdi,rbx
+
+ ror r14,6
+ xor r13,r11
+ add r12,rdi
+
+ mov rdi,rdx
+ add r12,QWORD[rbp]
+ xor r14,rdx
+
+ xor rdi,r8
+ ror r13,14
+ mov rcx,r8
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor rcx,r15
+ add r10,r12
+ add rcx,r12
+
+ lea rbp,[24+rbp]
+ mov r13,QWORD[56+rsp]
+ mov r15,QWORD[32+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add rcx,r14
+ mov r14,r15
+ ror r15,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor r15,r14
+ shr r14,6
+
+ ror r15,19
+ xor r12,r13
+ xor r15,r14
+ add r12,QWORD[120+rsp]
+
+ add r12,QWORD[48+rsp]
+ mov r13,r10
+ add r12,r15
+ mov r14,rcx
+ ror r13,23
+ mov r15,r11
+
+ xor r13,r10
+ ror r14,5
+ xor r15,rax
+
+ mov QWORD[48+rsp],r12
+ xor r14,rcx
+ and r15,r10
+
+ ror r13,4
+ add r12,rbx
+ xor r15,rax
+
+ ror r14,6
+ xor r13,r10
+ add r12,r15
+
+ mov r15,rcx
+ add r12,QWORD[rbp]
+ xor r14,rcx
+
+ xor r15,rdx
+ ror r13,14
+ mov rbx,rdx
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor rbx,rdi
+ add r9,r12
+ add rbx,r12
+
+ lea rbp,[8+rbp]
+ mov r13,QWORD[64+rsp]
+ mov rdi,QWORD[40+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add rbx,r14
+ mov r14,rdi
+ ror rdi,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor rdi,r14
+ shr r14,6
+
+ ror rdi,19
+ xor r12,r13
+ xor rdi,r14
+ add r12,QWORD[rsp]
+
+ add r12,QWORD[56+rsp]
+ mov r13,r9
+ add r12,rdi
+ mov r14,rbx
+ ror r13,23
+ mov rdi,r10
+
+ xor r13,r9
+ ror r14,5
+ xor rdi,r11
+
+ mov QWORD[56+rsp],r12
+ xor r14,rbx
+ and rdi,r9
+
+ ror r13,4
+ add r12,rax
+ xor rdi,r11
+
+ ror r14,6
+ xor r13,r9
+ add r12,rdi
+
+ mov rdi,rbx
+ add r12,QWORD[rbp]
+ xor r14,rbx
+
+ xor rdi,rcx
+ ror r13,14
+ mov rax,rcx
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor rax,r15
+ add r8,r12
+ add rax,r12
+
+ lea rbp,[24+rbp]
+ mov r13,QWORD[72+rsp]
+ mov r15,QWORD[48+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add rax,r14
+ mov r14,r15
+ ror r15,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor r15,r14
+ shr r14,6
+
+ ror r15,19
+ xor r12,r13
+ xor r15,r14
+ add r12,QWORD[8+rsp]
+
+ add r12,QWORD[64+rsp]
+ mov r13,r8
+ add r12,r15
+ mov r14,rax
+ ror r13,23
+ mov r15,r9
+
+ xor r13,r8
+ ror r14,5
+ xor r15,r10
+
+ mov QWORD[64+rsp],r12
+ xor r14,rax
+ and r15,r8
+
+ ror r13,4
+ add r12,r11
+ xor r15,r10
+
+ ror r14,6
+ xor r13,r8
+ add r12,r15
+
+ mov r15,rax
+ add r12,QWORD[rbp]
+ xor r14,rax
+
+ xor r15,rbx
+ ror r13,14
+ mov r11,rbx
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor r11,rdi
+ add rdx,r12
+ add r11,r12
+
+ lea rbp,[8+rbp]
+ mov r13,QWORD[80+rsp]
+ mov rdi,QWORD[56+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add r11,r14
+ mov r14,rdi
+ ror rdi,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor rdi,r14
+ shr r14,6
+
+ ror rdi,19
+ xor r12,r13
+ xor rdi,r14
+ add r12,QWORD[16+rsp]
+
+ add r12,QWORD[72+rsp]
+ mov r13,rdx
+ add r12,rdi
+ mov r14,r11
+ ror r13,23
+ mov rdi,r8
+
+ xor r13,rdx
+ ror r14,5
+ xor rdi,r9
+
+ mov QWORD[72+rsp],r12
+ xor r14,r11
+ and rdi,rdx
+
+ ror r13,4
+ add r12,r10
+ xor rdi,r9
+
+ ror r14,6
+ xor r13,rdx
+ add r12,rdi
+
+ mov rdi,r11
+ add r12,QWORD[rbp]
+ xor r14,r11
+
+ xor rdi,rax
+ ror r13,14
+ mov r10,rax
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor r10,r15
+ add rcx,r12
+ add r10,r12
+
+ lea rbp,[24+rbp]
+ mov r13,QWORD[88+rsp]
+ mov r15,QWORD[64+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add r10,r14
+ mov r14,r15
+ ror r15,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor r15,r14
+ shr r14,6
+
+ ror r15,19
+ xor r12,r13
+ xor r15,r14
+ add r12,QWORD[24+rsp]
+
+ add r12,QWORD[80+rsp]
+ mov r13,rcx
+ add r12,r15
+ mov r14,r10
+ ror r13,23
+ mov r15,rdx
+
+ xor r13,rcx
+ ror r14,5
+ xor r15,r8
+
+ mov QWORD[80+rsp],r12
+ xor r14,r10
+ and r15,rcx
+
+ ror r13,4
+ add r12,r9
+ xor r15,r8
+
+ ror r14,6
+ xor r13,rcx
+ add r12,r15
+
+ mov r15,r10
+ add r12,QWORD[rbp]
+ xor r14,r10
+
+ xor r15,r11
+ ror r13,14
+ mov r9,r11
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor r9,rdi
+ add rbx,r12
+ add r9,r12
+
+ lea rbp,[8+rbp]
+ mov r13,QWORD[96+rsp]
+ mov rdi,QWORD[72+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add r9,r14
+ mov r14,rdi
+ ror rdi,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor rdi,r14
+ shr r14,6
+
+ ror rdi,19
+ xor r12,r13
+ xor rdi,r14
+ add r12,QWORD[32+rsp]
+
+ add r12,QWORD[88+rsp]
+ mov r13,rbx
+ add r12,rdi
+ mov r14,r9
+ ror r13,23
+ mov rdi,rcx
+
+ xor r13,rbx
+ ror r14,5
+ xor rdi,rdx
+
+ mov QWORD[88+rsp],r12
+ xor r14,r9
+ and rdi,rbx
+
+ ror r13,4
+ add r12,r8
+ xor rdi,rdx
+
+ ror r14,6
+ xor r13,rbx
+ add r12,rdi
+
+ mov rdi,r9
+ add r12,QWORD[rbp]
+ xor r14,r9
+
+ xor rdi,r10
+ ror r13,14
+ mov r8,r10
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor r8,r15
+ add rax,r12
+ add r8,r12
+
+ lea rbp,[24+rbp]
+ mov r13,QWORD[104+rsp]
+ mov r15,QWORD[80+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add r8,r14
+ mov r14,r15
+ ror r15,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor r15,r14
+ shr r14,6
+
+ ror r15,19
+ xor r12,r13
+ xor r15,r14
+ add r12,QWORD[40+rsp]
+
+ add r12,QWORD[96+rsp]
+ mov r13,rax
+ add r12,r15
+ mov r14,r8
+ ror r13,23
+ mov r15,rbx
+
+ xor r13,rax
+ ror r14,5
+ xor r15,rcx
+
+ mov QWORD[96+rsp],r12
+ xor r14,r8
+ and r15,rax
+
+ ror r13,4
+ add r12,rdx
+ xor r15,rcx
+
+ ror r14,6
+ xor r13,rax
+ add r12,r15
+
+ mov r15,r8
+ add r12,QWORD[rbp]
+ xor r14,r8
+
+ xor r15,r9
+ ror r13,14
+ mov rdx,r9
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor rdx,rdi
+ add r11,r12
+ add rdx,r12
+
+ lea rbp,[8+rbp]
+ mov r13,QWORD[112+rsp]
+ mov rdi,QWORD[88+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add rdx,r14
+ mov r14,rdi
+ ror rdi,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor rdi,r14
+ shr r14,6
+
+ ror rdi,19
+ xor r12,r13
+ xor rdi,r14
+ add r12,QWORD[48+rsp]
+
+ add r12,QWORD[104+rsp]
+ mov r13,r11
+ add r12,rdi
+ mov r14,rdx
+ ror r13,23
+ mov rdi,rax
+
+ xor r13,r11
+ ror r14,5
+ xor rdi,rbx
+
+ mov QWORD[104+rsp],r12
+ xor r14,rdx
+ and rdi,r11
+
+ ror r13,4
+ add r12,rcx
+ xor rdi,rbx
+
+ ror r14,6
+ xor r13,r11
+ add r12,rdi
+
+ mov rdi,rdx
+ add r12,QWORD[rbp]
+ xor r14,rdx
+
+ xor rdi,r8
+ ror r13,14
+ mov rcx,r8
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor rcx,r15
+ add r10,r12
+ add rcx,r12
+
+ lea rbp,[24+rbp]
+ mov r13,QWORD[120+rsp]
+ mov r15,QWORD[96+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add rcx,r14
+ mov r14,r15
+ ror r15,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor r15,r14
+ shr r14,6
+
+ ror r15,19
+ xor r12,r13
+ xor r15,r14
+ add r12,QWORD[56+rsp]
+
+ add r12,QWORD[112+rsp]
+ mov r13,r10
+ add r12,r15
+ mov r14,rcx
+ ror r13,23
+ mov r15,r11
+
+ xor r13,r10
+ ror r14,5
+ xor r15,rax
+
+ mov QWORD[112+rsp],r12
+ xor r14,rcx
+ and r15,r10
+
+ ror r13,4
+ add r12,rbx
+ xor r15,rax
+
+ ror r14,6
+ xor r13,r10
+ add r12,r15
+
+ mov r15,rcx
+ add r12,QWORD[rbp]
+ xor r14,rcx
+
+ xor r15,rdx
+ ror r13,14
+ mov rbx,rdx
+
+ and rdi,r15
+ ror r14,28
+ add r12,r13
+
+ xor rbx,rdi
+ add r9,r12
+ add rbx,r12
+
+ lea rbp,[8+rbp]
+ mov r13,QWORD[rsp]
+ mov rdi,QWORD[104+rsp]
+
+ mov r12,r13
+ ror r13,7
+ add rbx,r14
+ mov r14,rdi
+ ror rdi,42
+
+ xor r13,r12
+ shr r12,7
+ ror r13,1
+ xor rdi,r14
+ shr r14,6
+
+ ror rdi,19
+ xor r12,r13
+ xor rdi,r14
+ add r12,QWORD[64+rsp]
+
+ add r12,QWORD[120+rsp]
+ mov r13,r9
+ add r12,rdi
+ mov r14,rbx
+ ror r13,23
+ mov rdi,r10
+
+ xor r13,r9
+ ror r14,5
+ xor rdi,r11
+
+ mov QWORD[120+rsp],r12
+ xor r14,rbx
+ and rdi,r9
+
+ ror r13,4
+ add r12,rax
+ xor rdi,r11
+
+ ror r14,6
+ xor r13,r9
+ add r12,rdi
+
+ mov rdi,rbx
+ add r12,QWORD[rbp]
+ xor r14,rbx
+
+ xor rdi,rcx
+ ror r13,14
+ mov rax,rcx
+
+ and r15,rdi
+ ror r14,28
+ add r12,r13
+
+ xor rax,r15
+ add r8,r12
+ add rax,r12
+
+ lea rbp,[24+rbp]
+ cmp BYTE[7+rbp],0
+ jnz NEAR $L$rounds_16_xx
+
+ mov rdi,QWORD[((128+0))+rsp]
+ add rax,r14
+ lea rsi,[128+rsi]
+
+ add rax,QWORD[rdi]
+ add rbx,QWORD[8+rdi]
+ add rcx,QWORD[16+rdi]
+ add rdx,QWORD[24+rdi]
+ add r8,QWORD[32+rdi]
+ add r9,QWORD[40+rdi]
+ add r10,QWORD[48+rdi]
+ add r11,QWORD[56+rdi]
+
+ cmp rsi,QWORD[((128+16))+rsp]
+
+ mov QWORD[rdi],rax
+ mov QWORD[8+rdi],rbx
+ mov QWORD[16+rdi],rcx
+ mov QWORD[24+rdi],rdx
+ mov QWORD[32+rdi],r8
+ mov QWORD[40+rdi],r9
+ mov QWORD[48+rdi],r10
+ mov QWORD[56+rdi],r11
+ jb NEAR $L$loop
+
+ mov rsi,QWORD[152+rsp]
+
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_sha512_block_data_order_nohw:
+section .rdata rdata align=8
+ALIGN 64
+
+K512:
+ DQ 0x428a2f98d728ae22,0x7137449123ef65cd
+ DQ 0x428a2f98d728ae22,0x7137449123ef65cd
+ DQ 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ DQ 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ DQ 0x3956c25bf348b538,0x59f111f1b605d019
+ DQ 0x3956c25bf348b538,0x59f111f1b605d019
+ DQ 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ DQ 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ DQ 0xd807aa98a3030242,0x12835b0145706fbe
+ DQ 0xd807aa98a3030242,0x12835b0145706fbe
+ DQ 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ DQ 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ DQ 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ DQ 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ DQ 0x9bdc06a725c71235,0xc19bf174cf692694
+ DQ 0x9bdc06a725c71235,0xc19bf174cf692694
+ DQ 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ DQ 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ DQ 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ DQ 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ DQ 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ DQ 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ DQ 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ DQ 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ DQ 0x983e5152ee66dfab,0xa831c66d2db43210
+ DQ 0x983e5152ee66dfab,0xa831c66d2db43210
+ DQ 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ DQ 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ DQ 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ DQ 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ DQ 0x06ca6351e003826f,0x142929670a0e6e70
+ DQ 0x06ca6351e003826f,0x142929670a0e6e70
+ DQ 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ DQ 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ DQ 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ DQ 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ DQ 0x650a73548baf63de,0x766a0abb3c77b2a8
+ DQ 0x650a73548baf63de,0x766a0abb3c77b2a8
+ DQ 0x81c2c92e47edaee6,0x92722c851482353b
+ DQ 0x81c2c92e47edaee6,0x92722c851482353b
+ DQ 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ DQ 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ DQ 0xc24b8b70d0f89791,0xc76c51a30654be30
+ DQ 0xc24b8b70d0f89791,0xc76c51a30654be30
+ DQ 0xd192e819d6ef5218,0xd69906245565a910
+ DQ 0xd192e819d6ef5218,0xd69906245565a910
+ DQ 0xf40e35855771202a,0x106aa07032bbd1b8
+ DQ 0xf40e35855771202a,0x106aa07032bbd1b8
+ DQ 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ DQ 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ DQ 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ DQ 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ DQ 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ DQ 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ DQ 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ DQ 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ DQ 0x748f82ee5defb2fc,0x78a5636f43172f60
+ DQ 0x748f82ee5defb2fc,0x78a5636f43172f60
+ DQ 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ DQ 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ DQ 0x90befffa23631e28,0xa4506cebde82bde9
+ DQ 0x90befffa23631e28,0xa4506cebde82bde9
+ DQ 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ DQ 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ DQ 0xca273eceea26619c,0xd186b8c721c0c207
+ DQ 0xca273eceea26619c,0xd186b8c721c0c207
+ DQ 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ DQ 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ DQ 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ DQ 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ DQ 0x113f9804bef90dae,0x1b710b35131c471b
+ DQ 0x113f9804bef90dae,0x1b710b35131c471b
+ DQ 0x28db77f523047d84,0x32caab7b40c72493
+ DQ 0x28db77f523047d84,0x32caab7b40c72493
+ DQ 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ DQ 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ DQ 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ DQ 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ DQ 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+ DQ 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+ DQ 0x0001020304050607,0x08090a0b0c0d0e0f
+ DQ 0x0001020304050607,0x08090a0b0c0d0e0f
+ DB 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
+ DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54
+ DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+ DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+ DB 111,114,103,62,0
+section .text
+
+global sha512_block_data_order_avx
+
+ALIGN 64
+sha512_block_data_order_avx:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha512_block_data_order_avx:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ mov rax,rsp
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ shl rdx,4
+ sub rsp,256
+ lea rdx,[rdx*8+rsi]
+ and rsp,-64
+ mov QWORD[((128+0))+rsp],rdi
+ mov QWORD[((128+8))+rsp],rsi
+ mov QWORD[((128+16))+rsp],rdx
+ mov QWORD[152+rsp],rax
+
+ movaps XMMWORD[(128+32)+rsp],xmm6
+ movaps XMMWORD[(128+48)+rsp],xmm7
+ movaps XMMWORD[(128+64)+rsp],xmm8
+ movaps XMMWORD[(128+80)+rsp],xmm9
+ movaps XMMWORD[(128+96)+rsp],xmm10
+ movaps XMMWORD[(128+112)+rsp],xmm11
+$L$prologue_avx:
+
+ vzeroupper
+ mov rax,QWORD[rdi]
+ mov rbx,QWORD[8+rdi]
+ mov rcx,QWORD[16+rdi]
+ mov rdx,QWORD[24+rdi]
+ mov r8,QWORD[32+rdi]
+ mov r9,QWORD[40+rdi]
+ mov r10,QWORD[48+rdi]
+ mov r11,QWORD[56+rdi]
+ jmp NEAR $L$loop_avx
+ALIGN 16
+$L$loop_avx:
+ vmovdqa xmm11,XMMWORD[((K512+1280))]
+ vmovdqu xmm0,XMMWORD[rsi]
+ lea rbp,[((K512+128))]
+ vmovdqu xmm1,XMMWORD[16+rsi]
+ vmovdqu xmm2,XMMWORD[32+rsi]
+ vpshufb xmm0,xmm0,xmm11
+ vmovdqu xmm3,XMMWORD[48+rsi]
+ vpshufb xmm1,xmm1,xmm11
+ vmovdqu xmm4,XMMWORD[64+rsi]
+ vpshufb xmm2,xmm2,xmm11
+ vmovdqu xmm5,XMMWORD[80+rsi]
+ vpshufb xmm3,xmm3,xmm11
+ vmovdqu xmm6,XMMWORD[96+rsi]
+ vpshufb xmm4,xmm4,xmm11
+ vmovdqu xmm7,XMMWORD[112+rsi]
+ vpshufb xmm5,xmm5,xmm11
+ vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp]
+ vpshufb xmm6,xmm6,xmm11
+ vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp]
+ vpshufb xmm7,xmm7,xmm11
+ vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
+ vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp]
+ vmovdqa XMMWORD[rsp],xmm8
+ vpaddq xmm8,xmm4,XMMWORD[rbp]
+ vmovdqa XMMWORD[16+rsp],xmm9
+ vpaddq xmm9,xmm5,XMMWORD[32+rbp]
+ vmovdqa XMMWORD[32+rsp],xmm10
+ vpaddq xmm10,xmm6,XMMWORD[64+rbp]
+ vmovdqa XMMWORD[48+rsp],xmm11
+ vpaddq xmm11,xmm7,XMMWORD[96+rbp]
+ vmovdqa XMMWORD[64+rsp],xmm8
+ mov r14,rax
+ vmovdqa XMMWORD[80+rsp],xmm9
+ mov rdi,rbx
+ vmovdqa XMMWORD[96+rsp],xmm10
+ xor rdi,rcx
+ vmovdqa XMMWORD[112+rsp],xmm11
+ mov r13,r8
+ jmp NEAR $L$avx_00_47
+
+ALIGN 16
+$L$avx_00_47:
+ add rbp,256
+ vpalignr xmm8,xmm1,xmm0,8
+ shrd r13,r13,23
+ mov rax,r14
+ vpalignr xmm11,xmm5,xmm4,8
+ mov r12,r9
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,r8
+ xor r12,r10
+ vpaddq xmm0,xmm0,xmm11
+ shrd r13,r13,4
+ xor r14,rax
+ vpsrlq xmm11,xmm8,7
+ and r12,r8
+ xor r13,r8
+ vpsllq xmm9,xmm8,56
+ add r11,QWORD[rsp]
+ mov r15,rax
+ vpxor xmm8,xmm11,xmm10
+ xor r12,r10
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,rbx
+ add r11,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,rax
+ add r11,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rbx
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm7,6
+ add rdx,r11
+ add r11,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,rdx
+ add r14,r11
+ vpsllq xmm10,xmm7,3
+ shrd r13,r13,23
+ mov r11,r14
+ vpaddq xmm0,xmm0,xmm8
+ mov r12,r8
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm7,19
+ xor r13,rdx
+ xor r12,r9
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,r11
+ vpsllq xmm10,xmm10,42
+ and r12,rdx
+ xor r13,rdx
+ vpxor xmm11,xmm11,xmm9
+ add r10,QWORD[8+rsp]
+ mov rdi,r11
+ vpsrlq xmm9,xmm9,42
+ xor r12,r9
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,rax
+ add r10,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm0,xmm0,xmm11
+ xor r14,r11
+ add r10,r13
+ vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp]
+ xor r15,rax
+ shrd r14,r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ vmovdqa XMMWORD[rsp],xmm10
+ vpalignr xmm8,xmm2,xmm1,8
+ shrd r13,r13,23
+ mov r10,r14
+ vpalignr xmm11,xmm6,xmm5,8
+ mov r12,rdx
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,rcx
+ xor r12,r8
+ vpaddq xmm1,xmm1,xmm11
+ shrd r13,r13,4
+ xor r14,r10
+ vpsrlq xmm11,xmm8,7
+ and r12,rcx
+ xor r13,rcx
+ vpsllq xmm9,xmm8,56
+ add r9,QWORD[16+rsp]
+ mov r15,r10
+ vpxor xmm8,xmm11,xmm10
+ xor r12,r8
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,r11
+ add r9,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,r10
+ add r9,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r11
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm0,6
+ add rbx,r9
+ add r9,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,rbx
+ add r14,r9
+ vpsllq xmm10,xmm0,3
+ shrd r13,r13,23
+ mov r9,r14
+ vpaddq xmm1,xmm1,xmm8
+ mov r12,rcx
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm0,19
+ xor r13,rbx
+ xor r12,rdx
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,r9
+ vpsllq xmm10,xmm10,42
+ and r12,rbx
+ xor r13,rbx
+ vpxor xmm11,xmm11,xmm9
+ add r8,QWORD[24+rsp]
+ mov rdi,r9
+ vpsrlq xmm9,xmm9,42
+ xor r12,rdx
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,r10
+ add r8,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm1,xmm1,xmm11
+ xor r14,r9
+ add r8,r13
+ vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp]
+ xor r15,r10
+ shrd r14,r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ vmovdqa XMMWORD[16+rsp],xmm10
+ vpalignr xmm8,xmm3,xmm2,8
+ shrd r13,r13,23
+ mov r8,r14
+ vpalignr xmm11,xmm7,xmm6,8
+ mov r12,rbx
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,rax
+ xor r12,rcx
+ vpaddq xmm2,xmm2,xmm11
+ shrd r13,r13,4
+ xor r14,r8
+ vpsrlq xmm11,xmm8,7
+ and r12,rax
+ xor r13,rax
+ vpsllq xmm9,xmm8,56
+ add rdx,QWORD[32+rsp]
+ mov r15,r8
+ vpxor xmm8,xmm11,xmm10
+ xor r12,rcx
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,r9
+ add rdx,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,r8
+ add rdx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r9
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm1,6
+ add r11,rdx
+ add rdx,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,r11
+ add r14,rdx
+ vpsllq xmm10,xmm1,3
+ shrd r13,r13,23
+ mov rdx,r14
+ vpaddq xmm2,xmm2,xmm8
+ mov r12,rax
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm1,19
+ xor r13,r11
+ xor r12,rbx
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,rdx
+ vpsllq xmm10,xmm10,42
+ and r12,r11
+ xor r13,r11
+ vpxor xmm11,xmm11,xmm9
+ add rcx,QWORD[40+rsp]
+ mov rdi,rdx
+ vpsrlq xmm9,xmm9,42
+ xor r12,rbx
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,r8
+ add rcx,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm2,xmm2,xmm11
+ xor r14,rdx
+ add rcx,r13
+ vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
+ xor r15,r8
+ shrd r14,r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ vmovdqa XMMWORD[32+rsp],xmm10
+ vpalignr xmm8,xmm4,xmm3,8
+ shrd r13,r13,23
+ mov rcx,r14
+ vpalignr xmm11,xmm0,xmm7,8
+ mov r12,r11
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,r10
+ xor r12,rax
+ vpaddq xmm3,xmm3,xmm11
+ shrd r13,r13,4
+ xor r14,rcx
+ vpsrlq xmm11,xmm8,7
+ and r12,r10
+ xor r13,r10
+ vpsllq xmm9,xmm8,56
+ add rbx,QWORD[48+rsp]
+ mov r15,rcx
+ vpxor xmm8,xmm11,xmm10
+ xor r12,rax
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,rdx
+ add rbx,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,rcx
+ add rbx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rdx
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm2,6
+ add r9,rbx
+ add rbx,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,r9
+ add r14,rbx
+ vpsllq xmm10,xmm2,3
+ shrd r13,r13,23
+ mov rbx,r14
+ vpaddq xmm3,xmm3,xmm8
+ mov r12,r10
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm2,19
+ xor r13,r9
+ xor r12,r11
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,rbx
+ vpsllq xmm10,xmm10,42
+ and r12,r9
+ xor r13,r9
+ vpxor xmm11,xmm11,xmm9
+ add rax,QWORD[56+rsp]
+ mov rdi,rbx
+ vpsrlq xmm9,xmm9,42
+ xor r12,r11
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,rcx
+ add rax,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm3,xmm3,xmm11
+ xor r14,rbx
+ add rax,r13
+ vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp]
+ xor r15,rcx
+ shrd r14,r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ vmovdqa XMMWORD[48+rsp],xmm10
+ vpalignr xmm8,xmm5,xmm4,8
+ shrd r13,r13,23
+ mov rax,r14
+ vpalignr xmm11,xmm1,xmm0,8
+ mov r12,r9
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,r8
+ xor r12,r10
+ vpaddq xmm4,xmm4,xmm11
+ shrd r13,r13,4
+ xor r14,rax
+ vpsrlq xmm11,xmm8,7
+ and r12,r8
+ xor r13,r8
+ vpsllq xmm9,xmm8,56
+ add r11,QWORD[64+rsp]
+ mov r15,rax
+ vpxor xmm8,xmm11,xmm10
+ xor r12,r10
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,rbx
+ add r11,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,rax
+ add r11,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rbx
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm3,6
+ add rdx,r11
+ add r11,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,rdx
+ add r14,r11
+ vpsllq xmm10,xmm3,3
+ shrd r13,r13,23
+ mov r11,r14
+ vpaddq xmm4,xmm4,xmm8
+ mov r12,r8
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm3,19
+ xor r13,rdx
+ xor r12,r9
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,r11
+ vpsllq xmm10,xmm10,42
+ and r12,rdx
+ xor r13,rdx
+ vpxor xmm11,xmm11,xmm9
+ add r10,QWORD[72+rsp]
+ mov rdi,r11
+ vpsrlq xmm9,xmm9,42
+ xor r12,r9
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,rax
+ add r10,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm4,xmm4,xmm11
+ xor r14,r11
+ add r10,r13
+ vpaddq xmm10,xmm4,XMMWORD[rbp]
+ xor r15,rax
+ shrd r14,r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ vmovdqa XMMWORD[64+rsp],xmm10
+ vpalignr xmm8,xmm6,xmm5,8
+ shrd r13,r13,23
+ mov r10,r14
+ vpalignr xmm11,xmm2,xmm1,8
+ mov r12,rdx
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,rcx
+ xor r12,r8
+ vpaddq xmm5,xmm5,xmm11
+ shrd r13,r13,4
+ xor r14,r10
+ vpsrlq xmm11,xmm8,7
+ and r12,rcx
+ xor r13,rcx
+ vpsllq xmm9,xmm8,56
+ add r9,QWORD[80+rsp]
+ mov r15,r10
+ vpxor xmm8,xmm11,xmm10
+ xor r12,r8
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,r11
+ add r9,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,r10
+ add r9,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r11
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm4,6
+ add rbx,r9
+ add r9,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,rbx
+ add r14,r9
+ vpsllq xmm10,xmm4,3
+ shrd r13,r13,23
+ mov r9,r14
+ vpaddq xmm5,xmm5,xmm8
+ mov r12,rcx
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm4,19
+ xor r13,rbx
+ xor r12,rdx
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,r9
+ vpsllq xmm10,xmm10,42
+ and r12,rbx
+ xor r13,rbx
+ vpxor xmm11,xmm11,xmm9
+ add r8,QWORD[88+rsp]
+ mov rdi,r9
+ vpsrlq xmm9,xmm9,42
+ xor r12,rdx
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,r10
+ add r8,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm5,xmm5,xmm11
+ xor r14,r9
+ add r8,r13
+ vpaddq xmm10,xmm5,XMMWORD[32+rbp]
+ xor r15,r10
+ shrd r14,r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ vmovdqa XMMWORD[80+rsp],xmm10
+ vpalignr xmm8,xmm7,xmm6,8
+ shrd r13,r13,23
+ mov r8,r14
+ vpalignr xmm11,xmm3,xmm2,8
+ mov r12,rbx
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,rax
+ xor r12,rcx
+ vpaddq xmm6,xmm6,xmm11
+ shrd r13,r13,4
+ xor r14,r8
+ vpsrlq xmm11,xmm8,7
+ and r12,rax
+ xor r13,rax
+ vpsllq xmm9,xmm8,56
+ add rdx,QWORD[96+rsp]
+ mov r15,r8
+ vpxor xmm8,xmm11,xmm10
+ xor r12,rcx
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,r9
+ add rdx,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,r8
+ add rdx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r9
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm5,6
+ add r11,rdx
+ add rdx,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,r11
+ add r14,rdx
+ vpsllq xmm10,xmm5,3
+ shrd r13,r13,23
+ mov rdx,r14
+ vpaddq xmm6,xmm6,xmm8
+ mov r12,rax
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm5,19
+ xor r13,r11
+ xor r12,rbx
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,rdx
+ vpsllq xmm10,xmm10,42
+ and r12,r11
+ xor r13,r11
+ vpxor xmm11,xmm11,xmm9
+ add rcx,QWORD[104+rsp]
+ mov rdi,rdx
+ vpsrlq xmm9,xmm9,42
+ xor r12,rbx
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,r8
+ add rcx,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm6,xmm6,xmm11
+ xor r14,rdx
+ add rcx,r13
+ vpaddq xmm10,xmm6,XMMWORD[64+rbp]
+ xor r15,r8
+ shrd r14,r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ vmovdqa XMMWORD[96+rsp],xmm10
+ vpalignr xmm8,xmm0,xmm7,8
+ shrd r13,r13,23
+ mov rcx,r14
+ vpalignr xmm11,xmm4,xmm3,8
+ mov r12,r11
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,r10
+ xor r12,rax
+ vpaddq xmm7,xmm7,xmm11
+ shrd r13,r13,4
+ xor r14,rcx
+ vpsrlq xmm11,xmm8,7
+ and r12,r10
+ xor r13,r10
+ vpsllq xmm9,xmm8,56
+ add rbx,QWORD[112+rsp]
+ mov r15,rcx
+ vpxor xmm8,xmm11,xmm10
+ xor r12,rax
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,rdx
+ add rbx,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,rcx
+ add rbx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rdx
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm6,6
+ add r9,rbx
+ add rbx,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,r9
+ add r14,rbx
+ vpsllq xmm10,xmm6,3
+ shrd r13,r13,23
+ mov rbx,r14
+ vpaddq xmm7,xmm7,xmm8
+ mov r12,r10
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm6,19
+ xor r13,r9
+ xor r12,r11
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,rbx
+ vpsllq xmm10,xmm10,42
+ and r12,r9
+ xor r13,r9
+ vpxor xmm11,xmm11,xmm9
+ add rax,QWORD[120+rsp]
+ mov rdi,rbx
+ vpsrlq xmm9,xmm9,42
+ xor r12,r11
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,rcx
+ add rax,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm7,xmm7,xmm11
+ xor r14,rbx
+ add rax,r13
+ vpaddq xmm10,xmm7,XMMWORD[96+rbp]
+ xor r15,rcx
+ shrd r14,r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ vmovdqa XMMWORD[112+rsp],xmm10
+ cmp BYTE[135+rbp],0
+ jne NEAR $L$avx_00_47
+ shrd r13,r13,23
+ mov rax,r14
+ mov r12,r9
+ shrd r14,r14,5
+ xor r13,r8
+ xor r12,r10
+ shrd r13,r13,4
+ xor r14,rax
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[rsp]
+ mov r15,rax
+ xor r12,r10
+ shrd r14,r14,6
+ xor r15,rbx
+ add r11,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,rax
+ add r11,r13
+ xor rdi,rbx
+ shrd r14,r14,28
+ add rdx,r11
+ add r11,rdi
+ mov r13,rdx
+ add r14,r11
+ shrd r13,r13,23
+ mov r11,r14
+ mov r12,r8
+ shrd r14,r14,5
+ xor r13,rdx
+ xor r12,r9
+ shrd r13,r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ add r10,QWORD[8+rsp]
+ mov rdi,r11
+ xor r12,r9
+ shrd r14,r14,6
+ xor rdi,rax
+ add r10,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ shrd r14,r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ shrd r13,r13,23
+ mov r10,r14
+ mov r12,rdx
+ shrd r14,r14,5
+ xor r13,rcx
+ xor r12,r8
+ shrd r13,r13,4
+ xor r14,r10
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[16+rsp]
+ mov r15,r10
+ xor r12,r8
+ shrd r14,r14,6
+ xor r15,r11
+ add r9,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,r10
+ add r9,r13
+ xor rdi,r11
+ shrd r14,r14,28
+ add rbx,r9
+ add r9,rdi
+ mov r13,rbx
+ add r14,r9
+ shrd r13,r13,23
+ mov r9,r14
+ mov r12,rcx
+ shrd r14,r14,5
+ xor r13,rbx
+ xor r12,rdx
+ shrd r13,r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ add r8,QWORD[24+rsp]
+ mov rdi,r9
+ xor r12,rdx
+ shrd r14,r14,6
+ xor rdi,r10
+ add r8,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ shrd r14,r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ shrd r13,r13,23
+ mov r8,r14
+ mov r12,rbx
+ shrd r14,r14,5
+ xor r13,rax
+ xor r12,rcx
+ shrd r13,r13,4
+ xor r14,r8
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[32+rsp]
+ mov r15,r8
+ xor r12,rcx
+ shrd r14,r14,6
+ xor r15,r9
+ add rdx,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,r8
+ add rdx,r13
+ xor rdi,r9
+ shrd r14,r14,28
+ add r11,rdx
+ add rdx,rdi
+ mov r13,r11
+ add r14,rdx
+ shrd r13,r13,23
+ mov rdx,r14
+ mov r12,rax
+ shrd r14,r14,5
+ xor r13,r11
+ xor r12,rbx
+ shrd r13,r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ add rcx,QWORD[40+rsp]
+ mov rdi,rdx
+ xor r12,rbx
+ shrd r14,r14,6
+ xor rdi,r8
+ add rcx,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ shrd r14,r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ shrd r13,r13,23
+ mov rcx,r14
+ mov r12,r11
+ shrd r14,r14,5
+ xor r13,r10
+ xor r12,rax
+ shrd r13,r13,4
+ xor r14,rcx
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[48+rsp]
+ mov r15,rcx
+ xor r12,rax
+ shrd r14,r14,6
+ xor r15,rdx
+ add rbx,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,rcx
+ add rbx,r13
+ xor rdi,rdx
+ shrd r14,r14,28
+ add r9,rbx
+ add rbx,rdi
+ mov r13,r9
+ add r14,rbx
+ shrd r13,r13,23
+ mov rbx,r14
+ mov r12,r10
+ shrd r14,r14,5
+ xor r13,r9
+ xor r12,r11
+ shrd r13,r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ add rax,QWORD[56+rsp]
+ mov rdi,rbx
+ xor r12,r11
+ shrd r14,r14,6
+ xor rdi,rcx
+ add rax,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ shrd r14,r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ shrd r13,r13,23
+ mov rax,r14
+ mov r12,r9
+ shrd r14,r14,5
+ xor r13,r8
+ xor r12,r10
+ shrd r13,r13,4
+ xor r14,rax
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[64+rsp]
+ mov r15,rax
+ xor r12,r10
+ shrd r14,r14,6
+ xor r15,rbx
+ add r11,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,rax
+ add r11,r13
+ xor rdi,rbx
+ shrd r14,r14,28
+ add rdx,r11
+ add r11,rdi
+ mov r13,rdx
+ add r14,r11
+ shrd r13,r13,23
+ mov r11,r14
+ mov r12,r8
+ shrd r14,r14,5
+ xor r13,rdx
+ xor r12,r9
+ shrd r13,r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ add r10,QWORD[72+rsp]
+ mov rdi,r11
+ xor r12,r9
+ shrd r14,r14,6
+ xor rdi,rax
+ add r10,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ shrd r14,r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ shrd r13,r13,23
+ mov r10,r14
+ mov r12,rdx
+ shrd r14,r14,5
+ xor r13,rcx
+ xor r12,r8
+ shrd r13,r13,4
+ xor r14,r10
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[80+rsp]
+ mov r15,r10
+ xor r12,r8
+ shrd r14,r14,6
+ xor r15,r11
+ add r9,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,r10
+ add r9,r13
+ xor rdi,r11
+ shrd r14,r14,28
+ add rbx,r9
+ add r9,rdi
+ mov r13,rbx
+ add r14,r9
+ shrd r13,r13,23
+ mov r9,r14
+ mov r12,rcx
+ shrd r14,r14,5
+ xor r13,rbx
+ xor r12,rdx
+ shrd r13,r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ add r8,QWORD[88+rsp]
+ mov rdi,r9
+ xor r12,rdx
+ shrd r14,r14,6
+ xor rdi,r10
+ add r8,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ shrd r14,r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ shrd r13,r13,23
+ mov r8,r14
+ mov r12,rbx
+ shrd r14,r14,5
+ xor r13,rax
+ xor r12,rcx
+ shrd r13,r13,4
+ xor r14,r8
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[96+rsp]
+ mov r15,r8
+ xor r12,rcx
+ shrd r14,r14,6
+ xor r15,r9
+ add rdx,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,r8
+ add rdx,r13
+ xor rdi,r9
+ shrd r14,r14,28
+ add r11,rdx
+ add rdx,rdi
+ mov r13,r11
+ add r14,rdx
+ shrd r13,r13,23
+ mov rdx,r14
+ mov r12,rax
+ shrd r14,r14,5
+ xor r13,r11
+ xor r12,rbx
+ shrd r13,r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ add rcx,QWORD[104+rsp]
+ mov rdi,rdx
+ xor r12,rbx
+ shrd r14,r14,6
+ xor rdi,r8
+ add rcx,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ shrd r14,r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ shrd r13,r13,23
+ mov rcx,r14
+ mov r12,r11
+ shrd r14,r14,5
+ xor r13,r10
+ xor r12,rax
+ shrd r13,r13,4
+ xor r14,rcx
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[112+rsp]
+ mov r15,rcx
+ xor r12,rax
+ shrd r14,r14,6
+ xor r15,rdx
+ add rbx,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,rcx
+ add rbx,r13
+ xor rdi,rdx
+ shrd r14,r14,28
+ add r9,rbx
+ add rbx,rdi
+ mov r13,r9
+ add r14,rbx
+ shrd r13,r13,23
+ mov rbx,r14
+ mov r12,r10
+ shrd r14,r14,5
+ xor r13,r9
+ xor r12,r11
+ shrd r13,r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ add rax,QWORD[120+rsp]
+ mov rdi,rbx
+ xor r12,r11
+ shrd r14,r14,6
+ xor rdi,rcx
+ add rax,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ shrd r14,r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ mov rdi,QWORD[((128+0))+rsp]
+ mov rax,r14
+
+ add rax,QWORD[rdi]
+ lea rsi,[128+rsi]
+ add rbx,QWORD[8+rdi]
+ add rcx,QWORD[16+rdi]
+ add rdx,QWORD[24+rdi]
+ add r8,QWORD[32+rdi]
+ add r9,QWORD[40+rdi]
+ add r10,QWORD[48+rdi]
+ add r11,QWORD[56+rdi]
+
+ cmp rsi,QWORD[((128+16))+rsp]
+
+ mov QWORD[rdi],rax
+ mov QWORD[8+rdi],rbx
+ mov QWORD[16+rdi],rcx
+ mov QWORD[24+rdi],rdx
+ mov QWORD[32+rdi],r8
+ mov QWORD[40+rdi],r9
+ mov QWORD[48+rdi],r10
+ mov QWORD[56+rdi],r11
+ jb NEAR $L$loop_avx
+
+ mov rsi,QWORD[152+rsp]
+
+ vzeroupper
+ movaps xmm6,XMMWORD[((128+32))+rsp]
+ movaps xmm7,XMMWORD[((128+48))+rsp]
+ movaps xmm8,XMMWORD[((128+64))+rsp]
+ movaps xmm9,XMMWORD[((128+80))+rsp]
+ movaps xmm10,XMMWORD[((128+96))+rsp]
+ movaps xmm11,XMMWORD[((128+112))+rsp]
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$epilogue_avx:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_sha512_block_data_order_avx:
+EXTERN __imp_RtlVirtualUnwind
+
+ALIGN 16
+se_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ mov rsi,QWORD[8+r9]
+ mov r11,QWORD[56+r9]
+
+ mov r10d,DWORD[r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$in_prologue
+
+ mov rax,QWORD[152+r8]
+
+ mov r10d,DWORD[4+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$in_prologue
+ mov rsi,rax
+ mov rax,QWORD[((128+24))+rax]
+
+ mov rbx,QWORD[((-8))+rax]
+ mov rbp,QWORD[((-16))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov r15,QWORD[((-48))+rax]
+ mov QWORD[144+r8],rbx
+ mov QWORD[160+r8],rbp
+ mov QWORD[216+r8],r12
+ mov QWORD[224+r8],r13
+ mov QWORD[232+r8],r14
+ mov QWORD[240+r8],r15
+
+ lea r10,[$L$epilogue]
+ cmp rbx,r10
+ jb NEAR $L$in_prologue
+
+ lea rsi,[((128+32))+rsi]
+ lea rdi,[512+r8]
+ mov ecx,12
+ DD 0xa548f3fc
+
+$L$in_prologue:
+ mov rdi,QWORD[8+rax]
+ mov rsi,QWORD[16+rax]
+ mov QWORD[152+r8],rax
+ mov QWORD[168+r8],rsi
+ mov QWORD[176+r8],rdi
+
+ mov rdi,QWORD[40+r9]
+ mov rsi,r8
+ mov ecx,154
+ DD 0xa548f3fc
+
+ mov rsi,r9
+ xor rcx,rcx
+ mov rdx,QWORD[8+rsi]
+ mov r8,QWORD[rsi]
+ mov r9,QWORD[16+rsi]
+ mov r10,QWORD[40+rsi]
+ lea r11,[56+rsi]
+ lea r12,[24+rsi]
+ mov QWORD[32+rsp],r10
+ mov QWORD[40+rsp],r11
+ mov QWORD[48+rsp],r12
+ mov QWORD[56+rsp],rcx
+ call QWORD[__imp_RtlVirtualUnwind]
+
+ mov eax,1
+ add rsp,64
+ popfq
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ ret
+
+section .pdata rdata align=4
+ALIGN 4
+ DD $L$SEH_begin_sha512_block_data_order_nohw wrt ..imagebase
+ DD $L$SEH_end_sha512_block_data_order_nohw wrt ..imagebase
+ DD $L$SEH_info_sha512_block_data_order_nohw wrt ..imagebase
+ DD $L$SEH_begin_sha512_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_end_sha512_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_info_sha512_block_data_order_avx wrt ..imagebase
+section .xdata rdata align=8
+ALIGN 8
+$L$SEH_info_sha512_block_data_order_nohw:
+ DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase
+$L$SEH_info_sha512_block_data_order_avx:
+ DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/vpaes-armv7-linux.S b/gen/bcm/vpaes-armv7-linux.S
new file mode 100644
index 0000000..6e7898a
--- /dev/null
+++ b/gen/bcm/vpaes-armv7-linux.S
@@ -0,0 +1,1225 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+.syntax unified
+
+.arch armv7-a
+.fpu neon
+
+#if defined(__thumb2__)
+.thumb
+#else
+.code 32
+#endif
+
+.text
+
+.type _vpaes_consts,%object
+.align 7 @ totally strategic alignment
+_vpaes_consts:
+.Lk_mc_forward:@ mc_forward
+.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad 0x080B0A0904070605, 0x000302010C0F0E0D
+.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad 0x000302010C0F0E0D, 0x080B0A0904070605
+.Lk_mc_backward:@ mc_backward
+.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad 0x020100030E0D0C0F, 0x0A09080B06050407
+.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad 0x0A09080B06050407, 0x020100030E0D0C0F
+.Lk_sr:@ sr
+.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad 0x030E09040F0A0500, 0x0B06010C07020D08
+.quad 0x0F060D040B020900, 0x070E050C030A0108
+.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+@
+@ "Hot" constants
+@
+.Lk_inv:@ inv, inva
+.quad 0x0E05060F0D080180, 0x040703090A0B0C02
+.quad 0x01040A060F0B0780, 0x030D0E0C02050809
+.Lk_ipt:@ input transform (lo, hi)
+.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+.Lk_sbo:@ sbou, sbot
+.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+.Lk_sb1:@ sb1u, sb1t
+.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.Lk_sb2:@ sb2u, sb2t
+.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align 2
+.size _vpaes_consts,.-_vpaes_consts
+.align 6
+@@
+@@ _aes_preheat
+@@
+@@ Fills q9-q15 as specified below.
+@@
+.type _vpaes_preheat,%function
+.align 4
+_vpaes_preheat:
+ adr r10, .Lk_inv
+ vmov.i8 q9, #0x0f @ .Lk_s0F
+ vld1.64 {q10,q11}, [r10]! @ .Lk_inv
+ add r10, r10, #64 @ Skip .Lk_ipt, .Lk_sbo
+ vld1.64 {q12,q13}, [r10]! @ .Lk_sb1
+ vld1.64 {q14,q15}, [r10] @ .Lk_sb2
+ bx lr
+
+@@
+@@ _aes_encrypt_core
+@@
+@@ AES-encrypt q0.
+@@
+@@ Inputs:
+@@ q0 = input
+@@ q9-q15 as in _vpaes_preheat
+@@ [r2] = scheduled keys
+@@
+@@ Output in q0
+@@ Clobbers q1-q5, r8-r11
+@@ Preserves q6-q8 so you get some local vectors
+@@
+@@
+.type _vpaes_encrypt_core,%function
+.align 4
+_vpaes_encrypt_core:
+ mov r9, r2
+ ldr r8, [r2,#240] @ pull rounds
+ adr r11, .Lk_ipt
+ @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
+ vld1.64 {q2, q3}, [r11]
+ adr r11, .Lk_mc_forward+16
+ vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 # round0 key
+ vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
+ vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0
+ vtbl.8 d2, {q2}, d2 @ vpshufb %xmm1, %xmm2, %xmm1
+ vtbl.8 d3, {q2}, d3
+ vtbl.8 d4, {q3}, d0 @ vpshufb %xmm0, %xmm3, %xmm2
+ vtbl.8 d5, {q3}, d1
+ veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0
+ veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
+
+	@ .Lenc_entry ends with a bne instruction which is normally paired with
+ @ subs in .Lenc_loop.
+ tst r8, r8
+ b .Lenc_entry
+
+.align 4
+.Lenc_loop:
+ @ middle of middle round
+ add r10, r11, #0x40
+ vtbl.8 d8, {q13}, d4 @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
+ vtbl.8 d9, {q13}, d5
+ vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
+ vtbl.8 d0, {q12}, d6 @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
+ vtbl.8 d1, {q12}, d7
+ veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ vtbl.8 d10, {q15}, d4 @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
+ vtbl.8 d11, {q15}, d5
+ veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ vtbl.8 d4, {q14}, d6 @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
+ vtbl.8 d5, {q14}, d7
+ vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
+ vtbl.8 d6, {q0}, d2 @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
+ vtbl.8 d7, {q0}, d3
+ veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
+ @ Write to q5 instead of q0, so the table and destination registers do
+ @ not overlap.
+ vtbl.8 d10, {q0}, d8 @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
+ vtbl.8 d11, {q0}, d9
+ veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
+ vtbl.8 d8, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
+ vtbl.8 d9, {q3}, d3
+ @ Here we restore the original q0/q5 usage.
+ veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
+ and r11, r11, #~(1<<6) @ and $0x30, %r11 # ... mod 4
+ veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
+ subs r8, r8, #1 @ nr--
+
+.Lenc_entry:
+ @ top of round
+ vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k
+ vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ vtbl.8 d10, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
+ vtbl.8 d11, {q11}, d3
+ veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ vtbl.8 d7, {q10}, d1
+ vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ vtbl.8 d9, {q10}, d3
+ veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ vtbl.8 d5, {q10}, d7
+ vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ vtbl.8 d7, {q10}, d9
+ veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5
+ bne .Lenc_loop
+
+ @ middle of last round
+ add r10, r11, #0x80
+
+ adr r11, .Lk_sbo
+ @ Read to q1 instead of q4, so the vtbl.8 instruction below does not
+ @ overlap table and destination registers.
+ vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou
+ vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ vtbl.8 d9, {q1}, d5
+ vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
+ @ Write to q2 instead of q0 below, to avoid overlapping table and
+ @ destination registers.
+ vtbl.8 d4, {q0}, d6 @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
+ vtbl.8 d5, {q0}, d7
+ veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ @ Here we restore the original q0/q2 usage.
+ vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0
+ vtbl.8 d1, {q2}, d3
+ bx lr
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+.globl vpaes_encrypt
+.hidden vpaes_encrypt
+.type vpaes_encrypt,%function
+.align 4
+vpaes_encrypt:
+ @ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack
+ @ alignment.
+ stmdb sp!, {r7,r8,r9,r10,r11,lr}
+ @ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved.
+ vstmdb sp!, {d8,d9,d10,d11}
+
+ vld1.64 {q0}, [r0]
+ bl _vpaes_preheat
+ bl _vpaes_encrypt_core
+ vst1.64 {q0}, [r1]
+
+ vldmia sp!, {d8,d9,d10,d11}
+ ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
+.size vpaes_encrypt,.-vpaes_encrypt
+
+@
+@ Decryption stuff
+@
+.type _vpaes_decrypt_consts,%object
+.align 4
+_vpaes_decrypt_consts:
+.Lk_dipt:@ decryption input transform
+.quad 0x0F505B040B545F00, 0x154A411E114E451A
+.quad 0x86E383E660056500, 0x12771772F491F194
+.Lk_dsbo:@ decryption sbox final output
+.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.Lk_dsb9:@ decryption sbox output *9*u, *9*t
+.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:@ decryption sbox output *D*u, *D*t
+.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:@ decryption sbox output *B*u, *B*t
+.quad 0xD022649296B44200, 0x602646F6B0F2D404
+.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:@ decryption sbox output *E*u, *E*t
+.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.size _vpaes_decrypt_consts,.-_vpaes_decrypt_consts
+
+@@
+@@ Decryption core
+@@
+@@ Same API as encryption core, except it clobbers q12-q15 rather than using
+@@ the values from _vpaes_preheat. q9-q11 must still be set from
+@@ _vpaes_preheat.
+@@
+.type _vpaes_decrypt_core,%function
+.align 4
+_vpaes_decrypt_core:
+ mov r9, r2
+ ldr r8, [r2,#240] @ pull rounds
+
+ @ This function performs shuffles with various constants. The x86_64
+ @ version loads them on-demand into %xmm0-%xmm5. This does not work well
+ @ for ARMv7 because those registers are shuffle destinations. The ARMv8
+ @ version preloads those constants into registers, but ARMv7 has half
+ @ the registers to work with. Instead, we load them on-demand into
+	@ q12-q15, registers normally used for preloaded constants. This is fine
+ @ because decryption doesn't use those constants. The values are
+ @ constant, so this does not interfere with potential 2x optimizations.
+ adr r7, .Lk_dipt
+
+ vld1.64 {q12,q13}, [r7] @ vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
+ lsl r11, r8, #4 @ mov %rax, %r11; shl $4, %r11
+ eor r11, r11, #0x30 @ xor $0x30, %r11
+ adr r10, .Lk_sr
+ and r11, r11, #0x30 @ and $0x30, %r11
+ add r11, r11, r10
+ adr r10, .Lk_mc_forward+48
+
+ vld1.64 {q4}, [r9]! @ vmovdqu (%r9), %xmm4 # round0 key
+ vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
+ vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0
+ vtbl.8 d4, {q12}, d2 @ vpshufb %xmm1, %xmm2, %xmm2
+ vtbl.8 d5, {q12}, d3
+ vld1.64 {q5}, [r10] @ vmovdqa .Lk_mc_forward+48(%rip), %xmm5
+ @ vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
+ vtbl.8 d0, {q13}, d0 @ vpshufb %xmm0, %xmm1, %xmm0
+ vtbl.8 d1, {q13}, d1
+ veor q2, q2, q4 @ vpxor %xmm4, %xmm2, %xmm2
+ veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
+
+	@ .Ldec_entry ends with a bne instruction which is normally paired with
+ @ subs in .Ldec_loop.
+ tst r8, r8
+ b .Ldec_entry
+
+.align 4
+.Ldec_loop:
+@
+@ Inverse mix columns
+@
+
+ @ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of
+ @ the function.
+ adr r10, .Lk_dsb9
+ vld1.64 {q12,q13}, [r10]! @ vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ @ vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
+ @ Load sbd* ahead of time.
+ vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ @ vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
+ vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
+ vtbl.8 d9, {q12}, d5
+ vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
+ vtbl.8 d3, {q13}, d7
+ veor q0, q4, q0 @ vpxor %xmm4, %xmm0, %xmm0
+
+ veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+
+ @ Load sbb* ahead of time.
+ vld1.64 {q12,q13}, [r10]! @ vmovdqa 0x20(%r10),%xmm4 # 4 : sbbu
+ @ vmovdqa 0x30(%r10),%xmm1 # 0 : sbbt
+
+ vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
+ vtbl.8 d9, {q14}, d5
+ @ Write to q1 instead of q0, so the table and destination registers do
+ @ not overlap.
+ vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ vtbl.8 d3, {q0}, d11
+ @ Here we restore the original q0/q1 usage. This instruction is
+ @ reordered from the ARMv8 version so we do not clobber the vtbl.8
+ @ below.
+ veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
+ vtbl.8 d3, {q15}, d7
+ @ vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
+ veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ @ vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
+
+	@ Load sbe* ahead of time.
+ vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x40(%r10),%xmm4 # 4 : sbeu
+ @ vmovdqa 0x50(%r10),%xmm1 # 0 : sbet
+
+ vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
+ vtbl.8 d9, {q12}, d5
+ @ Write to q1 instead of q0, so the table and destination registers do
+ @ not overlap.
+ vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ vtbl.8 d3, {q0}, d11
+ @ Here we restore the original q0/q1 usage. This instruction is
+ @ reordered from the ARMv8 version so we do not clobber the vtbl.8
+ @ below.
+ veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
+ vtbl.8 d3, {q13}, d7
+ veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+
+ vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
+ vtbl.8 d9, {q14}, d5
+ @ Write to q1 instead of q0, so the table and destination registers do
+ @ not overlap.
+ vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ vtbl.8 d3, {q0}, d11
+ @ Here we restore the original q0/q1 usage. This instruction is
+ @ reordered from the ARMv8 version so we do not clobber the vtbl.8
+ @ below.
+ veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
+ vtbl.8 d3, {q15}, d7
+ vext.8 q5, q5, q5, #12 @ vpalignr $12, %xmm5, %xmm5, %xmm5
+ veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ subs r8, r8, #1 @ sub $1,%rax # nr--
+
+.Ldec_entry:
+ @ top of round
+ vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ vtbl.8 d5, {q11}, d3
+ veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ vtbl.8 d7, {q10}, d1
+ vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ vtbl.8 d9, {q10}, d3
+ veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ vtbl.8 d5, {q10}, d7
+ vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ vtbl.8 d7, {q10}, d9
+ veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ vld1.64 {q0}, [r9]! @ vmovdqu (%r9), %xmm0
+ bne .Ldec_loop
+
+ @ middle of last round
+
+ adr r10, .Lk_dsbo
+
+ @ Write to q1 rather than q4 to avoid overlapping table and destination.
+ vld1.64 {q1}, [r10]! @ vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
+ vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ vtbl.8 d9, {q1}, d5
+ @ Write to q2 rather than q1 to avoid overlapping table and destination.
+ vld1.64 {q2}, [r10] @ vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
+ vtbl.8 d2, {q2}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
+ vtbl.8 d3, {q2}, d7
+ vld1.64 {q2}, [r11] @ vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
+ veor q4, q4, q0 @ vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
+ @ Write to q1 rather than q0 so the table and destination registers
+ @ below do not overlap.
+ veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm0 # 0 = A
+ vtbl.8 d0, {q1}, d4 @ vpshufb %xmm2, %xmm0, %xmm0
+ vtbl.8 d1, {q1}, d5
+ bx lr
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+.globl vpaes_decrypt
+.hidden vpaes_decrypt
+.type vpaes_decrypt,%function
+.align 4
+vpaes_decrypt:
+ @ _vpaes_decrypt_core uses r7-r11.
+ stmdb sp!, {r7,r8,r9,r10,r11,lr}
+ @ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved.
+ vstmdb sp!, {d8,d9,d10,d11}
+
+ vld1.64 {q0}, [r0]
+ bl _vpaes_preheat
+ bl _vpaes_decrypt_core
+ vst1.64 {q0}, [r1]
+
+ vldmia sp!, {d8,d9,d10,d11}
+ ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
+.size vpaes_decrypt,.-vpaes_decrypt
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@@ @@
+@@ AES key schedule @@
+@@ @@
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+@ This function diverges from both x86_64 and aarch64 in which constants are
+@ pinned. x86_64 has a common preheat function for all operations. aarch64
+@ separates them because it has enough registers to pin nearly all constants.
+@ armv7 does not have enough registers, but needing explicit loads and stores
+@ also complicates using x86_64's register allocation directly.
+@
+@ We pin some constants for convenience and leave q14 and q15 free to load
+@ others on demand.
+
+@
+@ Key schedule constants
+@
+.type _vpaes_key_consts,%object
+.align 4
+_vpaes_key_consts:
+.Lk_dksd:@ decryption key schedule: invskew x*D
+.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:@ decryption key schedule: invskew x*B
+.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:@ decryption key schedule: invskew x*E + 0x63
+.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:@ decryption key schedule: invskew x*9
+.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+.Lk_rcon:@ rcon
+.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_opt:@ output transform
+.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+.Lk_deskew:@ deskew tables: inverts the sbox's "skew"
+.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+.size _vpaes_key_consts,.-_vpaes_key_consts
+
+.type _vpaes_key_preheat,%function
+.align 4
+_vpaes_key_preheat:
+ adr r11, .Lk_rcon
+ vmov.i8 q12, #0x5b @ .Lk_s63
+ adr r10, .Lk_inv @ Must be aligned to 8 mod 16.
+ vmov.i8 q9, #0x0f @ .Lk_s0F
+ vld1.64 {q10,q11}, [r10] @ .Lk_inv
+ vld1.64 {q8}, [r11] @ .Lk_rcon
+ bx lr
+.size _vpaes_key_preheat,.-_vpaes_key_preheat
+
+.type _vpaes_schedule_core,%function
+.align 4
+_vpaes_schedule_core:
+ @ We only need to save lr, but ARM requires an 8-byte stack alignment,
+ @ so save an extra register.
+ stmdb sp!, {r3,lr}
+
+ bl _vpaes_key_preheat @ load the tables
+
+ adr r11, .Lk_ipt @ Must be aligned to 8 mod 16.
+ vld1.64 {q0}, [r0]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned)
+
+ @ input transform
+ @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not
+ @ overlap table and destination.
+ vmov q4, q0 @ vmovdqa %xmm0, %xmm3
+ bl _vpaes_schedule_transform
+ adr r10, .Lk_sr @ Must be aligned to 8 mod 16.
+ vmov q7, q0 @ vmovdqa %xmm0, %xmm7
+
+ add r8, r8, r10
+ tst r3, r3
+ bne .Lschedule_am_decrypting
+
+ @ encrypting, output zeroth round key after transform
+ vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx)
+ b .Lschedule_go
+
+.Lschedule_am_decrypting:
+ @ decrypting, output zeroth round key after shiftrows
+ vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
+ vtbl.8 d6, {q4}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
+ vtbl.8 d7, {q4}, d3
+ vst1.64 {q3}, [r2] @ vmovdqu %xmm3, (%rdx)
+ eor r8, r8, #0x30 @ xor $0x30, %r8
+
+.Lschedule_go:
+ cmp r1, #192 @ cmp $192, %esi
+ bhi .Lschedule_256
+ beq .Lschedule_192
+	@ 128: fall through
+
+@@
+@@ .schedule_128
+@@
+@@ 128-bit specific part of key schedule.
+@@
+@@ This schedule is really simple, because all its parts
+@@ are accomplished by the subroutines.
+@@
+.Lschedule_128:
+ mov r0, #10 @ mov $10, %esi
+
+.Loop_schedule_128:
+ bl _vpaes_schedule_round
+ subs r0, r0, #1 @ dec %esi
+ beq .Lschedule_mangle_last
+ bl _vpaes_schedule_mangle @ write output
+ b .Loop_schedule_128
+
+@@
+@@ .aes_schedule_192
+@@
+@@ 192-bit specific part of key schedule.
+@@
+@@ The main body of this schedule is the same as the 128-bit
+@@ schedule, but with more smearing. The long, high side is
+@@ stored in q7 as before, and the short, low side is in
+@@ the high bits of q6.
+@@
+@@ This schedule is somewhat nastier, however, because each
+@@ round produces 192 bits of key material, or 1.5 round keys.
+@@ Therefore, on each cycle we do 2 rounds and produce 3 round
+@@ keys.
+@@
+.align 4
+.Lschedule_192:
+ sub r0, r0, #8
+ vld1.64 {q0}, [r0] @ vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ bl _vpaes_schedule_transform @ input transform
+ vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save short part
+ vmov.i8 d12, #0 @ vpxor %xmm4, %xmm4, %xmm4 # clear 4
+ @ vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
+ mov r0, #4 @ mov $4, %esi
+
+.Loop_schedule_192:
+ bl _vpaes_schedule_round
+ vext.8 q0, q6, q0, #8 @ vpalignr $8,%xmm6,%xmm0,%xmm0
+ bl _vpaes_schedule_mangle @ save key n
+ bl _vpaes_schedule_192_smear
+ bl _vpaes_schedule_mangle @ save key n+1
+ bl _vpaes_schedule_round
+ subs r0, r0, #1 @ dec %esi
+ beq .Lschedule_mangle_last
+ bl _vpaes_schedule_mangle @ save key n+2
+ bl _vpaes_schedule_192_smear
+ b .Loop_schedule_192
+
+@@
+@@ .aes_schedule_256
+@@
+@@ 256-bit specific part of key schedule.
+@@
+@@ The structure here is very similar to the 128-bit
+@@ schedule, but with an additional "low side" in
+@@ q6. The low side's rounds are the same as the
+@@ high side's, except no rcon and no rotation.
+@@
+.align 4
+.Lschedule_256:
+ vld1.64 {q0}, [r0] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
+ bl _vpaes_schedule_transform @ input transform
+ mov r0, #7 @ mov $7, %esi
+
+.Loop_schedule_256:
+ bl _vpaes_schedule_mangle @ output low result
+ vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
+
+ @ high round
+ bl _vpaes_schedule_round
+ subs r0, r0, #1 @ dec %esi
+ beq .Lschedule_mangle_last
+ bl _vpaes_schedule_mangle
+
+ @ low round. swap xmm7 and xmm6
+ vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0
+ vmov.i8 q4, #0
+ vmov q5, q7 @ vmovdqa %xmm7, %xmm5
+ vmov q7, q6 @ vmovdqa %xmm6, %xmm7
+ bl _vpaes_schedule_low_round
+ vmov q7, q5 @ vmovdqa %xmm5, %xmm7
+
+ b .Loop_schedule_256
+
+@@
+@@ .aes_schedule_mangle_last
+@@
+@@ Mangler for last round of key schedule
+@@ Mangles q0
+@@ when encrypting, outputs out(q0) ^ 63
+@@ when decrypting, outputs unskew(q0)
+@@
+@@ Always called right before return... jumps to cleanup and exits
+@@
+.align 4
+.Lschedule_mangle_last:
+ @ schedule last round key from xmm0
+ adr r11, .Lk_deskew @ lea .Lk_deskew(%rip),%r11 # prepare to deskew
+ tst r3, r3
+ bne .Lschedule_mangle_last_dec
+
+ @ encrypting
+ vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1
+ adr r11, .Lk_opt @ lea .Lk_opt(%rip), %r11 # prepare to output transform
+ add r2, r2, #32 @ add $32, %rdx
+ vmov q2, q0
+ vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 # output permute
+ vtbl.8 d1, {q2}, d3
+
+.Lschedule_mangle_last_dec:
+ sub r2, r2, #16 @ add $-16, %rdx
+ veor q0, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm0
+ bl _vpaes_schedule_transform @ output transform
+ vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) # save last key
+
+ @ cleanup
+ veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0
+ veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1
+ veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2
+ veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3
+ veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4
+ veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5
+ veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6
+ veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7
+ ldmia sp!, {r3,pc} @ return
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+
+@@
+@@ .aes_schedule_192_smear
+@@
+@@ Smear the short, low side in the 192-bit key schedule.
+@@
+@@ Inputs:
+@@ q7: high side, b a x y
+@@ q6: low side, d c 0 0
+@@
+@@ Outputs:
+@@ q6: b+c+d b+c 0 0
+@@ q0: b+c+d b+c b a
+@@
+.type _vpaes_schedule_192_smear,%function
+.align 4
+_vpaes_schedule_192_smear:
+ vmov.i8 q1, #0
+ vdup.32 q0, d15[1]
+ vshl.i64 q1, q6, #32 @ vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
+ vmov d0, d15 @ vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ veor q6, q6, q1 @ vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
+ veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1
+ veor q6, q6, q0 @ vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
+ vmov q0, q6 @ vmovdqa %xmm6, %xmm0
+ vmov d12, d2 @ vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
+ bx lr
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+@@
+@@ .aes_schedule_round
+@@
+@@ Runs one main round of the key schedule on q0, q7
+@@
+@@ Specifically, runs subbytes on the high dword of q0
+@@ then rotates it by one byte and xors into the low dword of
+@@ q7.
+@@
+@@ Adds rcon from low byte of q8, then rotates q8 for
+@@ next rcon.
+@@
+@@ Smears the dwords of q7 by xoring the low into the
+@@ second low, result into third, result into highest.
+@@
+@@ Returns results in q7 = q0.
+@@ Clobbers q1-q4, r11.
+@@
+.type _vpaes_schedule_round,%function
+.align 4
+_vpaes_schedule_round:
+ @ extract rcon from xmm8
+ vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4
+ vext.8 q1, q8, q4, #15 @ vpalignr $15, %xmm8, %xmm4, %xmm1
+ vext.8 q8, q8, q8, #15 @ vpalignr $15, %xmm8, %xmm8, %xmm8
+ veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7
+
+ @ rotate
+ vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0
+ vext.8 q0, q0, q0, #1 @ vpalignr $1, %xmm0, %xmm0, %xmm0
+
+ @ fall through...
+
+ @ low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+ @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12.
+ @ We pin other values in _vpaes_key_preheat, so load them now.
+ adr r11, .Lk_sb1
+ vld1.64 {q14,q15}, [r11]
+
+ @ smear xmm7
+ vext.8 q1, q4, q7, #12 @ vpslldq $4, %xmm7, %xmm1
+ veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7
+ vext.8 q4, q4, q7, #8 @ vpslldq $8, %xmm7, %xmm4
+
+ @ subbytes
+ vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7
+ vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ vtbl.8 d5, {q11}, d3
+ veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ vtbl.8 d7, {q10}, d1
+ veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ vtbl.8 d9, {q10}, d3
+ veor q7, q7, q12 @ vpxor .Lk_s63(%rip), %xmm7, %xmm7
+ vtbl.8 d6, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
+ vtbl.8 d7, {q10}, d7
+ veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ vtbl.8 d4, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
+ vtbl.8 d5, {q10}, d9
+ veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io
+ veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
+ vtbl.8 d8, {q15}, d6 @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
+ vtbl.8 d9, {q15}, d7
+ vtbl.8 d2, {q14}, d4 @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
+ vtbl.8 d3, {q14}, d5
+ veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
+
+ @ add in smeared stuff
+ veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0
+ veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7
+ bx lr
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+
+@@
+@@ .aes_schedule_transform
+@@
+@@ Linear-transform q0 according to tables at [r11]
+@@
+@@ Requires that q9 = 0x0F0F... as in preheat
+@@ Output in q0
+@@ Clobbers q1, q2, q14, q15
+@@
+.type _vpaes_schedule_transform,%function
+.align 4
+_vpaes_schedule_transform:
+ vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo
+ @ vmovdqa 16(%r11), %xmm1 # hi
+ vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
+ vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0
+ vtbl.8 d4, {q14}, d2 @ vpshufb %xmm1, %xmm2, %xmm2
+ vtbl.8 d5, {q14}, d3
+ vtbl.8 d0, {q15}, d0 @ vpshufb %xmm0, %xmm1, %xmm0
+ vtbl.8 d1, {q15}, d1
+ veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
+ bx lr
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+@@
+@@ .aes_schedule_mangle
+@@
+@@ Mangles q0 from (basis-transformed) standard version
+@@ to our version.
+@@
+@@ On encrypt,
+@@ xor with 0x63
+@@ multiply by circulant 0,1,1,1
+@@ apply shiftrows transform
+@@
+@@ On decrypt,
+@@ xor with 0x63
+@@ multiply by "inverse mixcolumns" circulant E,B,D,9
+@@ deskew
+@@ apply shiftrows transform
+@@
+@@
+@@ Writes out to [r2], and increments or decrements it
+@@ Keeps track of round number mod 4 in r8
+@@ Preserves q0
+@@ Clobbers q1-q5
+@@
+.type _vpaes_schedule_mangle,%function
+.align 4
+_vpaes_schedule_mangle:
+ tst r3, r3
+ vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later
+ adr r11, .Lk_mc_forward @ Must be aligned to 8 mod 16.
+ vld1.64 {q5}, [r11] @ vmovdqa .Lk_mc_forward(%rip),%xmm5
+ bne .Lschedule_mangle_dec
+
+ @ encrypting
+ @ Write to q2 so we do not overlap table and destination below.
+ veor q2, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm4
+ add r2, r2, #16 @ add $16, %rdx
+ vtbl.8 d8, {q2}, d10 @ vpshufb %xmm5, %xmm4, %xmm4
+ vtbl.8 d9, {q2}, d11
+ vtbl.8 d2, {q4}, d10 @ vpshufb %xmm5, %xmm4, %xmm1
+ vtbl.8 d3, {q4}, d11
+ vtbl.8 d6, {q1}, d10 @ vpshufb %xmm5, %xmm1, %xmm3
+ vtbl.8 d7, {q1}, d11
+ veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4
+ vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
+ veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3
+
+ b .Lschedule_mangle_both
+.align 4
+.Lschedule_mangle_dec:
+ @ inverse mix columns
+ adr r11, .Lk_dksd @ lea .Lk_dksd(%rip),%r11
+ vshr.u8 q1, q4, #4 @ vpsrlb $4, %xmm4, %xmm1 # 1 = hi
+ vand q4, q4, q9 @ vpand %xmm9, %xmm4, %xmm4 # 4 = lo
+
+ vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x00(%r11), %xmm2
+ @ vmovdqa 0x10(%r11), %xmm3
+ vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
+ vtbl.8 d5, {q14}, d9
+ vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
+ vtbl.8 d7, {q15}, d3
+ @ Load .Lk_dksb ahead of time.
+ vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x20(%r11), %xmm2
+ @ vmovdqa 0x30(%r11), %xmm3
+ @ Write to q13 so we do not overlap table and destination.
+ veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3
+ vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3
+ vtbl.8 d7, {q13}, d11
+
+ vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
+ vtbl.8 d5, {q14}, d9
+ veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2
+ vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
+ vtbl.8 d7, {q15}, d3
+ @ Load .Lk_dkse ahead of time.
+ vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x40(%r11), %xmm2
+ @ vmovdqa 0x50(%r11), %xmm3
+ @ Write to q13 so we do not overlap table and destination.
+ veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3
+ vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3
+ vtbl.8 d7, {q13}, d11
+
+ vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
+ vtbl.8 d5, {q14}, d9
+ veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2
+ vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
+ vtbl.8 d7, {q15}, d3
+	@ Load .Lk_dks9 ahead of time.
+ vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x60(%r11), %xmm2
+ @ vmovdqa 0x70(%r11), %xmm4
+ @ Write to q13 so we do not overlap table and destination.
+ veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3
+
+ vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
+ vtbl.8 d5, {q14}, d9
+ vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3
+ vtbl.8 d7, {q13}, d11
+ vtbl.8 d8, {q15}, d2 @ vpshufb %xmm1, %xmm4, %xmm4
+ vtbl.8 d9, {q15}, d3
+ vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
+ veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2
+ veor q3, q4, q2 @ vpxor %xmm2, %xmm4, %xmm3
+
+ sub r2, r2, #16 @ add $-16, %rdx
+
+.Lschedule_mangle_both:
+ @ Write to q2 so table and destination do not overlap.
+ vtbl.8 d4, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
+ vtbl.8 d5, {q3}, d3
+ add r8, r8, #64-16 @ add $-16, %r8
+ and r8, r8, #~(1<<6) @ and $0x30, %r8
+ vst1.64 {q2}, [r2] @ vmovdqu %xmm3, (%rdx)
+ bx lr
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+.globl vpaes_set_encrypt_key
+.hidden vpaes_set_encrypt_key
+.type vpaes_set_encrypt_key,%function
+.align 4
+vpaes_set_encrypt_key:
+ stmdb sp!, {r7,r8,r9,r10,r11, lr}
+ vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+ lsr r9, r1, #5 @ shr $5,%eax
+ add r9, r9, #5 @ $5,%eax
+ str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+
+ mov r3, #0 @ mov $0,%ecx
+ mov r8, #0x30 @ mov $0x30,%r8d
+ bl _vpaes_schedule_core
+ eor r0, r0, r0
+
+ vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+ ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
+.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl vpaes_set_decrypt_key
+.hidden vpaes_set_decrypt_key
+.type vpaes_set_decrypt_key,%function
+.align 4
+vpaes_set_decrypt_key:
+ stmdb sp!, {r7,r8,r9,r10,r11, lr}
+ vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+ lsr r9, r1, #5 @ shr $5,%eax
+ add r9, r9, #5 @ $5,%eax
+ str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+ lsl r9, r9, #4 @ shl $4,%eax
+ add r2, r2, #16 @ lea 16(%rdx,%rax),%rdx
+ add r2, r2, r9
+
+ mov r3, #1 @ mov $1,%ecx
+ lsr r8, r1, #1 @ shr $1,%r8d
+ and r8, r8, #32 @ and $32,%r8d
+ eor r8, r8, #32 @ xor $32,%r8d # nbits==192?0:32
+ bl _vpaes_schedule_core
+
+ vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+ ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
+.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+
+@ Additional constants for converting to bsaes.
+.type _vpaes_convert_consts,%object
+.align 4
+_vpaes_convert_consts:
+@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear
+@ transform in the AES S-box. 0x63 is incorporated into the low half of the
+@ table. This was computed with the following script:
+@
+@ def u64s_to_u128(x, y):
+@ return x | (y << 64)
+@ def u128_to_u64s(w):
+@ return w & ((1<<64)-1), w >> 64
+@ def get_byte(w, i):
+@ return (w >> (i*8)) & 0xff
+@ def apply_table(table, b):
+@ lo = b & 0xf
+@ hi = b >> 4
+@ return get_byte(table[0], lo) ^ get_byte(table[1], hi)
+@ def opt(b):
+@ table = [
+@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808),
+@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0),
+@ ]
+@ return apply_table(table, b)
+@ def rot_byte(b, n):
+@ return 0xff & ((b << n) | (b >> (8-n)))
+@ def skew(x):
+@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^
+@ rot_byte(x, 4))
+@ table = [0, 0]
+@ for i in range(16):
+@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8)
+@ table[1] |= skew(opt(i<<4)) << (i*8)
+@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[0]))
+@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[1]))
+.Lk_opt_then_skew:
+.quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b
+.quad 0x1f30062936192f00, 0xb49bad829db284ab
+
+@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation
+@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344
+@ becomes 0x22334411 and then 0x11443322.
+.Lk_decrypt_transform:
+.quad 0x0704050603000102, 0x0f0c0d0e0b08090a
+.size _vpaes_convert_consts,.-_vpaes_convert_consts
+
+@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
+.globl vpaes_encrypt_key_to_bsaes
+.hidden vpaes_encrypt_key_to_bsaes
+.type vpaes_encrypt_key_to_bsaes,%function
+.align 4
+vpaes_encrypt_key_to_bsaes:
+ stmdb sp!, {r11, lr}
+
+ @ See _vpaes_schedule_core for the key schedule logic. In particular,
+ @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper),
+ @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last
+ @ contain the transformations not in the bsaes representation. This
+ @ function inverts those transforms.
+ @
+ @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
+ @ representation, which does not match the other aes_nohw_*
+ @ implementations. The ARM aes_nohw_* stores each 32-bit word
+ @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
+ @ cost of extra REV and VREV32 operations in little-endian ARM.
+
+ vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform
+ adr r2, .Lk_mc_forward @ Must be aligned to 8 mod 16.
+ add r3, r2, 0x90 @ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression)
+
+ vld1.64 {q12}, [r2]
+ vmov.i8 q10, #0x5b @ .Lk_s63 from vpaes-x86_64
+ adr r11, .Lk_opt @ Must be aligned to 8 mod 16.
+	vmov.i8	q11, #0x63	@ .Lk_s63 without .Lk_ipt applied
+
+ @ vpaes stores one fewer round count than bsaes, but the number of keys
+ @ is the same.
+ ldr r2, [r1,#240]
+ add r2, r2, #1
+ str r2, [r0,#240]
+
+ @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt).
+ @ Invert this with .Lk_opt.
+ vld1.64 {q0}, [r1]!
+ bl _vpaes_schedule_transform
+ vrev32.8 q0, q0
+ vst1.64 {q0}, [r0]!
+
+ @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied,
+ @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63,
+ @ multiplies by the circulant 0,1,1,1, then applies ShiftRows.
+.Loop_enc_key_to_bsaes:
+ vld1.64 {q0}, [r1]!
+
+ @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle
+ @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30.
+ @ We use r3 rather than r8 to avoid a callee-saved register.
+ vld1.64 {q1}, [r3]
+ vtbl.8 d4, {q0}, d2
+ vtbl.8 d5, {q0}, d3
+ add r3, r3, #16
+ and r3, r3, #~(1<<6)
+ vmov q0, q2
+
+ @ Handle the last key differently.
+ subs r2, r2, #1
+ beq .Loop_enc_key_to_bsaes_last
+
+ @ Multiply by the circulant. This is its own inverse.
+ vtbl.8 d2, {q0}, d24
+ vtbl.8 d3, {q0}, d25
+ vmov q0, q1
+ vtbl.8 d4, {q1}, d24
+ vtbl.8 d5, {q1}, d25
+ veor q0, q0, q2
+ vtbl.8 d2, {q2}, d24
+ vtbl.8 d3, {q2}, d25
+ veor q0, q0, q1
+
+ @ XOR and finish.
+ veor q0, q0, q10
+ bl _vpaes_schedule_transform
+ vrev32.8 q0, q0
+ vst1.64 {q0}, [r0]!
+ b .Loop_enc_key_to_bsaes
+
+.Loop_enc_key_to_bsaes_last:
+ @ The final key does not have a basis transform (note
+ @ .Lschedule_mangle_last inverts the original transform). It only XORs
+ @ 0x63 and applies ShiftRows. The latter was already inverted in the
+ @ loop. Note that, because we act on the original representation, we use
+ @ q11, not q10.
+ veor q0, q0, q11
+ vrev32.8 q0, q0
+ vst1.64 {q0}, [r0]
+
+ @ Wipe registers which contained key material.
+ veor q0, q0, q0
+ veor q1, q1, q1
+ veor q2, q2, q2
+
+ ldmia sp!, {r11, pc} @ return
+.size vpaes_encrypt_key_to_bsaes,.-vpaes_encrypt_key_to_bsaes
+
+@ void vpaes_decrypt_key_to_bsaes(AES_KEY *vpaes, const AES_KEY *bsaes);
+.globl vpaes_decrypt_key_to_bsaes
+.hidden vpaes_decrypt_key_to_bsaes
+.type vpaes_decrypt_key_to_bsaes,%function
+.align 4
+vpaes_decrypt_key_to_bsaes:
+ stmdb sp!, {r11, lr}
+
+ @ See _vpaes_schedule_core for the key schedule logic. Note vpaes
+ @ computes the decryption key schedule in reverse. Additionally,
+ @ aes-x86_64.pl shares some transformations, so we must only partially
+ @ invert vpaes's transformations. In general, vpaes computes in a
+ @ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of
+ @ MixColumns, ShiftRows, and the affine part of the AES S-box (which is
+ @ split into a linear skew and XOR of 0x63). We undo all but MixColumns.
+ @
+ @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
+ @ representation, which does not match the other aes_nohw_*
+ @ implementations. The ARM aes_nohw_* stores each 32-bit word
+ @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
+ @ cost of extra REV and VREV32 operations in little-endian ARM.
+
+ adr r2, .Lk_decrypt_transform
+ adr r3, .Lk_sr+0x30
+ adr r11, .Lk_opt_then_skew @ Input to _vpaes_schedule_transform.
+ vld1.64 {q12}, [r2] @ Reuse q12 from encryption.
+ vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform
+
+ @ vpaes stores one fewer round count than bsaes, but the number of keys
+ @ is the same.
+ ldr r2, [r1,#240]
+ add r2, r2, #1
+ str r2, [r0,#240]
+
+ @ Undo the basis change and reapply the S-box affine transform. See
+ @ .Lschedule_mangle_last.
+ vld1.64 {q0}, [r1]!
+ bl _vpaes_schedule_transform
+ vrev32.8 q0, q0
+ vst1.64 {q0}, [r0]!
+
+ @ See _vpaes_schedule_mangle for the transform on the middle keys. Note
+ @ it simultaneously inverts MixColumns and the S-box affine transform.
+ @ See .Lk_dksd through .Lk_dks9.
+.Loop_dec_key_to_bsaes:
+ vld1.64 {q0}, [r1]!
+
+	@ Invert the ShiftRows step (see .Lschedule_mangle_both). Because the
+	@ decryption schedule is read in reverse order, that reversal cancels the
+	@ direction flip needed to invert, so we cycle r3 the same way as
+	@ _vpaes_schedule_mangle. We use r3 rather than r8 to avoid a
+	@ callee-saved register.
+ vld1.64 {q1}, [r3]
+ vtbl.8 d4, {q0}, d2
+ vtbl.8 d5, {q0}, d3
+ add r3, r3, #64-16
+ and r3, r3, #~(1<<6)
+ vmov q0, q2
+
+ @ Handle the last key differently.
+ subs r2, r2, #1
+ beq .Loop_dec_key_to_bsaes_last
+
+ @ Undo the basis change and reapply the S-box affine transform.
+ bl _vpaes_schedule_transform
+
+ @ Rotate each word by 8 bytes (cycle the rows) and then byte-swap. We
+ @ combine the two operations in .Lk_decrypt_transform.
+ @
+ @ TODO(davidben): Where does the rotation come from?
+ vtbl.8 d2, {q0}, d24
+ vtbl.8 d3, {q0}, d25
+
+ vst1.64 {q1}, [r0]!
+ b .Loop_dec_key_to_bsaes
+
+.Loop_dec_key_to_bsaes_last:
+ @ The final key only inverts ShiftRows (already done in the loop). See
+ @ .Lschedule_am_decrypting. Its basis is not transformed.
+ vrev32.8 q0, q0
+ vst1.64 {q0}, [r0]!
+
+ @ Wipe registers which contained key material.
+ veor q0, q0, q0
+ veor q1, q1, q1
+ veor q2, q2, q2
+
+ ldmia sp!, {r11, pc} @ return
+.size vpaes_decrypt_key_to_bsaes,.-vpaes_decrypt_key_to_bsaes
+.globl vpaes_ctr32_encrypt_blocks
+.hidden vpaes_ctr32_encrypt_blocks
+.type vpaes_ctr32_encrypt_blocks,%function
+.align 4
+vpaes_ctr32_encrypt_blocks:
+ mov ip, sp
+ stmdb sp!, {r7,r8,r9,r10,r11, lr}
+ @ This function uses q4-q7 (d8-d15), which are callee-saved.
+ vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+ cmp r2, #0
+ @ r8 is passed on the stack.
+ ldr r8, [ip]
+ beq .Lctr32_done
+
+ @ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3.
+ mov r9, r3
+ mov r3, r2
+ mov r2, r9
+
+ @ Load the IV and counter portion.
+ ldr r7, [r8, #12]
+ vld1.8 {q7}, [r8]
+
+ bl _vpaes_preheat
+ rev r7, r7 @ The counter is big-endian.
+
+.Lctr32_loop:
+ vmov q0, q7
+	vld1.8	{q6}, [r0]!		@ Load input ahead of time
+ bl _vpaes_encrypt_core
+ veor q0, q0, q6 @ XOR input and result
+ vst1.8 {q0}, [r1]!
+ subs r3, r3, #1
+ @ Update the counter.
+ add r7, r7, #1
+ rev r9, r7
+ vmov.32 d15[1], r9
+ bne .Lctr32_loop
+
+.Lctr32_done:
+ vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+ ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
+.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/bcm/vpaes-armv8-apple.S b/gen/bcm/vpaes-armv8-apple.S
new file mode 100644
index 0000000..a108a96
--- /dev/null
+++ b/gen/bcm/vpaes-armv8-apple.S
@@ -0,0 +1,1224 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+
+.section __TEXT,__const
+
+
+.align 7 // totally strategic alignment
+_vpaes_consts:
+Lk_mc_forward: // mc_forward
+.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad 0x080B0A0904070605, 0x000302010C0F0E0D
+.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad 0x000302010C0F0E0D, 0x080B0A0904070605
+Lk_mc_backward: // mc_backward
+.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad 0x020100030E0D0C0F, 0x0A09080B06050407
+.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad 0x0A09080B06050407, 0x020100030E0D0C0F
+Lk_sr: // sr
+.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad 0x030E09040F0A0500, 0x0B06010C07020D08
+.quad 0x0F060D040B020900, 0x070E050C030A0108
+.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+//
+// "Hot" constants
+//
+Lk_inv: // inv, inva
+.quad 0x0E05060F0D080180, 0x040703090A0B0C02
+.quad 0x01040A060F0B0780, 0x030D0E0C02050809
+Lk_ipt: // input transform (lo, hi)
+.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+Lk_sbo: // sbou, sbot
+.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+Lk_sb1: // sb1u, sb1t
+.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+Lk_sb2: // sb2u, sb2t
+.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+//
+// Decryption stuff
+//
+Lk_dipt: // decryption input transform
+.quad 0x0F505B040B545F00, 0x154A411E114E451A
+.quad 0x86E383E660056500, 0x12771772F491F194
+Lk_dsbo: // decryption sbox final output
+.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+Lk_dsb9: // decryption sbox output *9*u, *9*t
+.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+Lk_dsbd: // decryption sbox output *D*u, *D*t
+.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+Lk_dsbb: // decryption sbox output *B*u, *B*t
+.quad 0xD022649296B44200, 0x602646F6B0F2D404
+.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+Lk_dsbe: // decryption sbox output *E*u, *E*t
+.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+
+//
+// Key schedule constants
+//
+Lk_dksd: // decryption key schedule: invskew x*D
+.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+Lk_dksb: // decryption key schedule: invskew x*B
+.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+Lk_dkse: // decryption key schedule: invskew x*E + 0x63
+.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+Lk_dks9: // decryption key schedule: invskew x*9
+.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+Lk_rcon: // rcon
+.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+Lk_opt: // output transform
+.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+Lk_deskew: // deskew tables: inverts the sbox's "skew"
+.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align 2
+
+.align 6
+
+.text
+##
+## _aes_preheat
+##
+## Fills register %r10 -> .aes_consts (so you can -fPIC)
+## and %xmm9-%xmm15 as specified below.
+##
+
+.align 4
+_vpaes_encrypt_preheat:
+ adrp x10, Lk_inv@PAGE
+ add x10, x10, Lk_inv@PAGEOFF
+ movi v17.16b, #0x0f
+ ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv
+ ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo
+ ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2
+ ret
+
+
+##
+## _aes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm9-%xmm15 as in _vpaes_preheat
+## (%rdx) = scheduled keys
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
+## Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
+
+.align 4
+_vpaes_encrypt_core:
+ mov x9, x2
+ ldr w8, [x2,#240] // pull rounds
+ adrp x11, Lk_mc_forward@PAGE+16
+ add x11, x11, Lk_mc_forward@PAGEOFF+16
+ // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
+ and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
+ // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
+ tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
+ eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ b Lenc_entry
+
+.align 4
+Lenc_loop:
+ // middle of middle round
+ add x10, x11, #0x40
+ tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
+ ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
+ tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
+ ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[]
+ tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
+ eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
+ tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
+ tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
+ eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
+ and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
+ sub w8, w8, #1 // nr--
+
+Lenc_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
+ cbnz w8, Lenc_loop
+
+ // middle of last round
+ add x10, x11, #0x80
+ // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
+ // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[]
+ tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
+ ret
+
+
+.globl _vpaes_encrypt
+.private_extern _vpaes_encrypt
+
+.align 4
+_vpaes_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v7.16b}, [x0]
+ bl _vpaes_encrypt_preheat
+ bl _vpaes_encrypt_core
+ st1 {v0.16b}, [x1]
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+
+.align 4
+_vpaes_encrypt_2x:
+ mov x9, x2
+ ldr w8, [x2,#240] // pull rounds
+ adrp x11, Lk_mc_forward@PAGE+16
+ add x11, x11, Lk_mc_forward@PAGEOFF+16
+ // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
+ and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ and v9.16b, v15.16b, v17.16b
+ ushr v8.16b, v15.16b, #4
+ tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
+ tbl v9.16b, {v20.16b}, v9.16b
+ // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
+ tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
+ tbl v10.16b, {v21.16b}, v8.16b
+ eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
+ eor v8.16b, v9.16b, v16.16b
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ eor v8.16b, v8.16b, v10.16b
+ b Lenc_2x_entry
+
+.align 4
+Lenc_2x_loop:
+ // middle of middle round
+ add x10, x11, #0x40
+ tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
+ tbl v12.16b, {v25.16b}, v10.16b
+ ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
+ tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
+ tbl v8.16b, {v24.16b}, v11.16b
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v12.16b, v12.16b, v16.16b
+ tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
+ tbl v13.16b, {v27.16b}, v10.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ eor v8.16b, v8.16b, v12.16b
+ tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
+ tbl v10.16b, {v26.16b}, v11.16b
+ ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[]
+ tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
+ tbl v11.16b, {v8.16b}, v1.16b
+ eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
+ eor v10.16b, v10.16b, v13.16b
+ tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
+ tbl v8.16b, {v8.16b}, v4.16b
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
+ eor v11.16b, v11.16b, v10.16b
+ tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
+ tbl v12.16b, {v11.16b},v1.16b
+ eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
+ eor v8.16b, v8.16b, v11.16b
+ and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
+ eor v8.16b, v8.16b, v12.16b
+ sub w8, w8, #1 // nr--
+
+Lenc_2x_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ and v9.16b, v8.16b, v17.16b
+ ushr v8.16b, v8.16b, #4
+ tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
+ tbl v13.16b, {v19.16b},v9.16b
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ eor v9.16b, v9.16b, v8.16b
+ tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v11.16b, {v18.16b},v8.16b
+ tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ tbl v12.16b, {v18.16b},v9.16b
+ eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v11.16b, v11.16b, v13.16b
+ eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ eor v12.16b, v12.16b, v13.16b
+ tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v10.16b, {v18.16b},v11.16b
+ tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ tbl v11.16b, {v18.16b},v12.16b
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v10.16b, v10.16b, v9.16b
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ eor v11.16b, v11.16b, v8.16b
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
+ cbnz w8, Lenc_2x_loop
+
+ // middle of last round
+ add x10, x11, #0x80
+ // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
+ // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ tbl v12.16b, {v22.16b}, v10.16b
+ ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[]
+ tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
+ tbl v8.16b, {v23.16b}, v11.16b
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v12.16b, v12.16b, v16.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ eor v8.16b, v8.16b, v12.16b
+ tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
+ tbl v1.16b, {v8.16b},v1.16b
+ ret
+
+
+
+.align 4
+_vpaes_decrypt_preheat:
+ adrp x10, Lk_inv@PAGE
+ add x10, x10, Lk_inv@PAGEOFF
+ movi v17.16b, #0x0f
+ adrp x11, Lk_dipt@PAGE
+ add x11, x11, Lk_dipt@PAGEOFF
+ ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv
+ ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // Lk_dipt, Lk_dsbo
+ ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // Lk_dsb9, Lk_dsbd
+ ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // Lk_dsbb, Lk_dsbe
+ ret
+
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+
+.align 4
+_vpaes_decrypt_core:
+ mov x9, x2
+ ldr w8, [x2,#240] // pull rounds
+
+ // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
+ lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
+ eor x11, x11, #0x30 // xor $0x30, %r11
+ adrp x10, Lk_sr@PAGE
+ add x10, x10, Lk_sr@PAGEOFF
+ and x11, x11, #0x30 // and $0x30, %r11
+ add x11, x11, x10
+ adrp x10, Lk_mc_forward@PAGE+48
+ add x10, x10, Lk_mc_forward@PAGEOFF+48
+
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
+ and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
+ ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5
+ // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
+ tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
+ eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ b Ldec_entry
+
+.align 4
+Ldec_loop:
+//
+// Inverse mix columns
+//
+ // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
+ tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
+ tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
+ eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
+ // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
+
+ tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
+ tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
+
+ tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
+ tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
+
+ tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
+ tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ sub w8, w8, #1 // sub $1,%rax # nr--
+
+Ldec_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
+ cbnz w8, Ldec_loop
+
+ // middle of last round
+ // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
+ tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
+ ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160
+ tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
+ tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
+ ret
+
+
+.globl _vpaes_decrypt
+.private_extern _vpaes_decrypt
+
+.align 4
+_vpaes_decrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v7.16b}, [x0]
+ bl _vpaes_decrypt_preheat
+ bl _vpaes_decrypt_core
+ st1 {v0.16b}, [x1]
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+// v14-v15 input, v0-v1 output
+
+.align 4
+_vpaes_decrypt_2x:
+ mov x9, x2
+ ldr w8, [x2,#240] // pull rounds
+
+ // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
+ lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
+ eor x11, x11, #0x30 // xor $0x30, %r11
+ adrp x10, Lk_sr@PAGE
+ add x10, x10, Lk_sr@PAGEOFF
+ and x11, x11, #0x30 // and $0x30, %r11
+ add x11, x11, x10
+ adrp x10, Lk_mc_forward@PAGE+48
+ add x10, x10, Lk_mc_forward@PAGEOFF+48
+
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
+ and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ and v9.16b, v15.16b, v17.16b
+ ushr v8.16b, v15.16b, #4
+ tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
+ tbl v10.16b, {v20.16b},v9.16b
+ ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5
+ // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
+ tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
+ tbl v8.16b, {v21.16b},v8.16b
+ eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
+ eor v10.16b, v10.16b, v16.16b
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ eor v8.16b, v8.16b, v10.16b
+ b Ldec_2x_entry
+
+.align 4
+Ldec_2x_loop:
+//
+// Inverse mix columns
+//
+ // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
+ tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
+ tbl v12.16b, {v24.16b}, v10.16b
+ tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
+ tbl v9.16b, {v25.16b}, v11.16b
+ eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
+ eor v8.16b, v12.16b, v16.16b
+ // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
+
+ tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
+ tbl v12.16b, {v26.16b}, v10.16b
+ tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v8.16b, {v8.16b},v5.16b
+ tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
+ tbl v9.16b, {v27.16b}, v11.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ eor v8.16b, v8.16b, v12.16b
+ // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b
+ // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
+
+ tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
+ tbl v12.16b, {v28.16b}, v10.16b
+ tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v8.16b, {v8.16b},v5.16b
+ tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
+ tbl v9.16b, {v29.16b}, v11.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ eor v8.16b, v8.16b, v12.16b
+ // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b
+ // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
+
+ tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
+ tbl v12.16b, {v30.16b}, v10.16b
+ tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v8.16b, {v8.16b},v5.16b
+ tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
+ tbl v9.16b, {v31.16b}, v11.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ eor v8.16b, v8.16b, v12.16b
+ ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b
+ sub w8, w8, #1 // sub $1,%rax # nr--
+
+Ldec_2x_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ and v9.16b, v8.16b, v17.16b
+ ushr v8.16b, v8.16b, #4
+ tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ tbl v10.16b, {v19.16b},v9.16b
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ eor v9.16b, v9.16b, v8.16b
+ tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v11.16b, {v18.16b},v8.16b
+ tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ tbl v12.16b, {v18.16b},v9.16b
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v11.16b, v11.16b, v10.16b
+ eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ eor v12.16b, v12.16b, v10.16b
+ tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v10.16b, {v18.16b},v11.16b
+ tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ tbl v11.16b, {v18.16b},v12.16b
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v10.16b, v10.16b, v9.16b
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ eor v11.16b, v11.16b, v8.16b
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
+ cbnz w8, Ldec_2x_loop
+
+ // middle of last round
+ // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
+ tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ tbl v12.16b, {v22.16b}, v10.16b
+ // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
+ tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
+ tbl v9.16b, {v23.16b}, v11.16b
+ ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v12.16b, v12.16b, v16.16b
+ eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
+ eor v8.16b, v9.16b, v12.16b
+ tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
+ tbl v1.16b, {v8.16b},v2.16b
+ ret
+
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+
+.align 4
+_vpaes_key_preheat:
+ adrp x10, Lk_inv@PAGE
+ add x10, x10, Lk_inv@PAGEOFF
+ movi v16.16b, #0x5b // Lk_s63
+ adrp x11, Lk_sb1@PAGE
+ add x11, x11, Lk_sb1@PAGEOFF
+ movi v17.16b, #0x0f // Lk_s0F
+ ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt
+ adrp x10, Lk_dksd@PAGE
+ add x10, x10, Lk_dksd@PAGEOFF
+ ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1
+ adrp x11, Lk_mc_forward@PAGE
+ add x11, x11, Lk_mc_forward@PAGEOFF
+ ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb
+ ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9
+ ld1 {v8.2d}, [x10] // Lk_rcon
+ ld1 {v9.2d}, [x11] // Lk_mc_forward[0]
+ ret
+
+
+
+.align 4
+_vpaes_schedule_core:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp,#-16]!
+ add x29,sp,#0
+
+ bl _vpaes_key_preheat // load the tables
+
+ ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
+
+ // input transform
+ mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
+ bl _vpaes_schedule_transform
+ mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
+
+ adrp x10, Lk_sr@PAGE // lea Lk_sr(%rip),%r10
+ add x10, x10, Lk_sr@PAGEOFF
+
+ add x8, x8, x10
+ cbnz w3, Lschedule_am_decrypting
+
+ // encrypting, output zeroth round key after transform
+ st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx)
+ b Lschedule_go
+
+Lschedule_am_decrypting:
+ // decrypting, output zeroth round key after shiftrows
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
+ tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
+ eor x8, x8, #0x30 // xor $0x30, %r8
+
+Lschedule_go:
+ cmp w1, #192 // cmp $192, %esi
+ b.hi Lschedule_256
+ b.eq Lschedule_192
+	// 128: fall through
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+Lschedule_128:
+ mov x0, #10 // mov $10, %esi
+
+Loop_schedule_128:
+ sub x0, x0, #1 // dec %esi
+ bl _vpaes_schedule_round
+ cbz x0, Lschedule_mangle_last
+ bl _vpaes_schedule_mangle // write output
+ b Loop_schedule_128
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.align 4
+Lschedule_192:
+ sub x0, x0, #8
+ ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ bl _vpaes_schedule_transform // input transform
+ mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
+ eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
+ ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
+ mov x0, #4 // mov $4, %esi
+
+Loop_schedule_192:
+ sub x0, x0, #1 // dec %esi
+ bl _vpaes_schedule_round
+ ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
+ bl _vpaes_schedule_mangle // save key n
+ bl _vpaes_schedule_192_smear
+ bl _vpaes_schedule_mangle // save key n+1
+ bl _vpaes_schedule_round
+ cbz x0, Lschedule_mangle_last
+ bl _vpaes_schedule_mangle // save key n+2
+ bl _vpaes_schedule_192_smear
+ b Loop_schedule_192
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
+.align 4
+Lschedule_256:
+ ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
+ bl _vpaes_schedule_transform // input transform
+ mov x0, #7 // mov $7, %esi
+
+Loop_schedule_256:
+ sub x0, x0, #1 // dec %esi
+ bl _vpaes_schedule_mangle // output low result
+ mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
+
+ // high round
+ bl _vpaes_schedule_round
+ cbz x0, Lschedule_mangle_last
+ bl _vpaes_schedule_mangle
+
+ // low round. swap xmm7 and xmm6
+ dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
+ movi v4.16b, #0
+ mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
+ mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
+ bl _vpaes_schedule_low_round
+ mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
+
+ b Loop_schedule_256
+
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+.align 4
+Lschedule_mangle_last:
+ // schedule last round key from xmm0
+ adrp x11, Lk_deskew@PAGE // lea Lk_deskew(%rip),%r11 # prepare to deskew
+ add x11, x11, Lk_deskew@PAGEOFF
+
+ cbnz w3, Lschedule_mangle_last_dec
+
+ // encrypting
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
+ adrp x11, Lk_opt@PAGE // lea Lk_opt(%rip), %r11 # prepare to output transform
+ add x11, x11, Lk_opt@PAGEOFF
+ add x2, x2, #32 // add $32, %rdx
+ tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
+
+Lschedule_mangle_last_dec:
+ ld1 {v20.2d,v21.2d}, [x11] // reload constants
+ sub x2, x2, #16 // add $-16, %rdx
+ eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0
+ bl _vpaes_schedule_transform // output transform
+ st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key
+
+ // cleanup
+ eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
+ eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
+ eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
+ eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
+ eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
+ eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
+ eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
+ eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
+ ldp x29, x30, [sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+## %xmm13: 0
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+
+.align 4
+_vpaes_schedule_192_smear:
+ movi v1.16b, #0
+ dup v0.4s, v7.s[3]
+ ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
+ ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
+ eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
+ eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
+ mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
+ ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
+ ret
+
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm4, %r11.
+##
+
+.align 4
+_vpaes_schedule_round:
+ // extract rcon from xmm8
+ movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
+ ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1
+ ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8
+ eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
+
+ // rotate
+ dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
+ ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0
+
+ // fall through...
+
+ // low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+ // smear xmm7
+ ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1
+ eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
+ ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4
+
+ // subbytes
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
+ tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7
+ tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
+ eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
+ eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
+ eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
+ tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
+ tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
+ eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
+
+ // add in smeared stuff
+ eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
+ eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
+ ret
+
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%r11)
+##
+## Requires that %xmm9 = 0x0F0F... as in preheat
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
+
+.align 4
+_vpaes_schedule_transform:
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ // vmovdqa (%r11), %xmm2 # lo
+ tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
+ // vmovdqa 16(%r11), %xmm1 # hi
+ tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ ret
+
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by "inverse mixcolumns" circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%rdx), and increments or decrements it
+## Keeps track of round number mod 4 in %r8
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
+
+.align 4
+_vpaes_schedule_mangle:
+ mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
+ // vmovdqa .Lk_mc_forward(%rip),%xmm5
+ cbnz w3, Lschedule_mangle_dec
+
+ // encrypting
+ eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4
+ add x2, x2, #16 // add $16, %rdx
+ tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
+ tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
+ tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
+ eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
+ eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
+
+ b Lschedule_mangle_both
+.align 4
+Lschedule_mangle_dec:
+ // inverse mix columns
+ // lea .Lk_dksd(%rip),%r11
+ ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
+ and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo
+
+ // vmovdqa 0x00(%r11), %xmm2
+ tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ // vmovdqa 0x10(%r11), %xmm3
+ tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
+ tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
+
+ // vmovdqa 0x20(%r11), %xmm2
+ tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
+ // vmovdqa 0x30(%r11), %xmm3
+ tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
+ tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
+
+ // vmovdqa 0x40(%r11), %xmm2
+ tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
+ // vmovdqa 0x50(%r11), %xmm3
+ tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
+
+ // vmovdqa 0x60(%r11), %xmm2
+ tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
+ // vmovdqa 0x70(%r11), %xmm4
+ tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
+ eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
+ eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3
+
+ sub x2, x2, #16 // add $-16, %rdx
+
+Lschedule_mangle_both:
+ tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ add x8, x8, #48 // add $-16, %r8
+ and x8, x8, #~(1<<6) // and $0x30, %r8
+ st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
+ ret
+
+
+.globl _vpaes_set_encrypt_key
+.private_extern _vpaes_set_encrypt_key
+
+.align 4
+_vpaes_set_encrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+
+ lsr w9, w1, #5 // shr $5,%eax
+ add w9, w9, #5 // $5,%eax
+ str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+
+ mov w3, #0 // mov $0,%ecx
+ mov x8, #0x30 // mov $0x30,%r8d
+ bl _vpaes_schedule_core
+ eor x0, x0, x0
+
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.globl _vpaes_set_decrypt_key
+.private_extern _vpaes_set_decrypt_key
+
+.align 4
+_vpaes_set_decrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+
+ lsr w9, w1, #5 // shr $5,%eax
+ add w9, w9, #5 // $5,%eax
+ str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+ lsl w9, w9, #4 // shl $4,%eax
+ add x2, x2, #16 // lea 16(%rdx,%rax),%rdx
+ add x2, x2, x9
+
+ mov w3, #1 // mov $1,%ecx
+ lsr w8, w1, #1 // shr $1,%r8d
+ and x8, x8, #32 // and $32,%r8d
+ eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32
+ bl _vpaes_schedule_core
+
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.globl _vpaes_cbc_encrypt
+.private_extern _vpaes_cbc_encrypt
+
+.align 4
+_vpaes_cbc_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ cbz x2, Lcbc_abort
+ cmp w5, #0 // check direction
+ b.eq vpaes_cbc_decrypt
+
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ mov x17, x2 // reassign
+ mov x2, x3 // reassign
+
+ ld1 {v0.16b}, [x4] // load ivec
+ bl _vpaes_encrypt_preheat
+ b Lcbc_enc_loop
+
+.align 4
+Lcbc_enc_loop:
+ ld1 {v7.16b}, [x0],#16 // load input
+ eor v7.16b, v7.16b, v0.16b // xor with ivec
+ bl _vpaes_encrypt_core
+ st1 {v0.16b}, [x1],#16 // save output
+ subs x17, x17, #16
+ b.hi Lcbc_enc_loop
+
+ st1 {v0.16b}, [x4] // write ivec
+
+ ldp x29,x30,[sp],#16
+Lcbc_abort:
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+
+.align 4
+vpaes_cbc_decrypt:
+ // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
+ // only from vpaes_cbc_encrypt which has already signed the return address.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
+
+ mov x17, x2 // reassign
+ mov x2, x3 // reassign
+ ld1 {v6.16b}, [x4] // load ivec
+ bl _vpaes_decrypt_preheat
+ tst x17, #16
+ b.eq Lcbc_dec_loop2x
+
+ ld1 {v7.16b}, [x0], #16 // load input
+ bl _vpaes_decrypt_core
+ eor v0.16b, v0.16b, v6.16b // xor with ivec
+ orr v6.16b, v7.16b, v7.16b // next ivec value
+ st1 {v0.16b}, [x1], #16
+ subs x17, x17, #16
+ b.ls Lcbc_dec_done
+
+.align 4
+Lcbc_dec_loop2x:
+ ld1 {v14.16b,v15.16b}, [x0], #32
+ bl _vpaes_decrypt_2x
+ eor v0.16b, v0.16b, v6.16b // xor with ivec
+ eor v1.16b, v1.16b, v14.16b
+ orr v6.16b, v15.16b, v15.16b
+ st1 {v0.16b,v1.16b}, [x1], #32
+ subs x17, x17, #32
+ b.hi Lcbc_dec_loop2x
+
+Lcbc_dec_done:
+ st1 {v6.16b}, [x4]
+
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.globl _vpaes_ctr32_encrypt_blocks
+.private_extern _vpaes_ctr32_encrypt_blocks
+
+.align 4
+_vpaes_ctr32_encrypt_blocks:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
+
+ cbz x2, Lctr32_done
+
+ // Note, unlike the other functions, x2 here is measured in blocks,
+ // not bytes.
+ mov x17, x2
+ mov x2, x3
+
+ // Load the IV and counter portion.
+ ldr w6, [x4, #12]
+ ld1 {v7.16b}, [x4]
+
+ bl _vpaes_encrypt_preheat
+ tst x17, #1
+ rev w6, w6 // The counter is big-endian.
+ b.eq Lctr32_prep_loop
+
+ // Handle one block so the remaining block count is even for
+ // _vpaes_encrypt_2x.
+ ld1 {v6.16b}, [x0], #16 // Load input ahead of time
+ bl _vpaes_encrypt_core
+ eor v0.16b, v0.16b, v6.16b // XOR input and result
+ st1 {v0.16b}, [x1], #16
+ subs x17, x17, #1
+ // Update the counter.
+ add w6, w6, #1
+ rev w7, w6
+ mov v7.s[3], w7
+ b.ls Lctr32_done
+
+Lctr32_prep_loop:
+ // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
+ // uses v14 and v15.
+ mov v15.16b, v7.16b
+ mov v14.16b, v7.16b
+ add w6, w6, #1
+ rev w7, w6
+ mov v15.s[3], w7
+
+Lctr32_loop:
+ ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time
+ bl _vpaes_encrypt_2x
+ eor v0.16b, v0.16b, v6.16b // XOR input and result
+ eor v1.16b, v1.16b, v7.16b // XOR input and result (#2)
+ st1 {v0.16b,v1.16b}, [x1], #32
+ subs x17, x17, #2
+ // Update the counter.
+ add w7, w6, #1
+ add w6, w6, #2
+ rev w7, w7
+ mov v14.s[3], w7
+ rev w7, w6
+ mov v15.s[3], w7
+ b.hi Lctr32_loop
+
+Lctr32_done:
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/bcm/vpaes-armv8-linux.S b/gen/bcm/vpaes-armv8-linux.S
new file mode 100644
index 0000000..c343f00
--- /dev/null
+++ b/gen/bcm/vpaes-armv8-linux.S
@@ -0,0 +1,1224 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+.section .rodata
+
+.type _vpaes_consts,%object
+.align 7 // totally strategic alignment
+_vpaes_consts:
+.Lk_mc_forward: // mc_forward
+.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad 0x080B0A0904070605, 0x000302010C0F0E0D
+.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad 0x000302010C0F0E0D, 0x080B0A0904070605
+.Lk_mc_backward: // mc_backward
+.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad 0x020100030E0D0C0F, 0x0A09080B06050407
+.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad 0x0A09080B06050407, 0x020100030E0D0C0F
+.Lk_sr: // sr
+.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad 0x030E09040F0A0500, 0x0B06010C07020D08
+.quad 0x0F060D040B020900, 0x070E050C030A0108
+.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+//
+// "Hot" constants
+//
+.Lk_inv: // inv, inva
+.quad 0x0E05060F0D080180, 0x040703090A0B0C02
+.quad 0x01040A060F0B0780, 0x030D0E0C02050809
+.Lk_ipt: // input transform (lo, hi)
+.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+.Lk_sbo: // sbou, sbot
+.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+.Lk_sb1: // sb1u, sb1t
+.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.Lk_sb2: // sb2u, sb2t
+.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+//
+// Decryption stuff
+//
+.Lk_dipt: // decryption input transform
+.quad 0x0F505B040B545F00, 0x154A411E114E451A
+.quad 0x86E383E660056500, 0x12771772F491F194
+.Lk_dsbo: // decryption sbox final output
+.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.Lk_dsb9: // decryption sbox output *9*u, *9*t
+.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd: // decryption sbox output *D*u, *D*t
+.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb: // decryption sbox output *B*u, *B*t
+.quad 0xD022649296B44200, 0x602646F6B0F2D404
+.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe: // decryption sbox output *E*u, *E*t
+.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+
+//
+// Key schedule constants
+//
+.Lk_dksd: // decryption key schedule: invskew x*D
+.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb: // decryption key schedule: invskew x*B
+.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse: // decryption key schedule: invskew x*E + 0x63
+.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9: // decryption key schedule: invskew x*9
+.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+.Lk_rcon: // rcon
+.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_opt: // output transform
+.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+.Lk_deskew: // deskew tables: inverts the sbox's "skew"
+.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align 2
+.size _vpaes_consts,.-_vpaes_consts
+.align 6
+
+.text
+##
+## _aes_preheat
+##
+## Fills register %r10 -> .aes_consts (so you can -fPIC)
+## and %xmm9-%xmm15 as specified below.
+##
+.type _vpaes_encrypt_preheat,%function
+.align 4
+_vpaes_encrypt_preheat:
+ adrp x10, .Lk_inv
+ add x10, x10, :lo12:.Lk_inv
+ movi v17.16b, #0x0f
+ ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv
+ ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo
+ ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // .Lk_sb1, .Lk_sb2
+ ret
+.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
+
+##
+## _aes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm9-%xmm15 as in _vpaes_preheat
+## (%rdx) = scheduled keys
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
+## Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
+.type _vpaes_encrypt_core,%function
+.align 4
+_vpaes_encrypt_core:
+ mov x9, x2
+ ldr w8, [x2,#240] // pull rounds
+ adrp x11, .Lk_mc_forward+16
+ add x11, x11, :lo12:.Lk_mc_forward+16
+ // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
+ and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
+ // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
+ tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
+ eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ b .Lenc_entry
+
+.align 4
+.Lenc_loop:
+ // middle of middle round
+ add x10, x11, #0x40
+ tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
+ ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
+ tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
+ ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
+ tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
+ eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
+ tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
+ tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
+ eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
+ and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
+ sub w8, w8, #1 // nr--
+
+.Lenc_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
+ cbnz w8, .Lenc_loop
+
+ // middle of last round
+ add x10, x11, #0x80
+ // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
+ // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
+ tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
+ ret
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+.globl vpaes_encrypt
+.hidden vpaes_encrypt
+.type vpaes_encrypt,%function
+.align 4
+vpaes_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v7.16b}, [x0]
+ bl _vpaes_encrypt_preheat
+ bl _vpaes_encrypt_core
+ st1 {v0.16b}, [x1]
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpaes_encrypt,.-vpaes_encrypt
+
+.type _vpaes_encrypt_2x,%function
+.align 4
+_vpaes_encrypt_2x:
+ mov x9, x2
+ ldr w8, [x2,#240] // pull rounds
+ adrp x11, .Lk_mc_forward+16
+ add x11, x11, :lo12:.Lk_mc_forward+16
+ // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
+ and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ and v9.16b, v15.16b, v17.16b
+ ushr v8.16b, v15.16b, #4
+ tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
+ tbl v9.16b, {v20.16b}, v9.16b
+ // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
+ tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
+ tbl v10.16b, {v21.16b}, v8.16b
+ eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
+ eor v8.16b, v9.16b, v16.16b
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ eor v8.16b, v8.16b, v10.16b
+ b .Lenc_2x_entry
+
+.align 4
+.Lenc_2x_loop:
+ // middle of middle round
+ add x10, x11, #0x40
+ tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
+ tbl v12.16b, {v25.16b}, v10.16b
+ ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
+ tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
+ tbl v8.16b, {v24.16b}, v11.16b
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v12.16b, v12.16b, v16.16b
+ tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
+ tbl v13.16b, {v27.16b}, v10.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ eor v8.16b, v8.16b, v12.16b
+ tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
+ tbl v10.16b, {v26.16b}, v11.16b
+ ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
+ tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
+ tbl v11.16b, {v8.16b}, v1.16b
+ eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
+ eor v10.16b, v10.16b, v13.16b
+ tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
+ tbl v8.16b, {v8.16b}, v4.16b
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
+ eor v11.16b, v11.16b, v10.16b
+ tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
+ tbl v12.16b, {v11.16b},v1.16b
+ eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
+ eor v8.16b, v8.16b, v11.16b
+ and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
+ eor v8.16b, v8.16b, v12.16b
+ sub w8, w8, #1 // nr--
+
+.Lenc_2x_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ and v9.16b, v8.16b, v17.16b
+ ushr v8.16b, v8.16b, #4
+ tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
+ tbl v13.16b, {v19.16b},v9.16b
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ eor v9.16b, v9.16b, v8.16b
+ tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v11.16b, {v18.16b},v8.16b
+ tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ tbl v12.16b, {v18.16b},v9.16b
+ eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v11.16b, v11.16b, v13.16b
+ eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ eor v12.16b, v12.16b, v13.16b
+ tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v10.16b, {v18.16b},v11.16b
+ tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ tbl v11.16b, {v18.16b},v12.16b
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v10.16b, v10.16b, v9.16b
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ eor v11.16b, v11.16b, v8.16b
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
+ cbnz w8, .Lenc_2x_loop
+
+ // middle of last round
+ add x10, x11, #0x80
+ // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
+ // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ tbl v12.16b, {v22.16b}, v10.16b
+ ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
+ tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
+ tbl v8.16b, {v23.16b}, v11.16b
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v12.16b, v12.16b, v16.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ eor v8.16b, v8.16b, v12.16b
+ tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
+ tbl v1.16b, {v8.16b},v1.16b
+ ret
+.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x
+
+.type _vpaes_decrypt_preheat,%function
+.align 4
+_vpaes_decrypt_preheat:
+ adrp x10, .Lk_inv
+ add x10, x10, :lo12:.Lk_inv
+ movi v17.16b, #0x0f
+ adrp x11, .Lk_dipt
+ add x11, x11, :lo12:.Lk_dipt
+ ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv
+ ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo
+ ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd
+ ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe
+ ret
+.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+.type _vpaes_decrypt_core,%function
+.align 4
+_vpaes_decrypt_core:
+ mov x9, x2
+ ldr w8, [x2,#240] // pull rounds
+
+ // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
+ lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
+ eor x11, x11, #0x30 // xor $0x30, %r11
+ adrp x10, .Lk_sr
+ add x10, x10, :lo12:.Lk_sr
+ and x11, x11, #0x30 // and $0x30, %r11
+ add x11, x11, x10
+ adrp x10, .Lk_mc_forward+48
+ add x10, x10, :lo12:.Lk_mc_forward+48
+
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
+ and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
+ ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5
+ // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
+ tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
+ eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ b .Ldec_entry
+
+.align 4
+.Ldec_loop:
+//
+// Inverse mix columns
+//
+ // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
+ tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
+ tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
+ eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
+ // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
+
+ tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
+ tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
+
+ tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
+ tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
+
+ tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
+ tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ sub w8, w8, #1 // sub $1,%rax # nr--
+
+.Ldec_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
+ cbnz w8, .Ldec_loop
+
+ // middle of last round
+ // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
+ tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
+ ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
+ tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
+ tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
+ ret
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+.globl vpaes_decrypt
+.hidden vpaes_decrypt
+.type vpaes_decrypt,%function
+.align 4
+vpaes_decrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v7.16b}, [x0]
+ bl _vpaes_decrypt_preheat
+ bl _vpaes_decrypt_core
+ st1 {v0.16b}, [x1]
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpaes_decrypt,.-vpaes_decrypt
+
+// v14-v15 input, v0-v1 output
+.type _vpaes_decrypt_2x,%function
+.align 4
+_vpaes_decrypt_2x:
+ mov x9, x2
+ ldr w8, [x2,#240] // pull rounds
+
+ // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
+ lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
+ eor x11, x11, #0x30 // xor $0x30, %r11
+ adrp x10, .Lk_sr
+ add x10, x10, :lo12:.Lk_sr
+ and x11, x11, #0x30 // and $0x30, %r11
+ add x11, x11, x10
+ adrp x10, .Lk_mc_forward+48
+ add x10, x10, :lo12:.Lk_mc_forward+48
+
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
+ and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ and v9.16b, v15.16b, v17.16b
+ ushr v8.16b, v15.16b, #4
+ tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
+ tbl v10.16b, {v20.16b},v9.16b
+ ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5
+ // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
+ tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
+ tbl v8.16b, {v21.16b},v8.16b
+ eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
+ eor v10.16b, v10.16b, v16.16b
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ eor v8.16b, v8.16b, v10.16b
+ b .Ldec_2x_entry
+
+.align 4
+.Ldec_2x_loop:
+//
+// Inverse mix columns
+//
+ // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
+ tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
+ tbl v12.16b, {v24.16b}, v10.16b
+ tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
+ tbl v9.16b, {v25.16b}, v11.16b
+ eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
+ eor v8.16b, v12.16b, v16.16b
+ // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
+
+ tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
+ tbl v12.16b, {v26.16b}, v10.16b
+ tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v8.16b, {v8.16b},v5.16b
+ tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
+ tbl v9.16b, {v27.16b}, v11.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ eor v8.16b, v8.16b, v12.16b
+ // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b
+ // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
+
+ tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
+ tbl v12.16b, {v28.16b}, v10.16b
+ tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v8.16b, {v8.16b},v5.16b
+ tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
+ tbl v9.16b, {v29.16b}, v11.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ eor v8.16b, v8.16b, v12.16b
+ // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b
+ // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
+
+ tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
+ tbl v12.16b, {v30.16b}, v10.16b
+ tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v8.16b, {v8.16b},v5.16b
+ tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
+ tbl v9.16b, {v31.16b}, v11.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ eor v8.16b, v8.16b, v12.16b
+ ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b
+ sub w8, w8, #1 // sub $1,%rax # nr--
+
+.Ldec_2x_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ and v9.16b, v8.16b, v17.16b
+ ushr v8.16b, v8.16b, #4
+ tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ tbl v10.16b, {v19.16b},v9.16b
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ eor v9.16b, v9.16b, v8.16b
+ tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v11.16b, {v18.16b},v8.16b
+ tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ tbl v12.16b, {v18.16b},v9.16b
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v11.16b, v11.16b, v10.16b
+ eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ eor v12.16b, v12.16b, v10.16b
+ tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v10.16b, {v18.16b},v11.16b
+ tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ tbl v11.16b, {v18.16b},v12.16b
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v10.16b, v10.16b, v9.16b
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ eor v11.16b, v11.16b, v8.16b
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
+ cbnz w8, .Ldec_2x_loop
+
+ // middle of last round
+ // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
+ tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ tbl v12.16b, {v22.16b}, v10.16b
+ // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
+ tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
+ tbl v9.16b, {v23.16b}, v11.16b
+ ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v12.16b, v12.16b, v16.16b
+ eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
+ eor v8.16b, v9.16b, v12.16b
+ tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
+ tbl v1.16b, {v8.16b},v2.16b
+ ret
+.size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+.type _vpaes_key_preheat,%function
+.align 4
+_vpaes_key_preheat:
+ adrp x10, .Lk_inv
+ add x10, x10, :lo12:.Lk_inv
+ movi v16.16b, #0x5b // .Lk_s63
+ adrp x11, .Lk_sb1
+ add x11, x11, :lo12:.Lk_sb1
+ movi v17.16b, #0x0f // .Lk_s0F
+ ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // .Lk_inv, .Lk_ipt
+ adrp x10, .Lk_dksd
+ add x10, x10, :lo12:.Lk_dksd
+ ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1
+ adrp x11, .Lk_mc_forward
+ add x11, x11, :lo12:.Lk_mc_forward
+ ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb
+ ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9
+ ld1 {v8.2d}, [x10] // .Lk_rcon
+ ld1 {v9.2d}, [x11] // .Lk_mc_forward[0]
+ ret
+.size _vpaes_key_preheat,.-_vpaes_key_preheat
+
+.type _vpaes_schedule_core,%function
+.align 4
+_vpaes_schedule_core:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp,#-16]!
+ add x29,sp,#0
+
+ bl _vpaes_key_preheat // load the tables
+
+ ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
+
+ // input transform
+ mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
+ bl _vpaes_schedule_transform
+ mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
+
+ adrp x10, .Lk_sr // lea .Lk_sr(%rip),%r10
+ add x10, x10, :lo12:.Lk_sr
+
+ add x8, x8, x10
+ cbnz w3, .Lschedule_am_decrypting
+
+ // encrypting, output zeroth round key after transform
+ st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx)
+ b .Lschedule_go
+
+.Lschedule_am_decrypting:
+ // decrypting, output zeroth round key after shiftrows
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
+ tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
+ eor x8, x8, #0x30 // xor $0x30, %r8
+
+.Lschedule_go:
+ cmp w1, #192 // cmp $192, %esi
+ b.hi .Lschedule_256
+ b.eq .Lschedule_192
+	// 128: fall through
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+.Lschedule_128:
+ mov x0, #10 // mov $10, %esi
+
+.Loop_schedule_128:
+ sub x0, x0, #1 // dec %esi
+ bl _vpaes_schedule_round
+ cbz x0, .Lschedule_mangle_last
+ bl _vpaes_schedule_mangle // write output
+ b .Loop_schedule_128
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.align 4
+.Lschedule_192:
+ sub x0, x0, #8
+ ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ bl _vpaes_schedule_transform // input transform
+ mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
+ eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
+ ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
+ mov x0, #4 // mov $4, %esi
+
+.Loop_schedule_192:
+ sub x0, x0, #1 // dec %esi
+ bl _vpaes_schedule_round
+ ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
+ bl _vpaes_schedule_mangle // save key n
+ bl _vpaes_schedule_192_smear
+ bl _vpaes_schedule_mangle // save key n+1
+ bl _vpaes_schedule_round
+ cbz x0, .Lschedule_mangle_last
+ bl _vpaes_schedule_mangle // save key n+2
+ bl _vpaes_schedule_192_smear
+ b .Loop_schedule_192
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
+.align 4
+.Lschedule_256:
+ ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
+ bl _vpaes_schedule_transform // input transform
+ mov x0, #7 // mov $7, %esi
+
+.Loop_schedule_256:
+ sub x0, x0, #1 // dec %esi
+ bl _vpaes_schedule_mangle // output low result
+ mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
+
+ // high round
+ bl _vpaes_schedule_round
+ cbz x0, .Lschedule_mangle_last
+ bl _vpaes_schedule_mangle
+
+ // low round. swap xmm7 and xmm6
+ dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
+ movi v4.16b, #0
+ mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
+ mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
+ bl _vpaes_schedule_low_round
+ mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
+
+ b .Loop_schedule_256
+
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+.align 4
+.Lschedule_mangle_last:
+ // schedule last round key from xmm0
+ adrp x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew
+ add x11, x11, :lo12:.Lk_deskew
+
+ cbnz w3, .Lschedule_mangle_last_dec
+
+ // encrypting
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
+ adrp x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform
+ add x11, x11, :lo12:.Lk_opt
+ add x2, x2, #32 // add $32, %rdx
+ tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
+
+.Lschedule_mangle_last_dec:
+ ld1 {v20.2d,v21.2d}, [x11] // reload constants
+ sub x2, x2, #16 // add $-16, %rdx
+ eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
+ bl _vpaes_schedule_transform // output transform
+ st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key
+
+ // cleanup
+ eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
+ eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
+ eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
+ eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
+ eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
+ eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
+ eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
+ eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
+ ldp x29, x30, [sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+## %xmm13: 0
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+.type _vpaes_schedule_192_smear,%function
+.align 4
+_vpaes_schedule_192_smear:
+ movi v1.16b, #0
+ dup v0.4s, v7.s[3]
+ ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
+ ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
+ eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
+ eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
+ mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
+ ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
+ ret
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm4, %r11.
+##
+.type _vpaes_schedule_round,%function
+.align 4
+_vpaes_schedule_round:
+ // extract rcon from xmm8
+ movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
+ ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1
+ ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8
+ eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
+
+ // rotate
+ dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
+ ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0
+
+ // fall through...
+
+ // low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+ // smear xmm7
+ ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1
+ eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
+ ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4
+
+ // subbytes
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
+ tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7
+ tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
+ eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
+ eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
+ eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
+ tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
+ tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
+ eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
+
+ // add in smeared stuff
+ eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
+ eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
+ ret
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%r11)
+##
+## Requires that %xmm9 = 0x0F0F... as in preheat
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
+.type _vpaes_schedule_transform,%function
+.align 4
+_vpaes_schedule_transform:
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ // vmovdqa (%r11), %xmm2 # lo
+ tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
+ // vmovdqa 16(%r11), %xmm1 # hi
+ tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ ret
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by "inverse mixcolumns" circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%rdx), and increments or decrements it
+## Keeps track of round number mod 4 in %r8
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
+.type _vpaes_schedule_mangle,%function
+.align 4
+_vpaes_schedule_mangle:
+ mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
+ // vmovdqa .Lk_mc_forward(%rip),%xmm5
+ cbnz w3, .Lschedule_mangle_dec
+
+ // encrypting
+ eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4
+ add x2, x2, #16 // add $16, %rdx
+ tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
+ tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
+ tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
+ eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
+ eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
+
+ b .Lschedule_mangle_both
+.align 4
+.Lschedule_mangle_dec:
+ // inverse mix columns
+ // lea .Lk_dksd(%rip),%r11
+ ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
+ and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo
+
+ // vmovdqa 0x00(%r11), %xmm2
+ tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ // vmovdqa 0x10(%r11), %xmm3
+ tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
+ tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
+
+ // vmovdqa 0x20(%r11), %xmm2
+ tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
+ // vmovdqa 0x30(%r11), %xmm3
+ tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
+ tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
+
+ // vmovdqa 0x40(%r11), %xmm2
+ tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
+ // vmovdqa 0x50(%r11), %xmm3
+ tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
+
+ // vmovdqa 0x60(%r11), %xmm2
+ tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
+ // vmovdqa 0x70(%r11), %xmm4
+ tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
+ eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
+ eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3
+
+ sub x2, x2, #16 // add $-16, %rdx
+
+.Lschedule_mangle_both:
+ tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ add x8, x8, #48 // add $-16, %r8
+ and x8, x8, #~(1<<6) // and $0x30, %r8
+ st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
+ ret
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+.globl vpaes_set_encrypt_key
+.hidden vpaes_set_encrypt_key
+.type vpaes_set_encrypt_key,%function
+.align 4
+vpaes_set_encrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+
+ lsr w9, w1, #5 // shr $5,%eax
+ add w9, w9, #5 // $5,%eax
+ str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+
+ mov w3, #0 // mov $0,%ecx
+ mov x8, #0x30 // mov $0x30,%r8d
+ bl _vpaes_schedule_core
+ eor x0, x0, x0
+
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl vpaes_set_decrypt_key
+.hidden vpaes_set_decrypt_key
+.type vpaes_set_decrypt_key,%function
+.align 4
+vpaes_set_decrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+
+ lsr w9, w1, #5 // shr $5,%eax
+ add w9, w9, #5 // $5,%eax
+ str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+ lsl w9, w9, #4 // shl $4,%eax
+ add x2, x2, #16 // lea 16(%rdx,%rax),%rdx
+ add x2, x2, x9
+
+ mov w3, #1 // mov $1,%ecx
+ lsr w8, w1, #1 // shr $1,%r8d
+ and x8, x8, #32 // and $32,%r8d
+ eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32
+ bl _vpaes_schedule_core
+
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+.globl vpaes_cbc_encrypt
+.hidden vpaes_cbc_encrypt
+.type vpaes_cbc_encrypt,%function
+.align 4
+vpaes_cbc_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ cbz x2, .Lcbc_abort
+ cmp w5, #0 // check direction
+ b.eq vpaes_cbc_decrypt
+
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ mov x17, x2 // reassign
+ mov x2, x3 // reassign
+
+ ld1 {v0.16b}, [x4] // load ivec
+ bl _vpaes_encrypt_preheat
+ b .Lcbc_enc_loop
+
+.align 4
+.Lcbc_enc_loop:
+ ld1 {v7.16b}, [x0],#16 // load input
+ eor v7.16b, v7.16b, v0.16b // xor with ivec
+ bl _vpaes_encrypt_core
+ st1 {v0.16b}, [x1],#16 // save output
+ subs x17, x17, #16
+ b.hi .Lcbc_enc_loop
+
+ st1 {v0.16b}, [x4] // write ivec
+
+ ldp x29,x30,[sp],#16
+.Lcbc_abort:
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+
+.type vpaes_cbc_decrypt,%function
+.align 4
+vpaes_cbc_decrypt:
+ // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
+ // only from vpaes_cbc_encrypt which has already signed the return address.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
+
+ mov x17, x2 // reassign
+ mov x2, x3 // reassign
+ ld1 {v6.16b}, [x4] // load ivec
+ bl _vpaes_decrypt_preheat
+ tst x17, #16
+ b.eq .Lcbc_dec_loop2x
+
+ ld1 {v7.16b}, [x0], #16 // load input
+ bl _vpaes_decrypt_core
+ eor v0.16b, v0.16b, v6.16b // xor with ivec
+ orr v6.16b, v7.16b, v7.16b // next ivec value
+ st1 {v0.16b}, [x1], #16
+ subs x17, x17, #16
+ b.ls .Lcbc_dec_done
+
+.align 4
+.Lcbc_dec_loop2x:
+ ld1 {v14.16b,v15.16b}, [x0], #32
+ bl _vpaes_decrypt_2x
+ eor v0.16b, v0.16b, v6.16b // xor with ivec
+ eor v1.16b, v1.16b, v14.16b
+ orr v6.16b, v15.16b, v15.16b
+ st1 {v0.16b,v1.16b}, [x1], #32
+ subs x17, x17, #32
+ b.hi .Lcbc_dec_loop2x
+
+.Lcbc_dec_done:
+ st1 {v6.16b}, [x4]
+
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
+.globl vpaes_ctr32_encrypt_blocks
+.hidden vpaes_ctr32_encrypt_blocks
+.type vpaes_ctr32_encrypt_blocks,%function
+.align 4
+vpaes_ctr32_encrypt_blocks:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
+
+ cbz x2, .Lctr32_done
+
+ // Note, unlike the other functions, x2 here is measured in blocks,
+ // not bytes.
+ mov x17, x2
+ mov x2, x3
+
+ // Load the IV and counter portion.
+ ldr w6, [x4, #12]
+ ld1 {v7.16b}, [x4]
+
+ bl _vpaes_encrypt_preheat
+ tst x17, #1
+ rev w6, w6 // The counter is big-endian.
+ b.eq .Lctr32_prep_loop
+
+ // Handle one block so the remaining block count is even for
+ // _vpaes_encrypt_2x.
+ ld1 {v6.16b}, [x0], #16 // .Load input ahead of time
+ bl _vpaes_encrypt_core
+ eor v0.16b, v0.16b, v6.16b // XOR input and result
+ st1 {v0.16b}, [x1], #16
+ subs x17, x17, #1
+ // Update the counter.
+ add w6, w6, #1
+ rev w7, w6
+ mov v7.s[3], w7
+ b.ls .Lctr32_done
+
+.Lctr32_prep_loop:
+ // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
+ // uses v14 and v15.
+ mov v15.16b, v7.16b
+ mov v14.16b, v7.16b
+ add w6, w6, #1
+ rev w7, w6
+ mov v15.s[3], w7
+
+.Lctr32_loop:
+ ld1 {v6.16b,v7.16b}, [x0], #32 // .Load input ahead of time
+ bl _vpaes_encrypt_2x
+ eor v0.16b, v0.16b, v6.16b // XOR input and result
+ eor v1.16b, v1.16b, v7.16b // XOR input and result (#2)
+ st1 {v0.16b,v1.16b}, [x1], #32
+ subs x17, x17, #2
+ // Update the counter.
+ add w7, w6, #1
+ add w6, w6, #2
+ rev w7, w7
+ mov v14.s[3], w7
+ rev w7, w6
+ mov v15.s[3], w7
+ b.hi .Lctr32_loop
+
+.Lctr32_done:
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/bcm/vpaes-armv8-win.S b/gen/bcm/vpaes-armv8-win.S
new file mode 100644
index 0000000..d399d22
--- /dev/null
+++ b/gen/bcm/vpaes-armv8-win.S
@@ -0,0 +1,1262 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+
+.section .rodata
+
+
+.align 7 // totally strategic alignment
+_vpaes_consts:
+Lk_mc_forward: // mc_forward
+.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad 0x080B0A0904070605, 0x000302010C0F0E0D
+.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad 0x000302010C0F0E0D, 0x080B0A0904070605
+Lk_mc_backward: // mc_backward
+.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad 0x020100030E0D0C0F, 0x0A09080B06050407
+.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad 0x0A09080B06050407, 0x020100030E0D0C0F
+Lk_sr: // sr
+.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad 0x030E09040F0A0500, 0x0B06010C07020D08
+.quad 0x0F060D040B020900, 0x070E050C030A0108
+.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+//
+// "Hot" constants
+//
+Lk_inv: // inv, inva
+.quad 0x0E05060F0D080180, 0x040703090A0B0C02
+.quad 0x01040A060F0B0780, 0x030D0E0C02050809
+Lk_ipt: // input transform (lo, hi)
+.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+Lk_sbo: // sbou, sbot
+.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+Lk_sb1: // sb1u, sb1t
+.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+Lk_sb2: // sb2u, sb2t
+.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+//
+// Decryption stuff
+//
+Lk_dipt: // decryption input transform
+.quad 0x0F505B040B545F00, 0x154A411E114E451A
+.quad 0x86E383E660056500, 0x12771772F491F194
+Lk_dsbo: // decryption sbox final output
+.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+Lk_dsb9: // decryption sbox output *9*u, *9*t
+.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+Lk_dsbd: // decryption sbox output *D*u, *D*t
+.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+Lk_dsbb: // decryption sbox output *B*u, *B*t
+.quad 0xD022649296B44200, 0x602646F6B0F2D404
+.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+Lk_dsbe: // decryption sbox output *E*u, *E*t
+.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+
+//
+// Key schedule constants
+//
+Lk_dksd: // decryption key schedule: invskew x*D
+.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+Lk_dksb: // decryption key schedule: invskew x*B
+.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+Lk_dkse: // decryption key schedule: invskew x*E + 0x63
+.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+Lk_dks9: // decryption key schedule: invskew x*9
+.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+Lk_rcon: // rcon
+.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+Lk_opt: // output transform
+.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+Lk_deskew: // deskew tables: inverts the sbox's "skew"
+.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align 2
+
+.align 6
+
+.text
+##
+## _aes_preheat
+##
+## Fills register %r10 -> .aes_consts (so you can -fPIC)
+## and %xmm9-%xmm15 as specified below.
+##
+.def _vpaes_encrypt_preheat
+ .type 32
+.endef
+.align 4
+_vpaes_encrypt_preheat:
+ adrp x10, Lk_inv
+ add x10, x10, :lo12:Lk_inv
+ movi v17.16b, #0x0f
+ ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv
+ ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo
+ ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2
+ ret
+
+
+##
+## _aes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm9-%xmm15 as in _vpaes_preheat
+## (%rdx) = scheduled keys
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
+## Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
+.def _vpaes_encrypt_core
+ .type 32
+.endef
+.align 4
+_vpaes_encrypt_core:
+ mov x9, x2
+ ldr w8, [x2,#240] // pull rounds
+ adrp x11, Lk_mc_forward+16
+ add x11, x11, :lo12:Lk_mc_forward+16
+ // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
+ and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
+ // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
+ tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
+ eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ b Lenc_entry
+
+.align 4
+Lenc_loop:
+ // middle of middle round
+ add x10, x11, #0x40
+ tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
+ ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
+ tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
+ ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[]
+ tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
+ eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
+ tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
+ tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
+ eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
+ and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
+ sub w8, w8, #1 // nr--
+
+Lenc_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
+ cbnz w8, Lenc_loop
+
+ // middle of last round
+ add x10, x11, #0x80
+ // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
+ // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[]
+ tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
+ ret
+
+
+.globl vpaes_encrypt
+
+.def vpaes_encrypt
+ .type 32
+.endef
+.align 4
+vpaes_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v7.16b}, [x0]
+ bl _vpaes_encrypt_preheat
+ bl _vpaes_encrypt_core
+ st1 {v0.16b}, [x1]
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.def _vpaes_encrypt_2x
+ .type 32
+.endef
+.align 4
+_vpaes_encrypt_2x:
+ mov x9, x2
+ ldr w8, [x2,#240] // pull rounds
+ adrp x11, Lk_mc_forward+16
+ add x11, x11, :lo12:Lk_mc_forward+16
+ // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
+ and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ and v9.16b, v15.16b, v17.16b
+ ushr v8.16b, v15.16b, #4
+ tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
+ tbl v9.16b, {v20.16b}, v9.16b
+ // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
+ tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
+ tbl v10.16b, {v21.16b}, v8.16b
+ eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
+ eor v8.16b, v9.16b, v16.16b
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ eor v8.16b, v8.16b, v10.16b
+ b Lenc_2x_entry
+
+.align 4
+Lenc_2x_loop:
+ // middle of middle round
+ add x10, x11, #0x40
+ tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
+ tbl v12.16b, {v25.16b}, v10.16b
+ ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
+ tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
+ tbl v8.16b, {v24.16b}, v11.16b
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v12.16b, v12.16b, v16.16b
+ tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
+ tbl v13.16b, {v27.16b}, v10.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ eor v8.16b, v8.16b, v12.16b
+ tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
+ tbl v10.16b, {v26.16b}, v11.16b
+ ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[]
+ tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
+ tbl v11.16b, {v8.16b}, v1.16b
+ eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
+ eor v10.16b, v10.16b, v13.16b
+ tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
+ tbl v8.16b, {v8.16b}, v4.16b
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
+ eor v11.16b, v11.16b, v10.16b
+ tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
+ tbl v12.16b, {v11.16b},v1.16b
+ eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
+ eor v8.16b, v8.16b, v11.16b
+ and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
+ eor v8.16b, v8.16b, v12.16b
+ sub w8, w8, #1 // nr--
+
+Lenc_2x_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ and v9.16b, v8.16b, v17.16b
+ ushr v8.16b, v8.16b, #4
+ tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
+ tbl v13.16b, {v19.16b},v9.16b
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ eor v9.16b, v9.16b, v8.16b
+ tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v11.16b, {v18.16b},v8.16b
+ tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ tbl v12.16b, {v18.16b},v9.16b
+ eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v11.16b, v11.16b, v13.16b
+ eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ eor v12.16b, v12.16b, v13.16b
+ tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v10.16b, {v18.16b},v11.16b
+ tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ tbl v11.16b, {v18.16b},v12.16b
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v10.16b, v10.16b, v9.16b
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ eor v11.16b, v11.16b, v8.16b
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
+ cbnz w8, Lenc_2x_loop
+
+ // middle of last round
+ add x10, x11, #0x80
+ // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
+ // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ tbl v12.16b, {v22.16b}, v10.16b
+ ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[]
+ tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
+ tbl v8.16b, {v23.16b}, v11.16b
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v12.16b, v12.16b, v16.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ eor v8.16b, v8.16b, v12.16b
+ tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
+ tbl v1.16b, {v8.16b},v1.16b
+ ret
+
+
+.def _vpaes_decrypt_preheat
+ .type 32
+.endef
+.align 4
+_vpaes_decrypt_preheat:
+ adrp x10, Lk_inv
+ add x10, x10, :lo12:Lk_inv
+ movi v17.16b, #0x0f
+ adrp x11, Lk_dipt
+ add x11, x11, :lo12:Lk_dipt
+ ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv
+ ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // Lk_dipt, Lk_dsbo
+ ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // Lk_dsb9, Lk_dsbd
+ ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // Lk_dsbb, Lk_dsbe
+ ret
+
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+.def _vpaes_decrypt_core
+ .type 32
+.endef
+.align 4
+_vpaes_decrypt_core:
+ mov x9, x2
+ ldr w8, [x2,#240] // pull rounds
+
+ // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
+ lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
+ eor x11, x11, #0x30 // xor $0x30, %r11
+ adrp x10, Lk_sr
+ add x10, x10, :lo12:Lk_sr
+ and x11, x11, #0x30 // and $0x30, %r11
+ add x11, x11, x10
+ adrp x10, Lk_mc_forward+48
+ add x10, x10, :lo12:Lk_mc_forward+48
+
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
+ and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
+ ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5
+ // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
+ tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
+ eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ b Ldec_entry
+
+.align 4
+Ldec_loop:
+//
+// Inverse mix columns
+//
+ // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
+ tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
+ tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
+ eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
+ // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
+
+ tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
+ tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
+
+ tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
+ tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
+
+ tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
+ tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ sub w8, w8, #1 // sub $1,%rax # nr--
+
+Ldec_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
+ cbnz w8, Ldec_loop
+
+ // middle of last round
+ // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
+ tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
+ ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160
+ tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
+ tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
+ ret
+
+
+.globl vpaes_decrypt
+
+.def vpaes_decrypt
+ .type 32
+.endef
+.align 4
+vpaes_decrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v7.16b}, [x0]
+ bl _vpaes_decrypt_preheat
+ bl _vpaes_decrypt_core
+ st1 {v0.16b}, [x1]
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+// v14-v15 input, v0-v1 output
+.def _vpaes_decrypt_2x
+ .type 32
+.endef
+.align 4
+_vpaes_decrypt_2x:
+ mov x9, x2
+ ldr w8, [x2,#240] // pull rounds
+
+ // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
+ lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
+ eor x11, x11, #0x30 // xor $0x30, %r11
+ adrp x10, Lk_sr
+ add x10, x10, :lo12:Lk_sr
+ and x11, x11, #0x30 // and $0x30, %r11
+ add x11, x11, x10
+ adrp x10, Lk_mc_forward+48
+ add x10, x10, :lo12:Lk_mc_forward+48
+
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
+ and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ and v9.16b, v15.16b, v17.16b
+ ushr v8.16b, v15.16b, #4
+ tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
+ tbl v10.16b, {v20.16b},v9.16b
+ ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5
+ // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
+ tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
+ tbl v8.16b, {v21.16b},v8.16b
+ eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
+ eor v10.16b, v10.16b, v16.16b
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ eor v8.16b, v8.16b, v10.16b
+ b Ldec_2x_entry
+
+.align 4
+Ldec_2x_loop:
+//
+// Inverse mix columns
+//
+ // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
+ tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
+ tbl v12.16b, {v24.16b}, v10.16b
+ tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
+ tbl v9.16b, {v25.16b}, v11.16b
+ eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
+ eor v8.16b, v12.16b, v16.16b
+ // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
+
+ tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
+ tbl v12.16b, {v26.16b}, v10.16b
+ tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v8.16b, {v8.16b},v5.16b
+ tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
+ tbl v9.16b, {v27.16b}, v11.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ eor v8.16b, v8.16b, v12.16b
+ // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b
+ // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
+
+ tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
+ tbl v12.16b, {v28.16b}, v10.16b
+ tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v8.16b, {v8.16b},v5.16b
+ tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
+ tbl v9.16b, {v29.16b}, v11.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ eor v8.16b, v8.16b, v12.16b
+ // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b
+ // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
+
+ tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
+ tbl v12.16b, {v30.16b}, v10.16b
+ tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v8.16b, {v8.16b},v5.16b
+ tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
+ tbl v9.16b, {v31.16b}, v11.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ eor v8.16b, v8.16b, v12.16b
+ ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b
+ sub w8, w8, #1 // sub $1,%rax # nr--
+
+Ldec_2x_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ and v9.16b, v8.16b, v17.16b
+ ushr v8.16b, v8.16b, #4
+ tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ tbl v10.16b, {v19.16b},v9.16b
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ eor v9.16b, v9.16b, v8.16b
+ tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v11.16b, {v18.16b},v8.16b
+ tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ tbl v12.16b, {v18.16b},v9.16b
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v11.16b, v11.16b, v10.16b
+ eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ eor v12.16b, v12.16b, v10.16b
+ tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v10.16b, {v18.16b},v11.16b
+ tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ tbl v11.16b, {v18.16b},v12.16b
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v10.16b, v10.16b, v9.16b
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ eor v11.16b, v11.16b, v8.16b
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
+ cbnz w8, Ldec_2x_loop
+
+ // middle of last round
+ // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
+ tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ tbl v12.16b, {v22.16b}, v10.16b
+ // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
+ tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
+ tbl v9.16b, {v23.16b}, v11.16b
+ ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v12.16b, v12.16b, v16.16b
+ eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
+ eor v8.16b, v9.16b, v12.16b
+ tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
+ tbl v1.16b, {v8.16b},v2.16b
+ ret
+
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+.def _vpaes_key_preheat
+ .type 32
+.endef
+.align 4
+_vpaes_key_preheat:
+ adrp x10, Lk_inv
+ add x10, x10, :lo12:Lk_inv
+ movi v16.16b, #0x5b // Lk_s63
+ adrp x11, Lk_sb1
+ add x11, x11, :lo12:Lk_sb1
+ movi v17.16b, #0x0f // Lk_s0F
+ ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt
+ adrp x10, Lk_dksd
+ add x10, x10, :lo12:Lk_dksd
+ ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1
+ adrp x11, Lk_mc_forward
+ add x11, x11, :lo12:Lk_mc_forward
+ ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb
+ ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9
+ ld1 {v8.2d}, [x10] // Lk_rcon
+ ld1 {v9.2d}, [x11] // Lk_mc_forward[0]
+ ret
+
+
+.def _vpaes_schedule_core
+ .type 32
+.endef
+.align 4
+_vpaes_schedule_core:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp,#-16]!
+ add x29,sp,#0
+
+ bl _vpaes_key_preheat // load the tables
+
+ ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
+
+ // input transform
+ mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
+ bl _vpaes_schedule_transform
+ mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
+
+ adrp x10, Lk_sr // lea Lk_sr(%rip),%r10
+ add x10, x10, :lo12:Lk_sr
+
+ add x8, x8, x10
+ cbnz w3, Lschedule_am_decrypting
+
+ // encrypting, output zeroth round key after transform
+ st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx)
+ b Lschedule_go
+
+Lschedule_am_decrypting:
+ // decrypting, output zeroth round key after shiftrows
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
+ tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
+ eor x8, x8, #0x30 // xor $0x30, %r8
+
+Lschedule_go:
+ cmp w1, #192 // cmp $192, %esi
+ b.hi Lschedule_256
+ b.eq Lschedule_192
+	// 128: fall through
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+Lschedule_128:
+ mov x0, #10 // mov $10, %esi
+
+Loop_schedule_128:
+ sub x0, x0, #1 // dec %esi
+ bl _vpaes_schedule_round
+ cbz x0, Lschedule_mangle_last
+ bl _vpaes_schedule_mangle // write output
+ b Loop_schedule_128
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.align 4
+Lschedule_192:
+ sub x0, x0, #8
+ ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ bl _vpaes_schedule_transform // input transform
+ mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
+ eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
+ ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
+ mov x0, #4 // mov $4, %esi
+
+Loop_schedule_192:
+ sub x0, x0, #1 // dec %esi
+ bl _vpaes_schedule_round
+ ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
+ bl _vpaes_schedule_mangle // save key n
+ bl _vpaes_schedule_192_smear
+ bl _vpaes_schedule_mangle // save key n+1
+ bl _vpaes_schedule_round
+ cbz x0, Lschedule_mangle_last
+ bl _vpaes_schedule_mangle // save key n+2
+ bl _vpaes_schedule_192_smear
+ b Loop_schedule_192
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
+.align 4
+Lschedule_256:
+ ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
+ bl _vpaes_schedule_transform // input transform
+ mov x0, #7 // mov $7, %esi
+
+Loop_schedule_256:
+ sub x0, x0, #1 // dec %esi
+ bl _vpaes_schedule_mangle // output low result
+ mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
+
+ // high round
+ bl _vpaes_schedule_round
+ cbz x0, Lschedule_mangle_last
+ bl _vpaes_schedule_mangle
+
+ // low round. swap xmm7 and xmm6
+ dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
+ movi v4.16b, #0
+ mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
+ mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
+ bl _vpaes_schedule_low_round
+ mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
+
+ b Loop_schedule_256
+
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+.align 4
+Lschedule_mangle_last:
+ // schedule last round key from xmm0
+ adrp x11, Lk_deskew // lea Lk_deskew(%rip),%r11 # prepare to deskew
+ add x11, x11, :lo12:Lk_deskew
+
+ cbnz w3, Lschedule_mangle_last_dec
+
+ // encrypting
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
+ adrp x11, Lk_opt // lea Lk_opt(%rip), %r11 # prepare to output transform
+ add x11, x11, :lo12:Lk_opt
+ add x2, x2, #32 // add $32, %rdx
+ tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
+
+Lschedule_mangle_last_dec:
+ ld1 {v20.2d,v21.2d}, [x11] // reload constants
+ sub x2, x2, #16 // add $-16, %rdx
+ eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0
+ bl _vpaes_schedule_transform // output transform
+ st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key
+
+ // cleanup
+ eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
+ eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
+ eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
+ eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
+ eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
+ eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
+ eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
+ eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
+ ldp x29, x30, [sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+## %xmm13: 0
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+.def _vpaes_schedule_192_smear
+ .type 32
+.endef
+.align 4
+_vpaes_schedule_192_smear:
+ movi v1.16b, #0
+ dup v0.4s, v7.s[3]
+ ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
+ ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
+ eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
+ eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
+ mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
+ ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
+ ret
+
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm4, %r11.
+##
+.def _vpaes_schedule_round
+ .type 32
+.endef
+.align 4
+_vpaes_schedule_round:
+ // extract rcon from xmm8
+ movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
+ ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1
+ ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8
+ eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
+
+ // rotate
+ dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
+ ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0
+
+ // fall through...
+
+ // low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+ // smear xmm7
+ ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1
+ eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
+ ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4
+
+ // subbytes
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
+ tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7
+ tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
+ eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
+ eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
+ eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
+ tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
+ tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
+ eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
+
+ // add in smeared stuff
+ eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
+ eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
+ ret
+
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%r11)
+##
+## Requires that %xmm9 = 0x0F0F... as in preheat
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
+.def _vpaes_schedule_transform
+ .type 32
+.endef
+.align 4
+_vpaes_schedule_transform:
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ // vmovdqa (%r11), %xmm2 # lo
+ tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
+ // vmovdqa 16(%r11), %xmm1 # hi
+ tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ ret
+
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by "inverse mixcolumns" circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%rdx), and increments or decrements it
+## Keeps track of round number mod 4 in %r8
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
+.def _vpaes_schedule_mangle
+ .type 32
+.endef
+.align 4
+_vpaes_schedule_mangle:
+ mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
+ // vmovdqa .Lk_mc_forward(%rip),%xmm5
+ cbnz w3, Lschedule_mangle_dec
+
+ // encrypting
+ eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4
+ add x2, x2, #16 // add $16, %rdx
+ tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
+ tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
+ tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
+ eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
+ eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
+
+ b Lschedule_mangle_both
+.align 4
+Lschedule_mangle_dec:
+ // inverse mix columns
+ // lea .Lk_dksd(%rip),%r11
+ ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
+ and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo
+
+ // vmovdqa 0x00(%r11), %xmm2
+ tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ // vmovdqa 0x10(%r11), %xmm3
+ tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
+ tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
+
+ // vmovdqa 0x20(%r11), %xmm2
+ tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
+ // vmovdqa 0x30(%r11), %xmm3
+ tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
+ tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
+
+ // vmovdqa 0x40(%r11), %xmm2
+ tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
+ // vmovdqa 0x50(%r11), %xmm3
+ tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
+
+ // vmovdqa 0x60(%r11), %xmm2
+ tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
+ // vmovdqa 0x70(%r11), %xmm4
+ tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
+ eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
+ eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3
+
+ sub x2, x2, #16 // add $-16, %rdx
+
+Lschedule_mangle_both:
+ tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ add x8, x8, #48 // add $-16, %r8
+ and x8, x8, #~(1<<6) // and $0x30, %r8
+ st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
+ ret
+
+
+.globl vpaes_set_encrypt_key
+
+.def vpaes_set_encrypt_key
+ .type 32
+.endef
+.align 4
+vpaes_set_encrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+
+ lsr w9, w1, #5 // shr $5,%eax
+ add w9, w9, #5 // $5,%eax
+ str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+
+ mov w3, #0 // mov $0,%ecx
+ mov x8, #0x30 // mov $0x30,%r8d
+ bl _vpaes_schedule_core
+ eor x0, x0, x0
+
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.globl vpaes_set_decrypt_key
+
+.def vpaes_set_decrypt_key
+ .type 32
+.endef
+.align 4
+vpaes_set_decrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+
+ lsr w9, w1, #5 // shr $5,%eax
+ add w9, w9, #5 // $5,%eax
+ str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+ lsl w9, w9, #4 // shl $4,%eax
+ add x2, x2, #16 // lea 16(%rdx,%rax),%rdx
+ add x2, x2, x9
+
+ mov w3, #1 // mov $1,%ecx
+ lsr w8, w1, #1 // shr $1,%r8d
+ and x8, x8, #32 // and $32,%r8d
+ eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32
+ bl _vpaes_schedule_core
+
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.globl vpaes_cbc_encrypt
+
+.def vpaes_cbc_encrypt
+ .type 32
+.endef
+.align 4
+vpaes_cbc_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ cbz x2, Lcbc_abort
+ cmp w5, #0 // check direction
+ b.eq vpaes_cbc_decrypt
+
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ mov x17, x2 // reassign
+ mov x2, x3 // reassign
+
+ ld1 {v0.16b}, [x4] // load ivec
+ bl _vpaes_encrypt_preheat
+ b Lcbc_enc_loop
+
+.align 4
+Lcbc_enc_loop:
+ ld1 {v7.16b}, [x0],#16 // load input
+ eor v7.16b, v7.16b, v0.16b // xor with ivec
+ bl _vpaes_encrypt_core
+ st1 {v0.16b}, [x1],#16 // save output
+ subs x17, x17, #16
+ b.hi Lcbc_enc_loop
+
+ st1 {v0.16b}, [x4] // write ivec
+
+ ldp x29,x30,[sp],#16
+Lcbc_abort:
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.def vpaes_cbc_decrypt
+ .type 32
+.endef
+.align 4
+vpaes_cbc_decrypt:
+ // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
+ // only from vpaes_cbc_encrypt which has already signed the return address.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
+
+ mov x17, x2 // reassign
+ mov x2, x3 // reassign
+ ld1 {v6.16b}, [x4] // load ivec
+ bl _vpaes_decrypt_preheat
+ tst x17, #16
+ b.eq Lcbc_dec_loop2x
+
+ ld1 {v7.16b}, [x0], #16 // load input
+ bl _vpaes_decrypt_core
+ eor v0.16b, v0.16b, v6.16b // xor with ivec
+ orr v6.16b, v7.16b, v7.16b // next ivec value
+ st1 {v0.16b}, [x1], #16
+ subs x17, x17, #16
+ b.ls Lcbc_dec_done
+
+.align 4
+Lcbc_dec_loop2x:
+ ld1 {v14.16b,v15.16b}, [x0], #32
+ bl _vpaes_decrypt_2x
+ eor v0.16b, v0.16b, v6.16b // xor with ivec
+ eor v1.16b, v1.16b, v14.16b
+ orr v6.16b, v15.16b, v15.16b
+ st1 {v0.16b,v1.16b}, [x1], #32
+ subs x17, x17, #32
+ b.hi Lcbc_dec_loop2x
+
+Lcbc_dec_done:
+ st1 {v6.16b}, [x4]
+
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.globl vpaes_ctr32_encrypt_blocks
+
+.def vpaes_ctr32_encrypt_blocks
+ .type 32
+.endef
+.align 4
+vpaes_ctr32_encrypt_blocks:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
+
+ cbz x2, Lctr32_done
+
+ // Note, unlike the other functions, x2 here is measured in blocks,
+ // not bytes.
+ mov x17, x2
+ mov x2, x3
+
+ // Load the IV and counter portion.
+ ldr w6, [x4, #12]
+ ld1 {v7.16b}, [x4]
+
+ bl _vpaes_encrypt_preheat
+ tst x17, #1
+ rev w6, w6 // The counter is big-endian.
+ b.eq Lctr32_prep_loop
+
+ // Handle one block so the remaining block count is even for
+ // _vpaes_encrypt_2x.
+ ld1 {v6.16b}, [x0], #16 // Load input ahead of time
+ bl _vpaes_encrypt_core
+ eor v0.16b, v0.16b, v6.16b // XOR input and result
+ st1 {v0.16b}, [x1], #16
+ subs x17, x17, #1
+ // Update the counter.
+ add w6, w6, #1
+ rev w7, w6
+ mov v7.s[3], w7
+ b.ls Lctr32_done
+
+Lctr32_prep_loop:
+ // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
+ // uses v14 and v15.
+ mov v15.16b, v7.16b
+ mov v14.16b, v7.16b
+ add w6, w6, #1
+ rev w7, w6
+ mov v15.s[3], w7
+
+Lctr32_loop:
+ ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time
+ bl _vpaes_encrypt_2x
+ eor v0.16b, v0.16b, v6.16b // XOR input and result
+ eor v1.16b, v1.16b, v7.16b // XOR input and result (#2)
+ st1 {v0.16b,v1.16b}, [x1], #32
+ subs x17, x17, #2
+ // Update the counter.
+ add w7, w6, #1
+ add w6, w6, #2
+ rev w7, w7
+ mov v14.s[3], w7
+ rev w7, w6
+ mov v15.s[3], w7
+ b.hi Lctr32_loop
+
+Lctr32_done:
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/bcm/vpaes-x86-apple.S b/gen/bcm/vpaes-x86-apple.S
new file mode 100644
index 0000000..4d2c485
--- /dev/null
+++ b/gen/bcm/vpaes-x86-apple.S
@@ -0,0 +1,680 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+#ifdef BORINGSSL_DISPATCH_TEST
+#endif
+.align 6,0x90
+L_vpaes_consts:
+.long 218628480,235210255,168496130,67568393
+.long 252381056,17041926,33884169,51187212
+.long 252645135,252645135,252645135,252645135
+.long 1512730624,3266504856,1377990664,3401244816
+.long 830229760,1275146365,2969422977,3447763452
+.long 3411033600,2979783055,338359620,2782886510
+.long 4209124096,907596821,221174255,1006095553
+.long 191964160,3799684038,3164090317,1589111125
+.long 182528256,1777043520,2877432650,3265356744
+.long 1874708224,3503451415,3305285752,363511674
+.long 1606117888,3487855781,1093350906,2384367825
+.long 197121,67569157,134941193,202313229
+.long 67569157,134941193,202313229,197121
+.long 134941193,202313229,197121,67569157
+.long 202313229,197121,67569157,134941193
+.long 33619971,100992007,168364043,235736079
+.long 235736079,33619971,100992007,168364043
+.long 168364043,235736079,33619971,100992007
+.long 100992007,168364043,235736079,33619971
+.long 50462976,117835012,185207048,252579084
+.long 252314880,51251460,117574920,184942860
+.long 184682752,252054788,50987272,118359308
+.long 118099200,185467140,251790600,50727180
+.long 2946363062,528716217,1300004225,1881839624
+.long 1532713819,1532713819,1532713819,1532713819
+.long 3602276352,4288629033,3737020424,4153884961
+.long 1354558464,32357713,2958822624,3775749553
+.long 1201988352,132424512,1572796698,503232858
+.long 2213177600,1597421020,4103937655,675398315
+.long 2749646592,4273543773,1511898873,121693092
+.long 3040248576,1103263732,2871565598,1608280554
+.long 2236667136,2588920351,482954393,64377734
+.long 3069987328,291237287,2117370568,3650299247
+.long 533321216,3573750986,2572112006,1401264716
+.long 1339849704,2721158661,548607111,3445553514
+.long 2128193280,3054596040,2183486460,1257083700
+.long 655635200,1165381986,3923443150,2344132524
+.long 190078720,256924420,290342170,357187870
+.long 1610966272,2263057382,4103205268,309794674
+.long 2592527872,2233205587,1335446729,3402964816
+.long 3973531904,3225098121,3002836325,1918774430
+.long 3870401024,2102906079,2284471353,4117666579
+.long 617007872,1021508343,366931923,691083277
+.long 2528395776,3491914898,2968704004,1613121270
+.long 3445188352,3247741094,844474987,4093578302
+.long 651481088,1190302358,1689581232,574775300
+.long 4289380608,206939853,2555985458,2489840491
+.long 2130264064,327674451,3566485037,3349835193
+.long 2470714624,316102159,3636825756,3393945945
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
+.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
+.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
+.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
+.byte 118,101,114,115,105,116,121,41,0
+.align 6,0x90
+.private_extern __vpaes_preheat
+.align 4
+__vpaes_preheat:
+ addl (%esp),%ebp
+ movdqa -48(%ebp),%xmm7
+ movdqa -16(%ebp),%xmm6
+ ret
+.private_extern __vpaes_encrypt_core
+.align 4
+__vpaes_encrypt_core:
+ movl $16,%ecx
+ movl 240(%edx),%eax
+ movdqa %xmm6,%xmm1
+ movdqa (%ebp),%xmm2
+ pandn %xmm0,%xmm1
+ pand %xmm6,%xmm0
+ movdqu (%edx),%xmm5
+.byte 102,15,56,0,208
+ movdqa 16(%ebp),%xmm0
+ pxor %xmm5,%xmm2
+ psrld $4,%xmm1
+ addl $16,%edx
+.byte 102,15,56,0,193
+ leal 192(%ebp),%ebx
+ pxor %xmm2,%xmm0
+ jmp L000enc_entry
+.align 4,0x90
+L001enc_loop:
+ movdqa 32(%ebp),%xmm4
+ movdqa 48(%ebp),%xmm0
+.byte 102,15,56,0,226
+.byte 102,15,56,0,195
+ pxor %xmm5,%xmm4
+ movdqa 64(%ebp),%xmm5
+ pxor %xmm4,%xmm0
+ movdqa -64(%ebx,%ecx,1),%xmm1
+.byte 102,15,56,0,234
+ movdqa 80(%ebp),%xmm2
+ movdqa (%ebx,%ecx,1),%xmm4
+.byte 102,15,56,0,211
+ movdqa %xmm0,%xmm3
+ pxor %xmm5,%xmm2
+.byte 102,15,56,0,193
+ addl $16,%edx
+ pxor %xmm2,%xmm0
+.byte 102,15,56,0,220
+ addl $16,%ecx
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,193
+ andl $48,%ecx
+ subl $1,%eax
+ pxor %xmm3,%xmm0
+L000enc_entry:
+ movdqa %xmm6,%xmm1
+ movdqa -32(%ebp),%xmm5
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm6,%xmm0
+.byte 102,15,56,0,232
+ movdqa %xmm7,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,217
+ movdqa %xmm7,%xmm4
+ pxor %xmm5,%xmm3
+.byte 102,15,56,0,224
+ movdqa %xmm7,%xmm2
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,211
+ movdqa %xmm7,%xmm3
+ pxor %xmm0,%xmm2
+.byte 102,15,56,0,220
+ movdqu (%edx),%xmm5
+ pxor %xmm1,%xmm3
+ jnz L001enc_loop
+ movdqa 96(%ebp),%xmm4
+ movdqa 112(%ebp),%xmm0
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,195
+ movdqa 64(%ebx,%ecx,1),%xmm1
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,193
+ ret
+.private_extern __vpaes_decrypt_core
+.align 4
+__vpaes_decrypt_core:
+ leal 608(%ebp),%ebx
+ movl 240(%edx),%eax
+ movdqa %xmm6,%xmm1
+ movdqa -64(%ebx),%xmm2
+ pandn %xmm0,%xmm1
+ movl %eax,%ecx
+ psrld $4,%xmm1
+ movdqu (%edx),%xmm5
+ shll $4,%ecx
+ pand %xmm6,%xmm0
+.byte 102,15,56,0,208
+ movdqa -48(%ebx),%xmm0
+ xorl $48,%ecx
+.byte 102,15,56,0,193
+ andl $48,%ecx
+ pxor %xmm5,%xmm2
+ movdqa 176(%ebp),%xmm5
+ pxor %xmm2,%xmm0
+ addl $16,%edx
+ leal -352(%ebx,%ecx,1),%ecx
+ jmp L002dec_entry
+.align 4,0x90
+L003dec_loop:
+ movdqa -32(%ebx),%xmm4
+ movdqa -16(%ebx),%xmm1
+.byte 102,15,56,0,226
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa (%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 16(%ebx),%xmm1
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa 32(%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 48(%ebx),%xmm1
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa 64(%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 80(%ebx),%xmm1
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ addl $16,%edx
+.byte 102,15,58,15,237,12
+ pxor %xmm1,%xmm0
+ subl $1,%eax
+L002dec_entry:
+ movdqa %xmm6,%xmm1
+ movdqa -32(%ebp),%xmm2
+ pandn %xmm0,%xmm1
+ pand %xmm6,%xmm0
+ psrld $4,%xmm1
+.byte 102,15,56,0,208
+ movdqa %xmm7,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,217
+ movdqa %xmm7,%xmm4
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm7,%xmm2
+.byte 102,15,56,0,211
+ movdqa %xmm7,%xmm3
+ pxor %xmm0,%xmm2
+.byte 102,15,56,0,220
+ movdqu (%edx),%xmm0
+ pxor %xmm1,%xmm3
+ jnz L003dec_loop
+ movdqa 96(%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 112(%ebx),%xmm0
+ movdqa (%ecx),%xmm2
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,194
+ ret
+.private_extern __vpaes_schedule_core
+.align 4
+__vpaes_schedule_core:
+ addl (%esp),%ebp
+ movdqu (%esi),%xmm0
+ movdqa 320(%ebp),%xmm2
+ movdqa %xmm0,%xmm3
+ leal (%ebp),%ebx
+ movdqa %xmm2,4(%esp)
+ call __vpaes_schedule_transform
+ movdqa %xmm0,%xmm7
+ testl %edi,%edi
+ jnz L004schedule_am_decrypting
+ movdqu %xmm0,(%edx)
+ jmp L005schedule_go
+L004schedule_am_decrypting:
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,217
+ movdqu %xmm3,(%edx)
+ xorl $48,%ecx
+L005schedule_go:
+ cmpl $192,%eax
+ ja L006schedule_256
+ je L007schedule_192
+L008schedule_128:
+ movl $10,%eax
+L009loop_schedule_128:
+ call __vpaes_schedule_round
+ decl %eax
+ jz L010schedule_mangle_last
+ call __vpaes_schedule_mangle
+ jmp L009loop_schedule_128
+.align 4,0x90
+L007schedule_192:
+ movdqu 8(%esi),%xmm0
+ call __vpaes_schedule_transform
+ movdqa %xmm0,%xmm6
+ pxor %xmm4,%xmm4
+ movhlps %xmm4,%xmm6
+ movl $4,%eax
+L011loop_schedule_192:
+ call __vpaes_schedule_round
+.byte 102,15,58,15,198,8
+ call __vpaes_schedule_mangle
+ call __vpaes_schedule_192_smear
+ call __vpaes_schedule_mangle
+ call __vpaes_schedule_round
+ decl %eax
+ jz L010schedule_mangle_last
+ call __vpaes_schedule_mangle
+ call __vpaes_schedule_192_smear
+ jmp L011loop_schedule_192
+.align 4,0x90
+L006schedule_256:
+ movdqu 16(%esi),%xmm0
+ call __vpaes_schedule_transform
+ movl $7,%eax
+L012loop_schedule_256:
+ call __vpaes_schedule_mangle
+ movdqa %xmm0,%xmm6
+ call __vpaes_schedule_round
+ decl %eax
+ jz L010schedule_mangle_last
+ call __vpaes_schedule_mangle
+ pshufd $255,%xmm0,%xmm0
+ movdqa %xmm7,20(%esp)
+ movdqa %xmm6,%xmm7
+ call L_vpaes_schedule_low_round
+ movdqa 20(%esp),%xmm7
+ jmp L012loop_schedule_256
+.align 4,0x90
+L010schedule_mangle_last:
+ leal 384(%ebp),%ebx
+ testl %edi,%edi
+ jnz L013schedule_mangle_last_dec
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,193
+ leal 352(%ebp),%ebx
+ addl $32,%edx
+L013schedule_mangle_last_dec:
+ addl $-16,%edx
+ pxor 336(%ebp),%xmm0
+ call __vpaes_schedule_transform
+ movdqu %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ ret
+.private_extern __vpaes_schedule_192_smear
+.align 4
+__vpaes_schedule_192_smear:
+ pshufd $128,%xmm6,%xmm1
+ pshufd $254,%xmm7,%xmm0
+ pxor %xmm1,%xmm6
+ pxor %xmm1,%xmm1
+ pxor %xmm0,%xmm6
+ movdqa %xmm6,%xmm0
+ movhlps %xmm1,%xmm6
+ ret
+.private_extern __vpaes_schedule_round
+.align 4
+__vpaes_schedule_round:
+ movdqa 8(%esp),%xmm2
+ pxor %xmm1,%xmm1
+.byte 102,15,58,15,202,15
+.byte 102,15,58,15,210,15
+ pxor %xmm1,%xmm7
+ pshufd $255,%xmm0,%xmm0
+.byte 102,15,58,15,192,1
+ movdqa %xmm2,8(%esp)
+L_vpaes_schedule_low_round:
+ movdqa %xmm7,%xmm1
+ pslldq $4,%xmm7
+ pxor %xmm1,%xmm7
+ movdqa %xmm7,%xmm1
+ pslldq $8,%xmm7
+ pxor %xmm1,%xmm7
+ pxor 336(%ebp),%xmm7
+ movdqa -16(%ebp),%xmm4
+ movdqa -48(%ebp),%xmm5
+ movdqa %xmm4,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm4,%xmm0
+ movdqa -32(%ebp),%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm5,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm5,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm5,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm5,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqa 32(%ebp),%xmm4
+.byte 102,15,56,0,226
+ movdqa 48(%ebp),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ pxor %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ ret
+.private_extern __vpaes_schedule_transform
+.align 4
+__vpaes_schedule_transform:
+ movdqa -16(%ebp),%xmm2
+ movdqa %xmm2,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm0
+ movdqa (%ebx),%xmm2
+.byte 102,15,56,0,208
+ movdqa 16(%ebx),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm2,%xmm0
+ ret
+.private_extern __vpaes_schedule_mangle
+.align 4
+__vpaes_schedule_mangle:
+ movdqa %xmm0,%xmm4
+ movdqa 128(%ebp),%xmm5
+ testl %edi,%edi
+ jnz L014schedule_mangle_dec
+ addl $16,%edx
+ pxor 336(%ebp),%xmm4
+.byte 102,15,56,0,229
+ movdqa %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+ jmp L015schedule_mangle_both
+.align 4,0x90
+L014schedule_mangle_dec:
+ movdqa -16(%ebp),%xmm2
+ leal 416(%ebp),%esi
+ movdqa %xmm2,%xmm1
+ pandn %xmm4,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm4
+ movdqa (%esi),%xmm2
+.byte 102,15,56,0,212
+ movdqa 16(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 32(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 48(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 64(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 80(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 96(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 112(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ addl $-16,%edx
+L015schedule_mangle_both:
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,217
+ addl $-16,%ecx
+ andl $48,%ecx
+ movdqu %xmm3,(%edx)
+ ret
+.globl _vpaes_set_encrypt_key
+.private_extern _vpaes_set_encrypt_key
+.align 4
+_vpaes_set_encrypt_key:
+L_vpaes_set_encrypt_key_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+#ifdef BORINGSSL_DISPATCH_TEST
+ pushl %ebx
+ pushl %edx
+ call L016pic
+L016pic:
+ popl %ebx
+ leal _BORINGSSL_function_hit+5-L016pic(%ebx),%ebx
+ movl $1,%edx
+ movb %dl,(%ebx)
+ popl %edx
+ popl %ebx
+#endif
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%eax
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movl %eax,%ebx
+ shrl $5,%ebx
+ addl $5,%ebx
+ movl %ebx,240(%edx)
+ movl $48,%ecx
+ movl $0,%edi
+ leal L_vpaes_consts+0x30-L017pic_point,%ebp
+ call __vpaes_schedule_core
+L017pic_point:
+ movl 48(%esp),%esp
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _vpaes_set_decrypt_key
+.private_extern _vpaes_set_decrypt_key
+.align 4
+_vpaes_set_decrypt_key:
+L_vpaes_set_decrypt_key_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%eax
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movl %eax,%ebx
+ shrl $5,%ebx
+ addl $5,%ebx
+ movl %ebx,240(%edx)
+ shll $4,%ebx
+ leal 16(%edx,%ebx,1),%edx
+ movl $1,%edi
+ movl %eax,%ecx
+ shrl $1,%ecx
+ andl $32,%ecx
+ xorl $32,%ecx
+ leal L_vpaes_consts+0x30-L018pic_point,%ebp
+ call __vpaes_schedule_core
+L018pic_point:
+ movl 48(%esp),%esp
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _vpaes_encrypt
+.private_extern _vpaes_encrypt
+.align 4
+_vpaes_encrypt:
+L_vpaes_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+#ifdef BORINGSSL_DISPATCH_TEST
+ pushl %ebx
+ pushl %edx
+ call L019pic
+L019pic:
+ popl %ebx
+ leal _BORINGSSL_function_hit+4-L019pic(%ebx),%ebx
+ movl $1,%edx
+ movb %dl,(%ebx)
+ popl %edx
+ popl %ebx
+#endif
+ leal L_vpaes_consts+0x30-L020pic_point,%ebp
+ call __vpaes_preheat
+L020pic_point:
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%edi
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movdqu (%esi),%xmm0
+ call __vpaes_encrypt_core
+ movdqu %xmm0,(%edi)
+ movl 48(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _vpaes_decrypt
+.private_extern _vpaes_decrypt
+.align 4
+_vpaes_decrypt:
+L_vpaes_decrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ leal L_vpaes_consts+0x30-L021pic_point,%ebp
+ call __vpaes_preheat
+L021pic_point:
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%edi
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movdqu (%esi),%xmm0
+ call __vpaes_decrypt_core
+ movdqu %xmm0,(%edi)
+ movl 48(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _vpaes_cbc_encrypt
+.private_extern _vpaes_cbc_encrypt
+.align 4
+_vpaes_cbc_encrypt:
+L_vpaes_cbc_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ subl $16,%eax
+ jc L022cbc_abort
+ leal -56(%esp),%ebx
+ movl 36(%esp),%ebp
+ andl $-16,%ebx
+ movl 40(%esp),%ecx
+ xchgl %esp,%ebx
+ movdqu (%ebp),%xmm1
+ subl %esi,%edi
+ movl %ebx,48(%esp)
+ movl %edi,(%esp)
+ movl %edx,4(%esp)
+ movl %ebp,8(%esp)
+ movl %eax,%edi
+ leal L_vpaes_consts+0x30-L023pic_point,%ebp
+ call __vpaes_preheat
+L023pic_point:
+ cmpl $0,%ecx
+ je L024cbc_dec_loop
+ jmp L025cbc_enc_loop
+.align 4,0x90
+L025cbc_enc_loop:
+ movdqu (%esi),%xmm0
+ pxor %xmm1,%xmm0
+ call __vpaes_encrypt_core
+ movl (%esp),%ebx
+ movl 4(%esp),%edx
+ movdqa %xmm0,%xmm1
+ movdqu %xmm0,(%ebx,%esi,1)
+ leal 16(%esi),%esi
+ subl $16,%edi
+ jnc L025cbc_enc_loop
+ jmp L026cbc_done
+.align 4,0x90
+L024cbc_dec_loop:
+ movdqu (%esi),%xmm0
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm0,32(%esp)
+ call __vpaes_decrypt_core
+ movl (%esp),%ebx
+ movl 4(%esp),%edx
+ pxor 16(%esp),%xmm0
+ movdqa 32(%esp),%xmm1
+ movdqu %xmm0,(%ebx,%esi,1)
+ leal 16(%esi),%esi
+ subl $16,%edi
+ jnc L024cbc_dec_loop
+L026cbc_done:
+ movl 8(%esp),%ebx
+ movl 48(%esp),%esp
+ movdqu %xmm1,(%ebx)
+L022cbc_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/vpaes-x86-linux.S b/gen/bcm/vpaes-x86-linux.S
new file mode 100644
index 0000000..02786a7
--- /dev/null
+++ b/gen/bcm/vpaes-x86-linux.S
@@ -0,0 +1,706 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+#ifdef BORINGSSL_DISPATCH_TEST
+#endif
+.align 64
+.L_vpaes_consts:
+.long 218628480,235210255,168496130,67568393
+.long 252381056,17041926,33884169,51187212
+.long 252645135,252645135,252645135,252645135
+.long 1512730624,3266504856,1377990664,3401244816
+.long 830229760,1275146365,2969422977,3447763452
+.long 3411033600,2979783055,338359620,2782886510
+.long 4209124096,907596821,221174255,1006095553
+.long 191964160,3799684038,3164090317,1589111125
+.long 182528256,1777043520,2877432650,3265356744
+.long 1874708224,3503451415,3305285752,363511674
+.long 1606117888,3487855781,1093350906,2384367825
+.long 197121,67569157,134941193,202313229
+.long 67569157,134941193,202313229,197121
+.long 134941193,202313229,197121,67569157
+.long 202313229,197121,67569157,134941193
+.long 33619971,100992007,168364043,235736079
+.long 235736079,33619971,100992007,168364043
+.long 168364043,235736079,33619971,100992007
+.long 100992007,168364043,235736079,33619971
+.long 50462976,117835012,185207048,252579084
+.long 252314880,51251460,117574920,184942860
+.long 184682752,252054788,50987272,118359308
+.long 118099200,185467140,251790600,50727180
+.long 2946363062,528716217,1300004225,1881839624
+.long 1532713819,1532713819,1532713819,1532713819
+.long 3602276352,4288629033,3737020424,4153884961
+.long 1354558464,32357713,2958822624,3775749553
+.long 1201988352,132424512,1572796698,503232858
+.long 2213177600,1597421020,4103937655,675398315
+.long 2749646592,4273543773,1511898873,121693092
+.long 3040248576,1103263732,2871565598,1608280554
+.long 2236667136,2588920351,482954393,64377734
+.long 3069987328,291237287,2117370568,3650299247
+.long 533321216,3573750986,2572112006,1401264716
+.long 1339849704,2721158661,548607111,3445553514
+.long 2128193280,3054596040,2183486460,1257083700
+.long 655635200,1165381986,3923443150,2344132524
+.long 190078720,256924420,290342170,357187870
+.long 1610966272,2263057382,4103205268,309794674
+.long 2592527872,2233205587,1335446729,3402964816
+.long 3973531904,3225098121,3002836325,1918774430
+.long 3870401024,2102906079,2284471353,4117666579
+.long 617007872,1021508343,366931923,691083277
+.long 2528395776,3491914898,2968704004,1613121270
+.long 3445188352,3247741094,844474987,4093578302
+.long 651481088,1190302358,1689581232,574775300
+.long 4289380608,206939853,2555985458,2489840491
+.long 2130264064,327674451,3566485037,3349835193
+.long 2470714624,316102159,3636825756,3393945945
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
+.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
+.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
+.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
+.byte 118,101,114,115,105,116,121,41,0
+.align 64
+.hidden _vpaes_preheat
+.type _vpaes_preheat,@function
+.align 16
+_vpaes_preheat:
+ addl (%esp),%ebp
+ movdqa -48(%ebp),%xmm7
+ movdqa -16(%ebp),%xmm6
+ ret
+.size _vpaes_preheat,.-_vpaes_preheat
+.hidden _vpaes_encrypt_core
+.type _vpaes_encrypt_core,@function
+.align 16
+_vpaes_encrypt_core:
+ movl $16,%ecx
+ movl 240(%edx),%eax
+ movdqa %xmm6,%xmm1
+ movdqa (%ebp),%xmm2
+ pandn %xmm0,%xmm1
+ pand %xmm6,%xmm0
+ movdqu (%edx),%xmm5
+.byte 102,15,56,0,208
+ movdqa 16(%ebp),%xmm0
+ pxor %xmm5,%xmm2
+ psrld $4,%xmm1
+ addl $16,%edx
+.byte 102,15,56,0,193
+ leal 192(%ebp),%ebx
+ pxor %xmm2,%xmm0
+ jmp .L000enc_entry
+.align 16
+.L001enc_loop:
+ movdqa 32(%ebp),%xmm4
+ movdqa 48(%ebp),%xmm0
+.byte 102,15,56,0,226
+.byte 102,15,56,0,195
+ pxor %xmm5,%xmm4
+ movdqa 64(%ebp),%xmm5
+ pxor %xmm4,%xmm0
+ movdqa -64(%ebx,%ecx,1),%xmm1
+.byte 102,15,56,0,234
+ movdqa 80(%ebp),%xmm2
+ movdqa (%ebx,%ecx,1),%xmm4
+.byte 102,15,56,0,211
+ movdqa %xmm0,%xmm3
+ pxor %xmm5,%xmm2
+.byte 102,15,56,0,193
+ addl $16,%edx
+ pxor %xmm2,%xmm0
+.byte 102,15,56,0,220
+ addl $16,%ecx
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,193
+ andl $48,%ecx
+ subl $1,%eax
+ pxor %xmm3,%xmm0
+.L000enc_entry:
+ movdqa %xmm6,%xmm1
+ movdqa -32(%ebp),%xmm5
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm6,%xmm0
+.byte 102,15,56,0,232
+ movdqa %xmm7,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,217
+ movdqa %xmm7,%xmm4
+ pxor %xmm5,%xmm3
+.byte 102,15,56,0,224
+ movdqa %xmm7,%xmm2
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,211
+ movdqa %xmm7,%xmm3
+ pxor %xmm0,%xmm2
+.byte 102,15,56,0,220
+ movdqu (%edx),%xmm5
+ pxor %xmm1,%xmm3
+ jnz .L001enc_loop
+ movdqa 96(%ebp),%xmm4
+ movdqa 112(%ebp),%xmm0
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,195
+ movdqa 64(%ebx,%ecx,1),%xmm1
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,193
+ ret
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+.hidden _vpaes_decrypt_core
+.type _vpaes_decrypt_core,@function
+.align 16
+_vpaes_decrypt_core:
+ leal 608(%ebp),%ebx
+ movl 240(%edx),%eax
+ movdqa %xmm6,%xmm1
+ movdqa -64(%ebx),%xmm2
+ pandn %xmm0,%xmm1
+ movl %eax,%ecx
+ psrld $4,%xmm1
+ movdqu (%edx),%xmm5
+ shll $4,%ecx
+ pand %xmm6,%xmm0
+.byte 102,15,56,0,208
+ movdqa -48(%ebx),%xmm0
+ xorl $48,%ecx
+.byte 102,15,56,0,193
+ andl $48,%ecx
+ pxor %xmm5,%xmm2
+ movdqa 176(%ebp),%xmm5
+ pxor %xmm2,%xmm0
+ addl $16,%edx
+ leal -352(%ebx,%ecx,1),%ecx
+ jmp .L002dec_entry
+.align 16
+.L003dec_loop:
+ movdqa -32(%ebx),%xmm4
+ movdqa -16(%ebx),%xmm1
+.byte 102,15,56,0,226
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa (%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 16(%ebx),%xmm1
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa 32(%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 48(%ebx),%xmm1
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa 64(%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 80(%ebx),%xmm1
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ addl $16,%edx
+.byte 102,15,58,15,237,12
+ pxor %xmm1,%xmm0
+ subl $1,%eax
+.L002dec_entry:
+ movdqa %xmm6,%xmm1
+ movdqa -32(%ebp),%xmm2
+ pandn %xmm0,%xmm1
+ pand %xmm6,%xmm0
+ psrld $4,%xmm1
+.byte 102,15,56,0,208
+ movdqa %xmm7,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,217
+ movdqa %xmm7,%xmm4
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm7,%xmm2
+.byte 102,15,56,0,211
+ movdqa %xmm7,%xmm3
+ pxor %xmm0,%xmm2
+.byte 102,15,56,0,220
+ movdqu (%edx),%xmm0
+ pxor %xmm1,%xmm3
+ jnz .L003dec_loop
+ movdqa 96(%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 112(%ebx),%xmm0
+ movdqa (%ecx),%xmm2
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,194
+ ret
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+.hidden _vpaes_schedule_core
+.type _vpaes_schedule_core,@function
+.align 16
+_vpaes_schedule_core:
+ addl (%esp),%ebp
+ movdqu (%esi),%xmm0
+ movdqa 320(%ebp),%xmm2
+ movdqa %xmm0,%xmm3
+ leal (%ebp),%ebx
+ movdqa %xmm2,4(%esp)
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm7
+ testl %edi,%edi
+ jnz .L004schedule_am_decrypting
+ movdqu %xmm0,(%edx)
+ jmp .L005schedule_go
+.L004schedule_am_decrypting:
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,217
+ movdqu %xmm3,(%edx)
+ xorl $48,%ecx
+.L005schedule_go:
+ cmpl $192,%eax
+ ja .L006schedule_256
+ je .L007schedule_192
+.L008schedule_128:
+ movl $10,%eax
+.L009loop_schedule_128:
+ call _vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call _vpaes_schedule_mangle
+ jmp .L009loop_schedule_128
+.align 16
+.L007schedule_192:
+ movdqu 8(%esi),%xmm0
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm6
+ pxor %xmm4,%xmm4
+ movhlps %xmm4,%xmm6
+ movl $4,%eax
+.L011loop_schedule_192:
+ call _vpaes_schedule_round
+.byte 102,15,58,15,198,8
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ jmp .L011loop_schedule_192
+.align 16
+.L006schedule_256:
+ movdqu 16(%esi),%xmm0
+ call _vpaes_schedule_transform
+ movl $7,%eax
+.L012loop_schedule_256:
+ call _vpaes_schedule_mangle
+ movdqa %xmm0,%xmm6
+ call _vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call _vpaes_schedule_mangle
+ pshufd $255,%xmm0,%xmm0
+ movdqa %xmm7,20(%esp)
+ movdqa %xmm6,%xmm7
+ call .L_vpaes_schedule_low_round
+ movdqa 20(%esp),%xmm7
+ jmp .L012loop_schedule_256
+.align 16
+.L010schedule_mangle_last:
+ leal 384(%ebp),%ebx
+ testl %edi,%edi
+ jnz .L013schedule_mangle_last_dec
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,193
+ leal 352(%ebp),%ebx
+ addl $32,%edx
+.L013schedule_mangle_last_dec:
+ addl $-16,%edx
+ pxor 336(%ebp),%xmm0
+ call _vpaes_schedule_transform
+ movdqu %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ ret
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+.hidden _vpaes_schedule_192_smear
+.type _vpaes_schedule_192_smear,@function
+.align 16
+_vpaes_schedule_192_smear:
+ pshufd $128,%xmm6,%xmm1
+ pshufd $254,%xmm7,%xmm0
+ pxor %xmm1,%xmm6
+ pxor %xmm1,%xmm1
+ pxor %xmm0,%xmm6
+ movdqa %xmm6,%xmm0
+ movhlps %xmm1,%xmm6
+ ret
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+.hidden _vpaes_schedule_round
+.type _vpaes_schedule_round,@function
+.align 16
+_vpaes_schedule_round:
+ movdqa 8(%esp),%xmm2
+ pxor %xmm1,%xmm1
+.byte 102,15,58,15,202,15
+.byte 102,15,58,15,210,15
+ pxor %xmm1,%xmm7
+ pshufd $255,%xmm0,%xmm0
+.byte 102,15,58,15,192,1
+ movdqa %xmm2,8(%esp)
+.L_vpaes_schedule_low_round:
+ movdqa %xmm7,%xmm1
+ pslldq $4,%xmm7
+ pxor %xmm1,%xmm7
+ movdqa %xmm7,%xmm1
+ pslldq $8,%xmm7
+ pxor %xmm1,%xmm7
+ pxor 336(%ebp),%xmm7
+ movdqa -16(%ebp),%xmm4
+ movdqa -48(%ebp),%xmm5
+ movdqa %xmm4,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm4,%xmm0
+ movdqa -32(%ebp),%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm5,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm5,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm5,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm5,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqa 32(%ebp),%xmm4
+.byte 102,15,56,0,226
+ movdqa 48(%ebp),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ pxor %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ ret
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+.hidden _vpaes_schedule_transform
+.type _vpaes_schedule_transform,@function
+.align 16
+_vpaes_schedule_transform:
+ movdqa -16(%ebp),%xmm2
+ movdqa %xmm2,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm0
+ movdqa (%ebx),%xmm2
+.byte 102,15,56,0,208
+ movdqa 16(%ebx),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm2,%xmm0
+ ret
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+.hidden _vpaes_schedule_mangle
+.type _vpaes_schedule_mangle,@function
+.align 16
+_vpaes_schedule_mangle:
+ movdqa %xmm0,%xmm4
+ movdqa 128(%ebp),%xmm5
+ testl %edi,%edi
+ jnz .L014schedule_mangle_dec
+ addl $16,%edx
+ pxor 336(%ebp),%xmm4
+.byte 102,15,56,0,229
+ movdqa %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+ jmp .L015schedule_mangle_both
+.align 16
+.L014schedule_mangle_dec:
+ movdqa -16(%ebp),%xmm2
+ leal 416(%ebp),%esi
+ movdqa %xmm2,%xmm1
+ pandn %xmm4,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm4
+ movdqa (%esi),%xmm2
+.byte 102,15,56,0,212
+ movdqa 16(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 32(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 48(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 64(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 80(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 96(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 112(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ addl $-16,%edx
+.L015schedule_mangle_both:
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,217
+ addl $-16,%ecx
+ andl $48,%ecx
+ movdqu %xmm3,(%edx)
+ ret
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+.globl vpaes_set_encrypt_key
+.hidden vpaes_set_encrypt_key
+.type vpaes_set_encrypt_key,@function
+.align 16
+vpaes_set_encrypt_key:
+.L_vpaes_set_encrypt_key_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+#ifdef BORINGSSL_DISPATCH_TEST
+ pushl %ebx
+ pushl %edx
+ call .L016pic
+.L016pic:
+ popl %ebx
+ leal BORINGSSL_function_hit+5-.L016pic(%ebx),%ebx
+ movl $1,%edx
+ movb %dl,(%ebx)
+ popl %edx
+ popl %ebx
+#endif
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%eax
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movl %eax,%ebx
+ shrl $5,%ebx
+ addl $5,%ebx
+ movl %ebx,240(%edx)
+ movl $48,%ecx
+ movl $0,%edi
+ leal .L_vpaes_consts+0x30-.L017pic_point,%ebp
+ call _vpaes_schedule_core
+.L017pic_point:
+ movl 48(%esp),%esp
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_set_encrypt_key,.-.L_vpaes_set_encrypt_key_begin
+.globl vpaes_set_decrypt_key
+.hidden vpaes_set_decrypt_key
+.type vpaes_set_decrypt_key,@function
+.align 16
+vpaes_set_decrypt_key:
+.L_vpaes_set_decrypt_key_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%eax
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movl %eax,%ebx
+ shrl $5,%ebx
+ addl $5,%ebx
+ movl %ebx,240(%edx)
+ shll $4,%ebx
+ leal 16(%edx,%ebx,1),%edx
+ movl $1,%edi
+ movl %eax,%ecx
+ shrl $1,%ecx
+ andl $32,%ecx
+ xorl $32,%ecx
+ leal .L_vpaes_consts+0x30-.L018pic_point,%ebp
+ call _vpaes_schedule_core
+.L018pic_point:
+ movl 48(%esp),%esp
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_set_decrypt_key,.-.L_vpaes_set_decrypt_key_begin
+.globl vpaes_encrypt
+.hidden vpaes_encrypt
+.type vpaes_encrypt,@function
+.align 16
+vpaes_encrypt:
+.L_vpaes_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+#ifdef BORINGSSL_DISPATCH_TEST
+ pushl %ebx
+ pushl %edx
+ call .L019pic
+.L019pic:
+ popl %ebx
+ leal BORINGSSL_function_hit+4-.L019pic(%ebx),%ebx
+ movl $1,%edx
+ movb %dl,(%ebx)
+ popl %edx
+ popl %ebx
+#endif
+ leal .L_vpaes_consts+0x30-.L020pic_point,%ebp
+ call _vpaes_preheat
+.L020pic_point:
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%edi
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movdqu (%esi),%xmm0
+ call _vpaes_encrypt_core
+ movdqu %xmm0,(%edi)
+ movl 48(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_encrypt,.-.L_vpaes_encrypt_begin
+.globl vpaes_decrypt
+.hidden vpaes_decrypt
+.type vpaes_decrypt,@function
+.align 16
+vpaes_decrypt:
+.L_vpaes_decrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ leal .L_vpaes_consts+0x30-.L021pic_point,%ebp
+ call _vpaes_preheat
+.L021pic_point:
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%edi
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movdqu (%esi),%xmm0
+ call _vpaes_decrypt_core
+ movdqu %xmm0,(%edi)
+ movl 48(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_decrypt,.-.L_vpaes_decrypt_begin
+.globl vpaes_cbc_encrypt
+.hidden vpaes_cbc_encrypt
+.type vpaes_cbc_encrypt,@function
+.align 16
+vpaes_cbc_encrypt:
+.L_vpaes_cbc_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ subl $16,%eax
+ jc .L022cbc_abort
+ leal -56(%esp),%ebx
+ movl 36(%esp),%ebp
+ andl $-16,%ebx
+ movl 40(%esp),%ecx
+ xchgl %esp,%ebx
+ movdqu (%ebp),%xmm1
+ subl %esi,%edi
+ movl %ebx,48(%esp)
+ movl %edi,(%esp)
+ movl %edx,4(%esp)
+ movl %ebp,8(%esp)
+ movl %eax,%edi
+ leal .L_vpaes_consts+0x30-.L023pic_point,%ebp
+ call _vpaes_preheat
+.L023pic_point:
+ cmpl $0,%ecx
+ je .L024cbc_dec_loop
+ jmp .L025cbc_enc_loop
+.align 16
+.L025cbc_enc_loop:
+ movdqu (%esi),%xmm0
+ pxor %xmm1,%xmm0
+ call _vpaes_encrypt_core
+ movl (%esp),%ebx
+ movl 4(%esp),%edx
+ movdqa %xmm0,%xmm1
+ movdqu %xmm0,(%ebx,%esi,1)
+ leal 16(%esi),%esi
+ subl $16,%edi
+ jnc .L025cbc_enc_loop
+ jmp .L026cbc_done
+.align 16
+.L024cbc_dec_loop:
+ movdqu (%esi),%xmm0
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm0,32(%esp)
+ call _vpaes_decrypt_core
+ movl (%esp),%ebx
+ movl 4(%esp),%edx
+ pxor 16(%esp),%xmm0
+ movdqa 32(%esp),%xmm1
+ movdqu %xmm0,(%ebx,%esi,1)
+ leal 16(%esi),%esi
+ subl $16,%edi
+ jnc .L024cbc_dec_loop
+.L026cbc_done:
+ movl 8(%esp),%ebx
+ movl 48(%esp),%esp
+ movdqu %xmm1,(%ebx)
+.L022cbc_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/vpaes-x86-win.asm b/gen/bcm/vpaes-x86-win.asm
new file mode 100644
index 0000000..661496e
--- /dev/null
+++ b/gen/bcm/vpaes-x86-win.asm
@@ -0,0 +1,679 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section code use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section .text code align=64
+%else
+section .text code
+%endif
+%ifdef BORINGSSL_DISPATCH_TEST
+extern _BORINGSSL_function_hit
+%endif
+align 64
+L$_vpaes_consts:
+dd 218628480,235210255,168496130,67568393
+dd 252381056,17041926,33884169,51187212
+dd 252645135,252645135,252645135,252645135
+dd 1512730624,3266504856,1377990664,3401244816
+dd 830229760,1275146365,2969422977,3447763452
+dd 3411033600,2979783055,338359620,2782886510
+dd 4209124096,907596821,221174255,1006095553
+dd 191964160,3799684038,3164090317,1589111125
+dd 182528256,1777043520,2877432650,3265356744
+dd 1874708224,3503451415,3305285752,363511674
+dd 1606117888,3487855781,1093350906,2384367825
+dd 197121,67569157,134941193,202313229
+dd 67569157,134941193,202313229,197121
+dd 134941193,202313229,197121,67569157
+dd 202313229,197121,67569157,134941193
+dd 33619971,100992007,168364043,235736079
+dd 235736079,33619971,100992007,168364043
+dd 168364043,235736079,33619971,100992007
+dd 100992007,168364043,235736079,33619971
+dd 50462976,117835012,185207048,252579084
+dd 252314880,51251460,117574920,184942860
+dd 184682752,252054788,50987272,118359308
+dd 118099200,185467140,251790600,50727180
+dd 2946363062,528716217,1300004225,1881839624
+dd 1532713819,1532713819,1532713819,1532713819
+dd 3602276352,4288629033,3737020424,4153884961
+dd 1354558464,32357713,2958822624,3775749553
+dd 1201988352,132424512,1572796698,503232858
+dd 2213177600,1597421020,4103937655,675398315
+dd 2749646592,4273543773,1511898873,121693092
+dd 3040248576,1103263732,2871565598,1608280554
+dd 2236667136,2588920351,482954393,64377734
+dd 3069987328,291237287,2117370568,3650299247
+dd 533321216,3573750986,2572112006,1401264716
+dd 1339849704,2721158661,548607111,3445553514
+dd 2128193280,3054596040,2183486460,1257083700
+dd 655635200,1165381986,3923443150,2344132524
+dd 190078720,256924420,290342170,357187870
+dd 1610966272,2263057382,4103205268,309794674
+dd 2592527872,2233205587,1335446729,3402964816
+dd 3973531904,3225098121,3002836325,1918774430
+dd 3870401024,2102906079,2284471353,4117666579
+dd 617007872,1021508343,366931923,691083277
+dd 2528395776,3491914898,2968704004,1613121270
+dd 3445188352,3247741094,844474987,4093578302
+dd 651481088,1190302358,1689581232,574775300
+dd 4289380608,206939853,2555985458,2489840491
+dd 2130264064,327674451,3566485037,3349835193
+dd 2470714624,316102159,3636825756,3393945945
+db 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
+db 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
+db 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
+db 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
+db 118,101,114,115,105,116,121,41,0
+align 64
+align 16
+__vpaes_preheat:
+ add ebp,DWORD [esp]
+ movdqa xmm7,[ebp-48]
+ movdqa xmm6,[ebp-16]
+ ret
+align 16
+__vpaes_encrypt_core:
+ mov ecx,16
+ mov eax,DWORD [240+edx]
+ movdqa xmm1,xmm6
+ movdqa xmm2,[ebp]
+ pandn xmm1,xmm0
+ pand xmm0,xmm6
+ movdqu xmm5,[edx]
+db 102,15,56,0,208
+ movdqa xmm0,[16+ebp]
+ pxor xmm2,xmm5
+ psrld xmm1,4
+ add edx,16
+db 102,15,56,0,193
+ lea ebx,[192+ebp]
+ pxor xmm0,xmm2
+ jmp NEAR L$000enc_entry
+align 16
+L$001enc_loop:
+ movdqa xmm4,[32+ebp]
+ movdqa xmm0,[48+ebp]
+db 102,15,56,0,226
+db 102,15,56,0,195
+ pxor xmm4,xmm5
+ movdqa xmm5,[64+ebp]
+ pxor xmm0,xmm4
+ movdqa xmm1,[ecx*1+ebx-64]
+db 102,15,56,0,234
+ movdqa xmm2,[80+ebp]
+ movdqa xmm4,[ecx*1+ebx]
+db 102,15,56,0,211
+ movdqa xmm3,xmm0
+ pxor xmm2,xmm5
+db 102,15,56,0,193
+ add edx,16
+ pxor xmm0,xmm2
+db 102,15,56,0,220
+ add ecx,16
+ pxor xmm3,xmm0
+db 102,15,56,0,193
+ and ecx,48
+ sub eax,1
+ pxor xmm0,xmm3
+L$000enc_entry:
+ movdqa xmm1,xmm6
+ movdqa xmm5,[ebp-32]
+ pandn xmm1,xmm0
+ psrld xmm1,4
+ pand xmm0,xmm6
+db 102,15,56,0,232
+ movdqa xmm3,xmm7
+ pxor xmm0,xmm1
+db 102,15,56,0,217
+ movdqa xmm4,xmm7
+ pxor xmm3,xmm5
+db 102,15,56,0,224
+ movdqa xmm2,xmm7
+ pxor xmm4,xmm5
+db 102,15,56,0,211
+ movdqa xmm3,xmm7
+ pxor xmm2,xmm0
+db 102,15,56,0,220
+ movdqu xmm5,[edx]
+ pxor xmm3,xmm1
+ jnz NEAR L$001enc_loop
+ movdqa xmm4,[96+ebp]
+ movdqa xmm0,[112+ebp]
+db 102,15,56,0,226
+ pxor xmm4,xmm5
+db 102,15,56,0,195
+ movdqa xmm1,[64+ecx*1+ebx]
+ pxor xmm0,xmm4
+db 102,15,56,0,193
+ ret
+align 16
+__vpaes_decrypt_core:
+ lea ebx,[608+ebp]
+ mov eax,DWORD [240+edx]
+ movdqa xmm1,xmm6
+ movdqa xmm2,[ebx-64]
+ pandn xmm1,xmm0
+ mov ecx,eax
+ psrld xmm1,4
+ movdqu xmm5,[edx]
+ shl ecx,4
+ pand xmm0,xmm6
+db 102,15,56,0,208
+ movdqa xmm0,[ebx-48]
+ xor ecx,48
+db 102,15,56,0,193
+ and ecx,48
+ pxor xmm2,xmm5
+ movdqa xmm5,[176+ebp]
+ pxor xmm0,xmm2
+ add edx,16
+ lea ecx,[ecx*1+ebx-352]
+ jmp NEAR L$002dec_entry
+align 16
+L$003dec_loop:
+ movdqa xmm4,[ebx-32]
+ movdqa xmm1,[ebx-16]
+db 102,15,56,0,226
+db 102,15,56,0,203
+ pxor xmm0,xmm4
+ movdqa xmm4,[ebx]
+ pxor xmm0,xmm1
+ movdqa xmm1,[16+ebx]
+db 102,15,56,0,226
+db 102,15,56,0,197
+db 102,15,56,0,203
+ pxor xmm0,xmm4
+ movdqa xmm4,[32+ebx]
+ pxor xmm0,xmm1
+ movdqa xmm1,[48+ebx]
+db 102,15,56,0,226
+db 102,15,56,0,197
+db 102,15,56,0,203
+ pxor xmm0,xmm4
+ movdqa xmm4,[64+ebx]
+ pxor xmm0,xmm1
+ movdqa xmm1,[80+ebx]
+db 102,15,56,0,226
+db 102,15,56,0,197
+db 102,15,56,0,203
+ pxor xmm0,xmm4
+ add edx,16
+db 102,15,58,15,237,12
+ pxor xmm0,xmm1
+ sub eax,1
+L$002dec_entry:
+ movdqa xmm1,xmm6
+ movdqa xmm2,[ebp-32]
+ pandn xmm1,xmm0
+ pand xmm0,xmm6
+ psrld xmm1,4
+db 102,15,56,0,208
+ movdqa xmm3,xmm7
+ pxor xmm0,xmm1
+db 102,15,56,0,217
+ movdqa xmm4,xmm7
+ pxor xmm3,xmm2
+db 102,15,56,0,224
+ pxor xmm4,xmm2
+ movdqa xmm2,xmm7
+db 102,15,56,0,211
+ movdqa xmm3,xmm7
+ pxor xmm2,xmm0
+db 102,15,56,0,220
+ movdqu xmm0,[edx]
+ pxor xmm3,xmm1
+ jnz NEAR L$003dec_loop
+ movdqa xmm4,[96+ebx]
+db 102,15,56,0,226
+ pxor xmm4,xmm0
+ movdqa xmm0,[112+ebx]
+ movdqa xmm2,[ecx]
+db 102,15,56,0,195
+ pxor xmm0,xmm4
+db 102,15,56,0,194
+ ret
+align 16
+__vpaes_schedule_core:
+ add ebp,DWORD [esp]
+ movdqu xmm0,[esi]
+ movdqa xmm2,[320+ebp]
+ movdqa xmm3,xmm0
+ lea ebx,[ebp]
+ movdqa [4+esp],xmm2
+ call __vpaes_schedule_transform
+ movdqa xmm7,xmm0
+ test edi,edi
+ jnz NEAR L$004schedule_am_decrypting
+ movdqu [edx],xmm0
+ jmp NEAR L$005schedule_go
+L$004schedule_am_decrypting:
+ movdqa xmm1,[256+ecx*1+ebp]
+db 102,15,56,0,217
+ movdqu [edx],xmm3
+ xor ecx,48
+L$005schedule_go:
+ cmp eax,192
+ ja NEAR L$006schedule_256
+ je NEAR L$007schedule_192
+L$008schedule_128:
+ mov eax,10
+L$009loop_schedule_128:
+ call __vpaes_schedule_round
+ dec eax
+ jz NEAR L$010schedule_mangle_last
+ call __vpaes_schedule_mangle
+ jmp NEAR L$009loop_schedule_128
+align 16
+L$007schedule_192:
+ movdqu xmm0,[8+esi]
+ call __vpaes_schedule_transform
+ movdqa xmm6,xmm0
+ pxor xmm4,xmm4
+ movhlps xmm6,xmm4
+ mov eax,4
+L$011loop_schedule_192:
+ call __vpaes_schedule_round
+db 102,15,58,15,198,8
+ call __vpaes_schedule_mangle
+ call __vpaes_schedule_192_smear
+ call __vpaes_schedule_mangle
+ call __vpaes_schedule_round
+ dec eax
+ jz NEAR L$010schedule_mangle_last
+ call __vpaes_schedule_mangle
+ call __vpaes_schedule_192_smear
+ jmp NEAR L$011loop_schedule_192
+align 16
+L$006schedule_256:
+ movdqu xmm0,[16+esi]
+ call __vpaes_schedule_transform
+ mov eax,7
+L$012loop_schedule_256:
+ call __vpaes_schedule_mangle
+ movdqa xmm6,xmm0
+ call __vpaes_schedule_round
+ dec eax
+ jz NEAR L$010schedule_mangle_last
+ call __vpaes_schedule_mangle
+ pshufd xmm0,xmm0,255
+ movdqa [20+esp],xmm7
+ movdqa xmm7,xmm6
+ call L$_vpaes_schedule_low_round
+ movdqa xmm7,[20+esp]
+ jmp NEAR L$012loop_schedule_256
+align 16
+L$010schedule_mangle_last:
+ lea ebx,[384+ebp]
+ test edi,edi
+ jnz NEAR L$013schedule_mangle_last_dec
+ movdqa xmm1,[256+ecx*1+ebp]
+db 102,15,56,0,193
+ lea ebx,[352+ebp]
+ add edx,32
+L$013schedule_mangle_last_dec:
+ add edx,-16
+ pxor xmm0,[336+ebp]
+ call __vpaes_schedule_transform
+ movdqu [edx],xmm0
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ pxor xmm6,xmm6
+ pxor xmm7,xmm7
+ ret
+align 16
+__vpaes_schedule_192_smear:
+ pshufd xmm1,xmm6,128
+ pshufd xmm0,xmm7,254
+ pxor xmm6,xmm1
+ pxor xmm1,xmm1
+ pxor xmm6,xmm0
+ movdqa xmm0,xmm6
+ movhlps xmm6,xmm1
+ ret
+align 16
+__vpaes_schedule_round:
+ movdqa xmm2,[8+esp]
+ pxor xmm1,xmm1
+db 102,15,58,15,202,15
+db 102,15,58,15,210,15
+ pxor xmm7,xmm1
+ pshufd xmm0,xmm0,255
+db 102,15,58,15,192,1
+ movdqa [8+esp],xmm2
+L$_vpaes_schedule_low_round:
+ movdqa xmm1,xmm7
+ pslldq xmm7,4
+ pxor xmm7,xmm1
+ movdqa xmm1,xmm7
+ pslldq xmm7,8
+ pxor xmm7,xmm1
+ pxor xmm7,[336+ebp]
+ movdqa xmm4,[ebp-16]
+ movdqa xmm5,[ebp-48]
+ movdqa xmm1,xmm4
+ pandn xmm1,xmm0
+ psrld xmm1,4
+ pand xmm0,xmm4
+ movdqa xmm2,[ebp-32]
+db 102,15,56,0,208
+ pxor xmm0,xmm1
+ movdqa xmm3,xmm5
+db 102,15,56,0,217
+ pxor xmm3,xmm2
+ movdqa xmm4,xmm5
+db 102,15,56,0,224
+ pxor xmm4,xmm2
+ movdqa xmm2,xmm5
+db 102,15,56,0,211
+ pxor xmm2,xmm0
+ movdqa xmm3,xmm5
+db 102,15,56,0,220
+ pxor xmm3,xmm1
+ movdqa xmm4,[32+ebp]
+db 102,15,56,0,226
+ movdqa xmm0,[48+ebp]
+db 102,15,56,0,195
+ pxor xmm0,xmm4
+ pxor xmm0,xmm7
+ movdqa xmm7,xmm0
+ ret
+align 16
+__vpaes_schedule_transform:
+ movdqa xmm2,[ebp-16]
+ movdqa xmm1,xmm2
+ pandn xmm1,xmm0
+ psrld xmm1,4
+ pand xmm0,xmm2
+ movdqa xmm2,[ebx]
+db 102,15,56,0,208
+ movdqa xmm0,[16+ebx]
+db 102,15,56,0,193
+ pxor xmm0,xmm2
+ ret
+align 16
+__vpaes_schedule_mangle:
+ movdqa xmm4,xmm0
+ movdqa xmm5,[128+ebp]
+ test edi,edi
+ jnz NEAR L$014schedule_mangle_dec
+ add edx,16
+ pxor xmm4,[336+ebp]
+db 102,15,56,0,229
+ movdqa xmm3,xmm4
+db 102,15,56,0,229
+ pxor xmm3,xmm4
+db 102,15,56,0,229
+ pxor xmm3,xmm4
+ jmp NEAR L$015schedule_mangle_both
+align 16
+L$014schedule_mangle_dec:
+ movdqa xmm2,[ebp-16]
+ lea esi,[416+ebp]
+ movdqa xmm1,xmm2
+ pandn xmm1,xmm4
+ psrld xmm1,4
+ pand xmm4,xmm2
+ movdqa xmm2,[esi]
+db 102,15,56,0,212
+ movdqa xmm3,[16+esi]
+db 102,15,56,0,217
+ pxor xmm3,xmm2
+db 102,15,56,0,221
+ movdqa xmm2,[32+esi]
+db 102,15,56,0,212
+ pxor xmm2,xmm3
+ movdqa xmm3,[48+esi]
+db 102,15,56,0,217
+ pxor xmm3,xmm2
+db 102,15,56,0,221
+ movdqa xmm2,[64+esi]
+db 102,15,56,0,212
+ pxor xmm2,xmm3
+ movdqa xmm3,[80+esi]
+db 102,15,56,0,217
+ pxor xmm3,xmm2
+db 102,15,56,0,221
+ movdqa xmm2,[96+esi]
+db 102,15,56,0,212
+ pxor xmm2,xmm3
+ movdqa xmm3,[112+esi]
+db 102,15,56,0,217
+ pxor xmm3,xmm2
+ add edx,-16
+L$015schedule_mangle_both:
+ movdqa xmm1,[256+ecx*1+ebp]
+db 102,15,56,0,217
+ add ecx,-16
+ and ecx,48
+ movdqu [edx],xmm3
+ ret
+global _vpaes_set_encrypt_key
+align 16
+_vpaes_set_encrypt_key:
+L$_vpaes_set_encrypt_key_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+%ifdef BORINGSSL_DISPATCH_TEST
+ push ebx
+ push edx
+ call L$016pic
+L$016pic:
+ pop ebx
+ lea ebx,[(_BORINGSSL_function_hit+5-L$016pic)+ebx]
+ mov edx,1
+ mov BYTE [ebx],dl
+ pop edx
+ pop ebx
+%endif
+ mov esi,DWORD [20+esp]
+ lea ebx,[esp-56]
+ mov eax,DWORD [24+esp]
+ and ebx,-16
+ mov edx,DWORD [28+esp]
+ xchg ebx,esp
+ mov DWORD [48+esp],ebx
+ mov ebx,eax
+ shr ebx,5
+ add ebx,5
+ mov DWORD [240+edx],ebx
+ mov ecx,48
+ mov edi,0
+ lea ebp,[(L$_vpaes_consts+0x30-L$017pic_point)]
+ call __vpaes_schedule_core
+L$017pic_point:
+ mov esp,DWORD [48+esp]
+ xor eax,eax
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _vpaes_set_decrypt_key
+align 16
+_vpaes_set_decrypt_key:
+L$_vpaes_set_decrypt_key_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov esi,DWORD [20+esp]
+ lea ebx,[esp-56]
+ mov eax,DWORD [24+esp]
+ and ebx,-16
+ mov edx,DWORD [28+esp]
+ xchg ebx,esp
+ mov DWORD [48+esp],ebx
+ mov ebx,eax
+ shr ebx,5
+ add ebx,5
+ mov DWORD [240+edx],ebx
+ shl ebx,4
+ lea edx,[16+ebx*1+edx]
+ mov edi,1
+ mov ecx,eax
+ shr ecx,1
+ and ecx,32
+ xor ecx,32
+ lea ebp,[(L$_vpaes_consts+0x30-L$018pic_point)]
+ call __vpaes_schedule_core
+L$018pic_point:
+ mov esp,DWORD [48+esp]
+ xor eax,eax
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _vpaes_encrypt
+align 16
+_vpaes_encrypt:
+L$_vpaes_encrypt_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+%ifdef BORINGSSL_DISPATCH_TEST
+ push ebx
+ push edx
+ call L$019pic
+L$019pic:
+ pop ebx
+ lea ebx,[(_BORINGSSL_function_hit+4-L$019pic)+ebx]
+ mov edx,1
+ mov BYTE [ebx],dl
+ pop edx
+ pop ebx
+%endif
+ lea ebp,[(L$_vpaes_consts+0x30-L$020pic_point)]
+ call __vpaes_preheat
+L$020pic_point:
+ mov esi,DWORD [20+esp]
+ lea ebx,[esp-56]
+ mov edi,DWORD [24+esp]
+ and ebx,-16
+ mov edx,DWORD [28+esp]
+ xchg ebx,esp
+ mov DWORD [48+esp],ebx
+ movdqu xmm0,[esi]
+ call __vpaes_encrypt_core
+ movdqu [edi],xmm0
+ mov esp,DWORD [48+esp]
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _vpaes_decrypt
+align 16
+_vpaes_decrypt:
+L$_vpaes_decrypt_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ lea ebp,[(L$_vpaes_consts+0x30-L$021pic_point)]
+ call __vpaes_preheat
+L$021pic_point:
+ mov esi,DWORD [20+esp]
+ lea ebx,[esp-56]
+ mov edi,DWORD [24+esp]
+ and ebx,-16
+ mov edx,DWORD [28+esp]
+ xchg ebx,esp
+ mov DWORD [48+esp],ebx
+ movdqu xmm0,[esi]
+ call __vpaes_decrypt_core
+ movdqu [edi],xmm0
+ mov esp,DWORD [48+esp]
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _vpaes_cbc_encrypt
+align 16
+_vpaes_cbc_encrypt:
+L$_vpaes_cbc_encrypt_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov esi,DWORD [20+esp]
+ mov edi,DWORD [24+esp]
+ mov eax,DWORD [28+esp]
+ mov edx,DWORD [32+esp]
+ sub eax,16
+ jc NEAR L$022cbc_abort
+ lea ebx,[esp-56]
+ mov ebp,DWORD [36+esp]
+ and ebx,-16
+ mov ecx,DWORD [40+esp]
+ xchg ebx,esp
+ movdqu xmm1,[ebp]
+ sub edi,esi
+ mov DWORD [48+esp],ebx
+ mov DWORD [esp],edi
+ mov DWORD [4+esp],edx
+ mov DWORD [8+esp],ebp
+ mov edi,eax
+ lea ebp,[(L$_vpaes_consts+0x30-L$023pic_point)]
+ call __vpaes_preheat
+L$023pic_point:
+ cmp ecx,0
+ je NEAR L$024cbc_dec_loop
+ jmp NEAR L$025cbc_enc_loop
+align 16
+L$025cbc_enc_loop:
+ movdqu xmm0,[esi]
+ pxor xmm0,xmm1
+ call __vpaes_encrypt_core
+ mov ebx,DWORD [esp]
+ mov edx,DWORD [4+esp]
+ movdqa xmm1,xmm0
+ movdqu [esi*1+ebx],xmm0
+ lea esi,[16+esi]
+ sub edi,16
+ jnc NEAR L$025cbc_enc_loop
+ jmp NEAR L$026cbc_done
+align 16
+L$024cbc_dec_loop:
+ movdqu xmm0,[esi]
+ movdqa [16+esp],xmm1
+ movdqa [32+esp],xmm0
+ call __vpaes_decrypt_core
+ mov ebx,DWORD [esp]
+ mov edx,DWORD [4+esp]
+ pxor xmm0,[16+esp]
+ movdqa xmm1,[32+esp]
+ movdqu [esi*1+ebx],xmm0
+ lea esi,[16+esi]
+ sub edi,16
+ jnc NEAR L$024cbc_dec_loop
+L$026cbc_done:
+ mov ebx,DWORD [8+esp]
+ mov esp,DWORD [48+esp]
+ movdqu [ebx],xmm1
+L$022cbc_abort:
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/vpaes-x86_64-apple.S b/gen/bcm/vpaes-x86_64-apple.S
new file mode 100644
index 0000000..5aea40f
--- /dev/null
+++ b/gen/bcm/vpaes-x86_64-apple.S
@@ -0,0 +1,1131 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align 4
+_vpaes_encrypt_core:
+
+ movq %rdx,%r9
+ movq $16,%r11
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa L$k_ipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movdqu (%r9),%xmm5
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa L$k_ipt+16(%rip),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm5,%xmm2
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+ leaq L$k_mc_backward(%rip),%r10
+ jmp L$enc_entry
+
+.p2align 4
+L$enc_loop:
+
+ movdqa %xmm13,%xmm4
+ movdqa %xmm12,%xmm0
+.byte 102,15,56,0,226
+.byte 102,15,56,0,195
+ pxor %xmm5,%xmm4
+ movdqa %xmm15,%xmm5
+ pxor %xmm4,%xmm0
+ movdqa -64(%r11,%r10,1),%xmm1
+.byte 102,15,56,0,234
+ movdqa (%r11,%r10,1),%xmm4
+ movdqa %xmm14,%xmm2
+.byte 102,15,56,0,211
+ movdqa %xmm0,%xmm3
+ pxor %xmm5,%xmm2
+.byte 102,15,56,0,193
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+.byte 102,15,56,0,220
+ addq $16,%r11
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,193
+ andq $0x30,%r11
+ subq $1,%rax
+ pxor %xmm3,%xmm0
+
+L$enc_entry:
+
+ movdqa %xmm9,%xmm1
+ movdqa %xmm11,%xmm5
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,232
+ movdqa %xmm10,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,217
+ movdqa %xmm10,%xmm4
+ pxor %xmm5,%xmm3
+.byte 102,15,56,0,224
+ movdqa %xmm10,%xmm2
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,211
+ movdqa %xmm10,%xmm3
+ pxor %xmm0,%xmm2
+.byte 102,15,56,0,220
+ movdqu (%r9),%xmm5
+ pxor %xmm1,%xmm3
+ jnz L$enc_loop
+
+
+ movdqa -96(%r10),%xmm4
+ movdqa -80(%r10),%xmm0
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,195
+ movdqa 64(%r11,%r10,1),%xmm1
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,193
+ ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align 4
+_vpaes_encrypt_core_2x:
+
+ movq %rdx,%r9
+ movq $16,%r11
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa %xmm9,%xmm7
+ movdqa L$k_ipt(%rip),%xmm2
+ movdqa %xmm2,%xmm8
+ pandn %xmm0,%xmm1
+ pandn %xmm6,%xmm7
+ movdqu (%r9),%xmm5
+
+ psrld $4,%xmm1
+ psrld $4,%xmm7
+ pand %xmm9,%xmm0
+ pand %xmm9,%xmm6
+.byte 102,15,56,0,208
+.byte 102,68,15,56,0,198
+ movdqa L$k_ipt+16(%rip),%xmm0
+ movdqa %xmm0,%xmm6
+.byte 102,15,56,0,193
+.byte 102,15,56,0,247
+ pxor %xmm5,%xmm2
+ pxor %xmm5,%xmm8
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+ pxor %xmm8,%xmm6
+ leaq L$k_mc_backward(%rip),%r10
+ jmp L$enc2x_entry
+
+.p2align 4
+L$enc2x_loop:
+
+ movdqa L$k_sb1(%rip),%xmm4
+ movdqa L$k_sb1+16(%rip),%xmm0
+ movdqa %xmm4,%xmm12
+ movdqa %xmm0,%xmm6
+.byte 102,15,56,0,226
+.byte 102,69,15,56,0,224
+.byte 102,15,56,0,195
+.byte 102,65,15,56,0,243
+ pxor %xmm5,%xmm4
+ pxor %xmm5,%xmm12
+ movdqa L$k_sb2(%rip),%xmm5
+ movdqa %xmm5,%xmm13
+ pxor %xmm4,%xmm0
+ pxor %xmm12,%xmm6
+ movdqa -64(%r11,%r10,1),%xmm1
+
+.byte 102,15,56,0,234
+.byte 102,69,15,56,0,232
+ movdqa (%r11,%r10,1),%xmm4
+
+ movdqa L$k_sb2+16(%rip),%xmm2
+ movdqa %xmm2,%xmm8
+.byte 102,15,56,0,211
+.byte 102,69,15,56,0,195
+ movdqa %xmm0,%xmm3
+ movdqa %xmm6,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm13,%xmm8
+.byte 102,15,56,0,193
+.byte 102,15,56,0,241
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+ pxor %xmm8,%xmm6
+.byte 102,15,56,0,220
+.byte 102,68,15,56,0,220
+ addq $16,%r11
+ pxor %xmm0,%xmm3
+ pxor %xmm6,%xmm11
+.byte 102,15,56,0,193
+.byte 102,15,56,0,241
+ andq $0x30,%r11
+ subq $1,%rax
+ pxor %xmm3,%xmm0
+ pxor %xmm11,%xmm6
+
+L$enc2x_entry:
+
+ movdqa %xmm9,%xmm1
+ movdqa %xmm9,%xmm7
+ movdqa L$k_inv+16(%rip),%xmm5
+ movdqa %xmm5,%xmm13
+ pandn %xmm0,%xmm1
+ pandn %xmm6,%xmm7
+ psrld $4,%xmm1
+ psrld $4,%xmm7
+ pand %xmm9,%xmm0
+ pand %xmm9,%xmm6
+.byte 102,15,56,0,232
+.byte 102,68,15,56,0,238
+ movdqa %xmm10,%xmm3
+ movdqa %xmm10,%xmm11
+ pxor %xmm1,%xmm0
+ pxor %xmm7,%xmm6
+.byte 102,15,56,0,217
+.byte 102,68,15,56,0,223
+ movdqa %xmm10,%xmm4
+ movdqa %xmm10,%xmm12
+ pxor %xmm5,%xmm3
+ pxor %xmm13,%xmm11
+.byte 102,15,56,0,224
+.byte 102,68,15,56,0,230
+ movdqa %xmm10,%xmm2
+ movdqa %xmm10,%xmm8
+ pxor %xmm5,%xmm4
+ pxor %xmm13,%xmm12
+.byte 102,15,56,0,211
+.byte 102,69,15,56,0,195
+ movdqa %xmm10,%xmm3
+ movdqa %xmm10,%xmm11
+ pxor %xmm0,%xmm2
+ pxor %xmm6,%xmm8
+.byte 102,15,56,0,220
+.byte 102,69,15,56,0,220
+ movdqu (%r9),%xmm5
+
+ pxor %xmm1,%xmm3
+ pxor %xmm7,%xmm11
+ jnz L$enc2x_loop
+
+
+ movdqa -96(%r10),%xmm4
+ movdqa -80(%r10),%xmm0
+ movdqa %xmm4,%xmm12
+ movdqa %xmm0,%xmm6
+.byte 102,15,56,0,226
+.byte 102,69,15,56,0,224
+ pxor %xmm5,%xmm4
+ pxor %xmm5,%xmm12
+.byte 102,15,56,0,195
+.byte 102,65,15,56,0,243
+ movdqa 64(%r11,%r10,1),%xmm1
+
+ pxor %xmm4,%xmm0
+ pxor %xmm12,%xmm6
+.byte 102,15,56,0,193
+.byte 102,15,56,0,241
+ ret
+
+
+
+
+
+
+
+
+
+.p2align 4
+_vpaes_decrypt_core:
+
+ movq %rdx,%r9
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa L$k_dipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movq %rax,%r11
+ psrld $4,%xmm1
+ movdqu (%r9),%xmm5
+ shlq $4,%r11
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa L$k_dipt+16(%rip),%xmm0
+ xorq $0x30,%r11
+ leaq L$k_dsbd(%rip),%r10
+.byte 102,15,56,0,193
+ andq $0x30,%r11
+ pxor %xmm5,%xmm2
+ movdqa L$k_mc_forward+48(%rip),%xmm5
+ pxor %xmm2,%xmm0
+ addq $16,%r9
+ addq %r10,%r11
+ jmp L$dec_entry
+
+.p2align 4
+L$dec_loop:
+
+
+
+ movdqa -32(%r10),%xmm4
+ movdqa -16(%r10),%xmm1
+.byte 102,15,56,0,226
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa 0(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 16(%r10),%xmm1
+
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa 32(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 48(%r10),%xmm1
+
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa 64(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 80(%r10),%xmm1
+
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ addq $16,%r9
+.byte 102,15,58,15,237,12
+ pxor %xmm1,%xmm0
+ subq $1,%rax
+
+L$dec_entry:
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ movdqa %xmm11,%xmm2
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa %xmm10,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,217
+ movdqa %xmm10,%xmm4
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211
+ movdqa %xmm10,%xmm3
+ pxor %xmm0,%xmm2
+.byte 102,15,56,0,220
+ movdqu (%r9),%xmm0
+ pxor %xmm1,%xmm3
+ jnz L$dec_loop
+
+
+ movdqa 96(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 112(%r10),%xmm0
+ movdqa -352(%r11),%xmm2
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,194
+ ret
+
+
+
+
+
+
+
+
+
+.p2align 4
+_vpaes_schedule_core:
+
+
+
+
+
+
+ call _vpaes_preheat
+ movdqa L$k_rcon(%rip),%xmm8
+ movdqu (%rdi),%xmm0
+
+
+ movdqa %xmm0,%xmm3
+ leaq L$k_ipt(%rip),%r11
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm7
+
+ leaq L$k_sr(%rip),%r10
+ testq %rcx,%rcx
+ jnz L$schedule_am_decrypting
+
+
+ movdqu %xmm0,(%rdx)
+ jmp L$schedule_go
+
+L$schedule_am_decrypting:
+
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,217
+ movdqu %xmm3,(%rdx)
+ xorq $0x30,%r8
+
+L$schedule_go:
+ cmpl $192,%esi
+ ja L$schedule_256
+ je L$schedule_192
+
+
+
+
+
+
+
+
+
+
+L$schedule_128:
+ movl $10,%esi
+
+L$oop_schedule_128:
+ call _vpaes_schedule_round
+ decq %rsi
+ jz L$schedule_mangle_last
+ call _vpaes_schedule_mangle
+ jmp L$oop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align 4
+L$schedule_192:
+ movdqu 8(%rdi),%xmm0
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm6
+ pxor %xmm4,%xmm4
+ movhlps %xmm4,%xmm6
+ movl $4,%esi
+
+L$oop_schedule_192:
+ call _vpaes_schedule_round
+.byte 102,15,58,15,198,8
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_round
+ decq %rsi
+ jz L$schedule_mangle_last
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ jmp L$oop_schedule_192
+
+
+
+
+
+
+
+
+
+
+
+.p2align 4
+L$schedule_256:
+ movdqu 16(%rdi),%xmm0
+ call _vpaes_schedule_transform
+ movl $7,%esi
+
+L$oop_schedule_256:
+ call _vpaes_schedule_mangle
+ movdqa %xmm0,%xmm6
+
+
+ call _vpaes_schedule_round
+ decq %rsi
+ jz L$schedule_mangle_last
+ call _vpaes_schedule_mangle
+
+
+ pshufd $0xFF,%xmm0,%xmm0
+ movdqa %xmm7,%xmm5
+ movdqa %xmm6,%xmm7
+ call _vpaes_schedule_low_round
+ movdqa %xmm5,%xmm7
+
+ jmp L$oop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align 4
+L$schedule_mangle_last:
+
+ leaq L$k_deskew(%rip),%r11
+ testq %rcx,%rcx
+ jnz L$schedule_mangle_last_dec
+
+
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,193
+ leaq L$k_opt(%rip),%r11
+ addq $32,%rdx
+
+L$schedule_mangle_last_dec:
+ addq $-16,%rdx
+ pxor L$k_s63(%rip),%xmm0
+ call _vpaes_schedule_transform
+ movdqu %xmm0,(%rdx)
+
+
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align 4
+_vpaes_schedule_192_smear:
+
+ pshufd $0x80,%xmm6,%xmm1
+ pshufd $0xFE,%xmm7,%xmm0
+ pxor %xmm1,%xmm6
+ pxor %xmm1,%xmm1
+ pxor %xmm0,%xmm6
+ movdqa %xmm6,%xmm0
+ movhlps %xmm1,%xmm6
+ ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align 4
+_vpaes_schedule_round:
+
+
+ pxor %xmm1,%xmm1
+.byte 102,65,15,58,15,200,15
+.byte 102,69,15,58,15,192,15
+ pxor %xmm1,%xmm7
+
+
+ pshufd $0xFF,%xmm0,%xmm0
+.byte 102,15,58,15,192,1
+
+
+
+
+_vpaes_schedule_low_round:
+
+ movdqa %xmm7,%xmm1
+ pslldq $4,%xmm7
+ pxor %xmm1,%xmm7
+ movdqa %xmm7,%xmm1
+ pslldq $8,%xmm7
+ pxor %xmm1,%xmm7
+ pxor L$k_s63(%rip),%xmm7
+
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa %xmm11,%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm10,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqa %xmm13,%xmm4
+.byte 102,15,56,0,226
+ movdqa %xmm12,%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+
+
+ pxor %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align 4
+_vpaes_schedule_transform:
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa (%r11),%xmm2
+.byte 102,15,56,0,208
+ movdqa 16(%r11),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm2,%xmm0
+ ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align 4
+_vpaes_schedule_mangle:
+
+ movdqa %xmm0,%xmm4
+ movdqa L$k_mc_forward(%rip),%xmm5
+ testq %rcx,%rcx
+ jnz L$schedule_mangle_dec
+
+
+ addq $16,%rdx
+ pxor L$k_s63(%rip),%xmm4
+.byte 102,15,56,0,229
+ movdqa %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+
+ jmp L$schedule_mangle_both
+.p2align 4
+L$schedule_mangle_dec:
+
+ leaq L$k_dksd(%rip),%r11
+ movdqa %xmm9,%xmm1
+ pandn %xmm4,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm4
+
+ movdqa 0(%r11),%xmm2
+.byte 102,15,56,0,212
+ movdqa 16(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 32(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 48(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 64(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 80(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 96(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 112(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+
+ addq $-16,%rdx
+
+L$schedule_mangle_both:
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,217
+ addq $-16,%r8
+ andq $0x30,%r8
+ movdqu %xmm3,(%rdx)
+ ret
+
+
+
+
+
+
+.globl _vpaes_set_encrypt_key
+.private_extern _vpaes_set_encrypt_key
+
+.p2align 4
+_vpaes_set_encrypt_key:
+
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+
+ movb $1,_BORINGSSL_function_hit+5(%rip)
+#endif
+
+ movl %esi,%eax
+ shrl $5,%eax
+ addl $5,%eax
+ movl %eax,240(%rdx)
+
+ movl $0,%ecx
+ movl $0x30,%r8d
+ call _vpaes_schedule_core
+ xorl %eax,%eax
+ ret
+
+
+
+.globl _vpaes_set_decrypt_key
+.private_extern _vpaes_set_decrypt_key
+
+.p2align 4
+_vpaes_set_decrypt_key:
+
+_CET_ENDBR
+ movl %esi,%eax
+ shrl $5,%eax
+ addl $5,%eax
+ movl %eax,240(%rdx)
+ shll $4,%eax
+ leaq 16(%rdx,%rax,1),%rdx
+
+ movl $1,%ecx
+ movl %esi,%r8d
+ shrl $1,%r8d
+ andl $32,%r8d
+ xorl $32,%r8d
+ call _vpaes_schedule_core
+ xorl %eax,%eax
+ ret
+
+
+
+.globl _vpaes_encrypt
+.private_extern _vpaes_encrypt
+
+.p2align 4
+_vpaes_encrypt:
+
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+
+ movb $1,_BORINGSSL_function_hit+4(%rip)
+#endif
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_encrypt_core
+ movdqu %xmm0,(%rsi)
+ ret
+
+
+
+.globl _vpaes_decrypt
+.private_extern _vpaes_decrypt
+
+.p2align 4
+_vpaes_decrypt:
+
+_CET_ENDBR
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_decrypt_core
+ movdqu %xmm0,(%rsi)
+ ret
+
+
+.globl _vpaes_cbc_encrypt
+.private_extern _vpaes_cbc_encrypt
+
+.p2align 4
+_vpaes_cbc_encrypt:
+
+_CET_ENDBR
+ xchgq %rcx,%rdx
+ subq $16,%rcx
+ jc L$cbc_abort
+ movdqu (%r8),%xmm6
+ subq %rdi,%rsi
+ call _vpaes_preheat
+ cmpl $0,%r9d
+ je L$cbc_dec_loop
+ jmp L$cbc_enc_loop
+.p2align 4
+L$cbc_enc_loop:
+ movdqu (%rdi),%xmm0
+ pxor %xmm6,%xmm0
+ call _vpaes_encrypt_core
+ movdqa %xmm0,%xmm6
+ movdqu %xmm0,(%rsi,%rdi,1)
+ leaq 16(%rdi),%rdi
+ subq $16,%rcx
+ jnc L$cbc_enc_loop
+ jmp L$cbc_done
+.p2align 4
+L$cbc_dec_loop:
+ movdqu (%rdi),%xmm0
+ movdqa %xmm0,%xmm7
+ call _vpaes_decrypt_core
+ pxor %xmm6,%xmm0
+ movdqa %xmm7,%xmm6
+ movdqu %xmm0,(%rsi,%rdi,1)
+ leaq 16(%rdi),%rdi
+ subq $16,%rcx
+ jnc L$cbc_dec_loop
+L$cbc_done:
+ movdqu %xmm6,(%r8)
+L$cbc_abort:
+ ret
+
+
+.globl _vpaes_ctr32_encrypt_blocks
+.private_extern _vpaes_ctr32_encrypt_blocks
+
+.p2align 4
+_vpaes_ctr32_encrypt_blocks:
+
+_CET_ENDBR
+
+ xchgq %rcx,%rdx
+ testq %rcx,%rcx
+ jz L$ctr32_abort
+ movdqu (%r8),%xmm0
+ movdqa L$ctr_add_one(%rip),%xmm8
+ subq %rdi,%rsi
+ call _vpaes_preheat
+ movdqa %xmm0,%xmm6
+ pshufb L$rev_ctr(%rip),%xmm6
+
+ testq $1,%rcx
+ jz L$ctr32_prep_loop
+
+
+
+ movdqu (%rdi),%xmm7
+ call _vpaes_encrypt_core
+ pxor %xmm7,%xmm0
+ paddd %xmm8,%xmm6
+ movdqu %xmm0,(%rsi,%rdi,1)
+ subq $1,%rcx
+ leaq 16(%rdi),%rdi
+ jz L$ctr32_done
+
+L$ctr32_prep_loop:
+
+
+ movdqa %xmm6,%xmm14
+ movdqa %xmm6,%xmm15
+ paddd %xmm8,%xmm15
+
+L$ctr32_loop:
+ movdqa L$rev_ctr(%rip),%xmm1
+ movdqa %xmm14,%xmm0
+ movdqa %xmm15,%xmm6
+.byte 102,15,56,0,193
+.byte 102,15,56,0,241
+ call _vpaes_encrypt_core_2x
+ movdqu (%rdi),%xmm1
+ movdqu 16(%rdi),%xmm2
+ movdqa L$ctr_add_two(%rip),%xmm3
+ pxor %xmm1,%xmm0
+ pxor %xmm2,%xmm6
+ paddd %xmm3,%xmm14
+ paddd %xmm3,%xmm15
+ movdqu %xmm0,(%rsi,%rdi,1)
+ movdqu %xmm6,16(%rsi,%rdi,1)
+ subq $2,%rcx
+ leaq 32(%rdi),%rdi
+ jnz L$ctr32_loop
+
+L$ctr32_done:
+L$ctr32_abort:
+ ret
+
+
+
+
+
+
+
+
+
+.p2align 4
+_vpaes_preheat:
+
+ leaq L$k_s0F(%rip),%r10
+ movdqa -32(%r10),%xmm10
+ movdqa -16(%r10),%xmm11
+ movdqa 0(%r10),%xmm9
+ movdqa 48(%r10),%xmm13
+ movdqa 64(%r10),%xmm12
+ movdqa 80(%r10),%xmm15
+ movdqa 96(%r10),%xmm14
+ ret
+
+
+
+
+
+
+
+
+.section __DATA,__const
+.p2align 6
+_vpaes_consts:
+L$k_inv:
+.quad 0x0E05060F0D080180, 0x040703090A0B0C02
+.quad 0x01040A060F0B0780, 0x030D0E0C02050809
+
+L$k_s0F:
+.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+L$k_ipt:
+.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+L$k_sb1:
+.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+L$k_sb2:
+.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+L$k_sbo:
+.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+L$k_mc_forward:
+.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad 0x080B0A0904070605, 0x000302010C0F0E0D
+.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad 0x000302010C0F0E0D, 0x080B0A0904070605
+
+L$k_mc_backward:
+.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad 0x020100030E0D0C0F, 0x0A09080B06050407
+.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad 0x0A09080B06050407, 0x020100030E0D0C0F
+
+L$k_sr:
+.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad 0x030E09040F0A0500, 0x0B06010C07020D08
+.quad 0x0F060D040B020900, 0x070E050C030A0108
+.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+L$k_rcon:
+.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+L$k_s63:
+.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+L$k_opt:
+.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+L$k_deskew:
+.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+
+
+
+
+L$k_dksd:
+.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+L$k_dksb:
+.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+L$k_dkse:
+.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+L$k_dks9:
+.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+
+
+
+
+L$k_dipt:
+.quad 0x0F505B040B545F00, 0x154A411E114E451A
+.quad 0x86E383E660056500, 0x12771772F491F194
+
+L$k_dsb9:
+.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+L$k_dsbd:
+.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+L$k_dsbb:
+.quad 0xD022649296B44200, 0x602646F6B0F2D404
+.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+L$k_dsbe:
+.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+L$k_dsbo:
+.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+
+
+L$rev_ctr:
+.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
+
+
+L$ctr_add_one:
+.quad 0x0000000000000000, 0x0000000100000000
+L$ctr_add_two:
+.quad 0x0000000000000000, 0x0000000200000000
+
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.p2align 6
+
+.text
+#endif
diff --git a/gen/bcm/vpaes-x86_64-linux.S b/gen/bcm/vpaes-x86_64-linux.S
new file mode 100644
index 0000000..019c638
--- /dev/null
+++ b/gen/bcm/vpaes-x86_64-linux.S
@@ -0,0 +1,1133 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_encrypt_core,@function
+.align 16
+_vpaes_encrypt_core:
+.cfi_startproc
+ movq %rdx,%r9
+ movq $16,%r11
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa .Lk_ipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movdqu (%r9),%xmm5
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa .Lk_ipt+16(%rip),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm5,%xmm2
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+ leaq .Lk_mc_backward(%rip),%r10
+ jmp .Lenc_entry
+
+.align 16
+.Lenc_loop:
+
+ movdqa %xmm13,%xmm4
+ movdqa %xmm12,%xmm0
+.byte 102,15,56,0,226
+.byte 102,15,56,0,195
+ pxor %xmm5,%xmm4
+ movdqa %xmm15,%xmm5
+ pxor %xmm4,%xmm0
+ movdqa -64(%r11,%r10,1),%xmm1
+.byte 102,15,56,0,234
+ movdqa (%r11,%r10,1),%xmm4
+ movdqa %xmm14,%xmm2
+.byte 102,15,56,0,211
+ movdqa %xmm0,%xmm3
+ pxor %xmm5,%xmm2
+.byte 102,15,56,0,193
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+.byte 102,15,56,0,220
+ addq $16,%r11
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,193
+ andq $0x30,%r11
+ subq $1,%rax
+ pxor %xmm3,%xmm0
+
+.Lenc_entry:
+
+ movdqa %xmm9,%xmm1
+ movdqa %xmm11,%xmm5
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,232
+ movdqa %xmm10,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,217
+ movdqa %xmm10,%xmm4
+ pxor %xmm5,%xmm3
+.byte 102,15,56,0,224
+ movdqa %xmm10,%xmm2
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,211
+ movdqa %xmm10,%xmm3
+ pxor %xmm0,%xmm2
+.byte 102,15,56,0,220
+ movdqu (%r9),%xmm5
+ pxor %xmm1,%xmm3
+ jnz .Lenc_loop
+
+
+ movdqa -96(%r10),%xmm4
+ movdqa -80(%r10),%xmm0
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,195
+ movdqa 64(%r11,%r10,1),%xmm1
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,193
+ ret
+.cfi_endproc
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_encrypt_core_2x,@function
+.align 16
+_vpaes_encrypt_core_2x:
+.cfi_startproc
+ movq %rdx,%r9
+ movq $16,%r11
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa %xmm9,%xmm7
+ movdqa .Lk_ipt(%rip),%xmm2
+ movdqa %xmm2,%xmm8
+ pandn %xmm0,%xmm1
+ pandn %xmm6,%xmm7
+ movdqu (%r9),%xmm5
+
+ psrld $4,%xmm1
+ psrld $4,%xmm7
+ pand %xmm9,%xmm0
+ pand %xmm9,%xmm6
+.byte 102,15,56,0,208
+.byte 102,68,15,56,0,198
+ movdqa .Lk_ipt+16(%rip),%xmm0
+ movdqa %xmm0,%xmm6
+.byte 102,15,56,0,193
+.byte 102,15,56,0,247
+ pxor %xmm5,%xmm2
+ pxor %xmm5,%xmm8
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+ pxor %xmm8,%xmm6
+ leaq .Lk_mc_backward(%rip),%r10
+ jmp .Lenc2x_entry
+
+.align 16
+.Lenc2x_loop:
+
+ movdqa .Lk_sb1(%rip),%xmm4
+ movdqa .Lk_sb1+16(%rip),%xmm0
+ movdqa %xmm4,%xmm12
+ movdqa %xmm0,%xmm6
+.byte 102,15,56,0,226
+.byte 102,69,15,56,0,224
+.byte 102,15,56,0,195
+.byte 102,65,15,56,0,243
+ pxor %xmm5,%xmm4
+ pxor %xmm5,%xmm12
+ movdqa .Lk_sb2(%rip),%xmm5
+ movdqa %xmm5,%xmm13
+ pxor %xmm4,%xmm0
+ pxor %xmm12,%xmm6
+ movdqa -64(%r11,%r10,1),%xmm1
+
+.byte 102,15,56,0,234
+.byte 102,69,15,56,0,232
+ movdqa (%r11,%r10,1),%xmm4
+
+ movdqa .Lk_sb2+16(%rip),%xmm2
+ movdqa %xmm2,%xmm8
+.byte 102,15,56,0,211
+.byte 102,69,15,56,0,195
+ movdqa %xmm0,%xmm3
+ movdqa %xmm6,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm13,%xmm8
+.byte 102,15,56,0,193
+.byte 102,15,56,0,241
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+ pxor %xmm8,%xmm6
+.byte 102,15,56,0,220
+.byte 102,68,15,56,0,220
+ addq $16,%r11
+ pxor %xmm0,%xmm3
+ pxor %xmm6,%xmm11
+.byte 102,15,56,0,193
+.byte 102,15,56,0,241
+ andq $0x30,%r11
+ subq $1,%rax
+ pxor %xmm3,%xmm0
+ pxor %xmm11,%xmm6
+
+.Lenc2x_entry:
+
+ movdqa %xmm9,%xmm1
+ movdqa %xmm9,%xmm7
+ movdqa .Lk_inv+16(%rip),%xmm5
+ movdqa %xmm5,%xmm13
+ pandn %xmm0,%xmm1
+ pandn %xmm6,%xmm7
+ psrld $4,%xmm1
+ psrld $4,%xmm7
+ pand %xmm9,%xmm0
+ pand %xmm9,%xmm6
+.byte 102,15,56,0,232
+.byte 102,68,15,56,0,238
+ movdqa %xmm10,%xmm3
+ movdqa %xmm10,%xmm11
+ pxor %xmm1,%xmm0
+ pxor %xmm7,%xmm6
+.byte 102,15,56,0,217
+.byte 102,68,15,56,0,223
+ movdqa %xmm10,%xmm4
+ movdqa %xmm10,%xmm12
+ pxor %xmm5,%xmm3
+ pxor %xmm13,%xmm11
+.byte 102,15,56,0,224
+.byte 102,68,15,56,0,230
+ movdqa %xmm10,%xmm2
+ movdqa %xmm10,%xmm8
+ pxor %xmm5,%xmm4
+ pxor %xmm13,%xmm12
+.byte 102,15,56,0,211
+.byte 102,69,15,56,0,195
+ movdqa %xmm10,%xmm3
+ movdqa %xmm10,%xmm11
+ pxor %xmm0,%xmm2
+ pxor %xmm6,%xmm8
+.byte 102,15,56,0,220
+.byte 102,69,15,56,0,220
+ movdqu (%r9),%xmm5
+
+ pxor %xmm1,%xmm3
+ pxor %xmm7,%xmm11
+ jnz .Lenc2x_loop
+
+
+ movdqa -96(%r10),%xmm4
+ movdqa -80(%r10),%xmm0
+ movdqa %xmm4,%xmm12
+ movdqa %xmm0,%xmm6
+.byte 102,15,56,0,226
+.byte 102,69,15,56,0,224
+ pxor %xmm5,%xmm4
+ pxor %xmm5,%xmm12
+.byte 102,15,56,0,195
+.byte 102,65,15,56,0,243
+ movdqa 64(%r11,%r10,1),%xmm1
+
+ pxor %xmm4,%xmm0
+ pxor %xmm12,%xmm6
+.byte 102,15,56,0,193
+.byte 102,15,56,0,241
+ ret
+.cfi_endproc
+.size _vpaes_encrypt_core_2x,.-_vpaes_encrypt_core_2x
+
+
+
+
+
+
+.type _vpaes_decrypt_core,@function
+.align 16
+_vpaes_decrypt_core:
+.cfi_startproc
+ movq %rdx,%r9
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa .Lk_dipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movq %rax,%r11
+ psrld $4,%xmm1
+ movdqu (%r9),%xmm5
+ shlq $4,%r11
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa .Lk_dipt+16(%rip),%xmm0
+ xorq $0x30,%r11
+ leaq .Lk_dsbd(%rip),%r10
+.byte 102,15,56,0,193
+ andq $0x30,%r11
+ pxor %xmm5,%xmm2
+ movdqa .Lk_mc_forward+48(%rip),%xmm5
+ pxor %xmm2,%xmm0
+ addq $16,%r9
+ addq %r10,%r11
+ jmp .Ldec_entry
+
+.align 16
+.Ldec_loop:
+
+
+
+ movdqa -32(%r10),%xmm4
+ movdqa -16(%r10),%xmm1
+.byte 102,15,56,0,226
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa 0(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 16(%r10),%xmm1
+
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa 32(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 48(%r10),%xmm1
+
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa 64(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 80(%r10),%xmm1
+
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ addq $16,%r9
+.byte 102,15,58,15,237,12
+ pxor %xmm1,%xmm0
+ subq $1,%rax
+
+.Ldec_entry:
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ movdqa %xmm11,%xmm2
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa %xmm10,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,217
+ movdqa %xmm10,%xmm4
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211
+ movdqa %xmm10,%xmm3
+ pxor %xmm0,%xmm2
+.byte 102,15,56,0,220
+ movdqu (%r9),%xmm0
+ pxor %xmm1,%xmm3
+ jnz .Ldec_loop
+
+
+ movdqa 96(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 112(%r10),%xmm0
+ movdqa -352(%r11),%xmm2
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,194
+ ret
+.cfi_endproc
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+
+
+
+
+
+.type _vpaes_schedule_core,@function
+.align 16
+_vpaes_schedule_core:
+.cfi_startproc
+
+
+
+
+
+ call _vpaes_preheat
+ movdqa .Lk_rcon(%rip),%xmm8
+ movdqu (%rdi),%xmm0
+
+
+ movdqa %xmm0,%xmm3
+ leaq .Lk_ipt(%rip),%r11
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm7
+
+ leaq .Lk_sr(%rip),%r10
+ testq %rcx,%rcx
+ jnz .Lschedule_am_decrypting
+
+
+ movdqu %xmm0,(%rdx)
+ jmp .Lschedule_go
+
+.Lschedule_am_decrypting:
+
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,217
+ movdqu %xmm3,(%rdx)
+ xorq $0x30,%r8
+
+.Lschedule_go:
+ cmpl $192,%esi
+ ja .Lschedule_256
+ je .Lschedule_192
+
+
+
+
+
+
+
+
+
+
+.Lschedule_128:
+ movl $10,%esi
+
+.Loop_schedule_128:
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+ jmp .Loop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_192:
+ movdqu 8(%rdi),%xmm0
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm6
+ pxor %xmm4,%xmm4
+ movhlps %xmm4,%xmm6
+ movl $4,%esi
+
+.Loop_schedule_192:
+ call _vpaes_schedule_round
+.byte 102,15,58,15,198,8
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ jmp .Loop_schedule_192
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_256:
+ movdqu 16(%rdi),%xmm0
+ call _vpaes_schedule_transform
+ movl $7,%esi
+
+.Loop_schedule_256:
+ call _vpaes_schedule_mangle
+ movdqa %xmm0,%xmm6
+
+
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+
+
+ pshufd $0xFF,%xmm0,%xmm0
+ movdqa %xmm7,%xmm5
+ movdqa %xmm6,%xmm7
+ call _vpaes_schedule_low_round
+ movdqa %xmm5,%xmm7
+
+ jmp .Loop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_mangle_last:
+
+ leaq .Lk_deskew(%rip),%r11
+ testq %rcx,%rcx
+ jnz .Lschedule_mangle_last_dec
+
+
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,193
+ leaq .Lk_opt(%rip),%r11
+ addq $32,%rdx
+
+.Lschedule_mangle_last_dec:
+ addq $-16,%rdx
+ pxor .Lk_s63(%rip),%xmm0
+ call _vpaes_schedule_transform
+ movdqu %xmm0,(%rdx)
+
+
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ ret
+.cfi_endproc
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_192_smear,@function
+.align 16
+_vpaes_schedule_192_smear:
+.cfi_startproc
+ pshufd $0x80,%xmm6,%xmm1
+ pshufd $0xFE,%xmm7,%xmm0
+ pxor %xmm1,%xmm6
+ pxor %xmm1,%xmm1
+ pxor %xmm0,%xmm6
+ movdqa %xmm6,%xmm0
+ movhlps %xmm1,%xmm6
+ ret
+.cfi_endproc
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_round,@function
+.align 16
+_vpaes_schedule_round:
+.cfi_startproc
+
+ pxor %xmm1,%xmm1
+.byte 102,65,15,58,15,200,15
+.byte 102,69,15,58,15,192,15
+ pxor %xmm1,%xmm7
+
+
+ pshufd $0xFF,%xmm0,%xmm0
+.byte 102,15,58,15,192,1
+
+
+
+
+_vpaes_schedule_low_round:
+
+ movdqa %xmm7,%xmm1
+ pslldq $4,%xmm7
+ pxor %xmm1,%xmm7
+ movdqa %xmm7,%xmm1
+ pslldq $8,%xmm7
+ pxor %xmm1,%xmm7
+ pxor .Lk_s63(%rip),%xmm7
+
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa %xmm11,%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm10,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqa %xmm13,%xmm4
+.byte 102,15,56,0,226
+ movdqa %xmm12,%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+
+
+ pxor %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ ret
+.cfi_endproc
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_transform,@function
+.align 16
+_vpaes_schedule_transform:
+.cfi_startproc
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa (%r11),%xmm2
+.byte 102,15,56,0,208
+ movdqa 16(%r11),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm2,%xmm0
+ ret
+.cfi_endproc
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_mangle,@function
+.align 16
+_vpaes_schedule_mangle:
+.cfi_startproc
+ movdqa %xmm0,%xmm4
+ movdqa .Lk_mc_forward(%rip),%xmm5
+ testq %rcx,%rcx
+ jnz .Lschedule_mangle_dec
+
+
+ addq $16,%rdx
+ pxor .Lk_s63(%rip),%xmm4
+.byte 102,15,56,0,229
+ movdqa %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+
+ jmp .Lschedule_mangle_both
+.align 16
+.Lschedule_mangle_dec:
+
+ leaq .Lk_dksd(%rip),%r11
+ movdqa %xmm9,%xmm1
+ pandn %xmm4,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm4
+
+ movdqa 0(%r11),%xmm2
+.byte 102,15,56,0,212
+ movdqa 16(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 32(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 48(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 64(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 80(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 96(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 112(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+
+ addq $-16,%rdx
+
+.Lschedule_mangle_both:
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,217
+ addq $-16,%r8
+ andq $0x30,%r8
+ movdqu %xmm3,(%rdx)
+ ret
+.cfi_endproc
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+
+
+
+.globl vpaes_set_encrypt_key
+.hidden vpaes_set_encrypt_key
+.type vpaes_set_encrypt_key,@function
+.align 16
+vpaes_set_encrypt_key:
+.cfi_startproc
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+ movb $1,BORINGSSL_function_hit+5(%rip)
+#endif
+
+ movl %esi,%eax
+ shrl $5,%eax
+ addl $5,%eax
+ movl %eax,240(%rdx)
+
+ movl $0,%ecx
+ movl $0x30,%r8d
+ call _vpaes_schedule_core
+ xorl %eax,%eax
+ ret
+.cfi_endproc
+.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl vpaes_set_decrypt_key
+.hidden vpaes_set_decrypt_key
+.type vpaes_set_decrypt_key,@function
+.align 16
+vpaes_set_decrypt_key:
+.cfi_startproc
+_CET_ENDBR
+ movl %esi,%eax
+ shrl $5,%eax
+ addl $5,%eax
+ movl %eax,240(%rdx)
+ shll $4,%eax
+ leaq 16(%rdx,%rax,1),%rdx
+
+ movl $1,%ecx
+ movl %esi,%r8d
+ shrl $1,%r8d
+ andl $32,%r8d
+ xorl $32,%r8d
+ call _vpaes_schedule_core
+ xorl %eax,%eax
+ ret
+.cfi_endproc
+.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+
+.globl vpaes_encrypt
+.hidden vpaes_encrypt
+.type vpaes_encrypt,@function
+.align 16
+vpaes_encrypt:
+.cfi_startproc
+_CET_ENDBR
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+ movb $1,BORINGSSL_function_hit+4(%rip)
+#endif
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_encrypt_core
+ movdqu %xmm0,(%rsi)
+ ret
+.cfi_endproc
+.size vpaes_encrypt,.-vpaes_encrypt
+
+.globl vpaes_decrypt
+.hidden vpaes_decrypt
+.type vpaes_decrypt,@function
+.align 16
+vpaes_decrypt:
+.cfi_startproc
+_CET_ENDBR
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_decrypt_core
+ movdqu %xmm0,(%rsi)
+ ret
+.cfi_endproc
+.size vpaes_decrypt,.-vpaes_decrypt
+.globl vpaes_cbc_encrypt
+.hidden vpaes_cbc_encrypt
+.type vpaes_cbc_encrypt,@function
+.align 16
+vpaes_cbc_encrypt:
+.cfi_startproc
+_CET_ENDBR
+ xchgq %rcx,%rdx
+ subq $16,%rcx
+ jc .Lcbc_abort
+ movdqu (%r8),%xmm6
+ subq %rdi,%rsi
+ call _vpaes_preheat
+ cmpl $0,%r9d
+ je .Lcbc_dec_loop
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movdqu (%rdi),%xmm0
+ pxor %xmm6,%xmm0
+ call _vpaes_encrypt_core
+ movdqa %xmm0,%xmm6
+ movdqu %xmm0,(%rsi,%rdi,1)
+ leaq 16(%rdi),%rdi
+ subq $16,%rcx
+ jnc .Lcbc_enc_loop
+ jmp .Lcbc_done
+.align 16
+.Lcbc_dec_loop:
+ movdqu (%rdi),%xmm0
+ movdqa %xmm0,%xmm7
+ call _vpaes_decrypt_core
+ pxor %xmm6,%xmm0
+ movdqa %xmm7,%xmm6
+ movdqu %xmm0,(%rsi,%rdi,1)
+ leaq 16(%rdi),%rdi
+ subq $16,%rcx
+ jnc .Lcbc_dec_loop
+.Lcbc_done:
+ movdqu %xmm6,(%r8)
+.Lcbc_abort:
+ ret
+.cfi_endproc
+.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+.globl vpaes_ctr32_encrypt_blocks
+.hidden vpaes_ctr32_encrypt_blocks
+.type vpaes_ctr32_encrypt_blocks,@function
+.align 16
+vpaes_ctr32_encrypt_blocks:
+.cfi_startproc
+_CET_ENDBR
+
+ xchgq %rcx,%rdx
+ testq %rcx,%rcx
+ jz .Lctr32_abort
+ movdqu (%r8),%xmm0
+ movdqa .Lctr_add_one(%rip),%xmm8
+ subq %rdi,%rsi
+ call _vpaes_preheat
+ movdqa %xmm0,%xmm6
+ pshufb .Lrev_ctr(%rip),%xmm6
+
+ testq $1,%rcx
+ jz .Lctr32_prep_loop
+
+
+
+ movdqu (%rdi),%xmm7
+ call _vpaes_encrypt_core
+ pxor %xmm7,%xmm0
+ paddd %xmm8,%xmm6
+ movdqu %xmm0,(%rsi,%rdi,1)
+ subq $1,%rcx
+ leaq 16(%rdi),%rdi
+ jz .Lctr32_done
+
+.Lctr32_prep_loop:
+
+
+ movdqa %xmm6,%xmm14
+ movdqa %xmm6,%xmm15
+ paddd %xmm8,%xmm15
+
+.Lctr32_loop:
+ movdqa .Lrev_ctr(%rip),%xmm1
+ movdqa %xmm14,%xmm0
+ movdqa %xmm15,%xmm6
+.byte 102,15,56,0,193
+.byte 102,15,56,0,241
+ call _vpaes_encrypt_core_2x
+ movdqu (%rdi),%xmm1
+ movdqu 16(%rdi),%xmm2
+ movdqa .Lctr_add_two(%rip),%xmm3
+ pxor %xmm1,%xmm0
+ pxor %xmm2,%xmm6
+ paddd %xmm3,%xmm14
+ paddd %xmm3,%xmm15
+ movdqu %xmm0,(%rsi,%rdi,1)
+ movdqu %xmm6,16(%rsi,%rdi,1)
+ subq $2,%rcx
+ leaq 32(%rdi),%rdi
+ jnz .Lctr32_loop
+
+.Lctr32_done:
+.Lctr32_abort:
+ ret
+.cfi_endproc
+.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
+
+
+
+
+
+
+.type _vpaes_preheat,@function
+.align 16
+_vpaes_preheat:
+.cfi_startproc
+ leaq .Lk_s0F(%rip),%r10
+ movdqa -32(%r10),%xmm10
+ movdqa -16(%r10),%xmm11
+ movdqa 0(%r10),%xmm9
+ movdqa 48(%r10),%xmm13
+ movdqa 64(%r10),%xmm12
+ movdqa 80(%r10),%xmm15
+ movdqa 96(%r10),%xmm14
+ ret
+.cfi_endproc
+.size _vpaes_preheat,.-_vpaes_preheat
+
+
+
+
+
+.type _vpaes_consts,@object
+.section .rodata
+.align 64
+_vpaes_consts:
+.Lk_inv:
+.quad 0x0E05060F0D080180, 0x040703090A0B0C02
+.quad 0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F:
+.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt:
+.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1:
+.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2:
+.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo:
+.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward:
+.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad 0x080B0A0904070605, 0x000302010C0F0E0D
+.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad 0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:
+.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad 0x020100030E0D0C0F, 0x0A09080B06050407
+.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad 0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr:
+.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad 0x030E09040F0A0500, 0x0B06010C07020D08
+.quad 0x0F060D040B020900, 0x070E050C030A0108
+.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon:
+.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63:
+.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt:
+.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew:
+.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+
+
+
+
+.Lk_dksd:
+.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:
+.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:
+.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:
+.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+
+
+
+
+.Lk_dipt:
+.quad 0x0F505B040B545F00, 0x154A411E114E451A
+.quad 0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9:
+.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:
+.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:
+.quad 0xD022649296B44200, 0x602646F6B0F2D404
+.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:
+.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo:
+.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+
+
+.Lrev_ctr:
+.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
+
+
+.Lctr_add_one:
+.quad 0x0000000000000000, 0x0000000100000000
+.Lctr_add_two:
+.quad 0x0000000000000000, 0x0000000200000000
+
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align 64
+.size _vpaes_consts,.-_vpaes_consts
+.text
+#endif
diff --git a/gen/bcm/vpaes-x86_64-win.asm b/gen/bcm/vpaes-x86_64-win.asm
new file mode 100644
index 0000000..ddbfb12
--- /dev/null
+++ b/gen/bcm/vpaes-x86_64-win.asm
@@ -0,0 +1,1487 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .text code align=64
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN 16
+_vpaes_encrypt_core:
+
+ mov r9,rdx
+ mov r11,16
+ mov eax,DWORD[240+rdx]
+ movdqa xmm1,xmm9
+ movdqa xmm2,XMMWORD[$L$k_ipt]
+ pandn xmm1,xmm0
+ movdqu xmm5,XMMWORD[r9]
+ psrld xmm1,4
+ pand xmm0,xmm9
+DB 102,15,56,0,208
+ movdqa xmm0,XMMWORD[(($L$k_ipt+16))]
+DB 102,15,56,0,193
+ pxor xmm2,xmm5
+ add r9,16
+ pxor xmm0,xmm2
+ lea r10,[$L$k_mc_backward]
+ jmp NEAR $L$enc_entry
+
+ALIGN 16
+$L$enc_loop:
+
+ movdqa xmm4,xmm13
+ movdqa xmm0,xmm12
+DB 102,15,56,0,226
+DB 102,15,56,0,195
+ pxor xmm4,xmm5
+ movdqa xmm5,xmm15
+ pxor xmm0,xmm4
+ movdqa xmm1,XMMWORD[((-64))+r10*1+r11]
+DB 102,15,56,0,234
+ movdqa xmm4,XMMWORD[r10*1+r11]
+ movdqa xmm2,xmm14
+DB 102,15,56,0,211
+ movdqa xmm3,xmm0
+ pxor xmm2,xmm5
+DB 102,15,56,0,193
+ add r9,16
+ pxor xmm0,xmm2
+DB 102,15,56,0,220
+ add r11,16
+ pxor xmm3,xmm0
+DB 102,15,56,0,193
+ and r11,0x30
+ sub rax,1
+ pxor xmm0,xmm3
+
+$L$enc_entry:
+
+ movdqa xmm1,xmm9
+ movdqa xmm5,xmm11
+ pandn xmm1,xmm0
+ psrld xmm1,4
+ pand xmm0,xmm9
+DB 102,15,56,0,232
+ movdqa xmm3,xmm10
+ pxor xmm0,xmm1
+DB 102,15,56,0,217
+ movdqa xmm4,xmm10
+ pxor xmm3,xmm5
+DB 102,15,56,0,224
+ movdqa xmm2,xmm10
+ pxor xmm4,xmm5
+DB 102,15,56,0,211
+ movdqa xmm3,xmm10
+ pxor xmm2,xmm0
+DB 102,15,56,0,220
+ movdqu xmm5,XMMWORD[r9]
+ pxor xmm3,xmm1
+ jnz NEAR $L$enc_loop
+
+
+ movdqa xmm4,XMMWORD[((-96))+r10]
+ movdqa xmm0,XMMWORD[((-80))+r10]
+DB 102,15,56,0,226
+ pxor xmm4,xmm5
+DB 102,15,56,0,195
+ movdqa xmm1,XMMWORD[64+r10*1+r11]
+ pxor xmm0,xmm4
+DB 102,15,56,0,193
+ ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN 16
+_vpaes_encrypt_core_2x:
+
+ mov r9,rdx
+ mov r11,16
+ mov eax,DWORD[240+rdx]
+ movdqa xmm1,xmm9
+ movdqa xmm7,xmm9
+ movdqa xmm2,XMMWORD[$L$k_ipt]
+ movdqa xmm8,xmm2
+ pandn xmm1,xmm0
+ pandn xmm7,xmm6
+ movdqu xmm5,XMMWORD[r9]
+
+ psrld xmm1,4
+ psrld xmm7,4
+ pand xmm0,xmm9
+ pand xmm6,xmm9
+DB 102,15,56,0,208
+DB 102,68,15,56,0,198
+ movdqa xmm0,XMMWORD[(($L$k_ipt+16))]
+ movdqa xmm6,xmm0
+DB 102,15,56,0,193
+DB 102,15,56,0,247
+ pxor xmm2,xmm5
+ pxor xmm8,xmm5
+ add r9,16
+ pxor xmm0,xmm2
+ pxor xmm6,xmm8
+ lea r10,[$L$k_mc_backward]
+ jmp NEAR $L$enc2x_entry
+
+ALIGN 16
+$L$enc2x_loop:
+
+ movdqa xmm4,XMMWORD[$L$k_sb1]
+ movdqa xmm0,XMMWORD[(($L$k_sb1+16))]
+ movdqa xmm12,xmm4
+ movdqa xmm6,xmm0
+DB 102,15,56,0,226
+DB 102,69,15,56,0,224
+DB 102,15,56,0,195
+DB 102,65,15,56,0,243
+ pxor xmm4,xmm5
+ pxor xmm12,xmm5
+ movdqa xmm5,XMMWORD[$L$k_sb2]
+ movdqa xmm13,xmm5
+ pxor xmm0,xmm4
+ pxor xmm6,xmm12
+ movdqa xmm1,XMMWORD[((-64))+r10*1+r11]
+
+DB 102,15,56,0,234
+DB 102,69,15,56,0,232
+ movdqa xmm4,XMMWORD[r10*1+r11]
+
+ movdqa xmm2,XMMWORD[(($L$k_sb2+16))]
+ movdqa xmm8,xmm2
+DB 102,15,56,0,211
+DB 102,69,15,56,0,195
+ movdqa xmm3,xmm0
+ movdqa xmm11,xmm6
+ pxor xmm2,xmm5
+ pxor xmm8,xmm13
+DB 102,15,56,0,193
+DB 102,15,56,0,241
+ add r9,16
+ pxor xmm0,xmm2
+ pxor xmm6,xmm8
+DB 102,15,56,0,220
+DB 102,68,15,56,0,220
+ add r11,16
+ pxor xmm3,xmm0
+ pxor xmm11,xmm6
+DB 102,15,56,0,193
+DB 102,15,56,0,241
+ and r11,0x30
+ sub rax,1
+ pxor xmm0,xmm3
+ pxor xmm6,xmm11
+
+$L$enc2x_entry:
+
+ movdqa xmm1,xmm9
+ movdqa xmm7,xmm9
+ movdqa xmm5,XMMWORD[(($L$k_inv+16))]
+ movdqa xmm13,xmm5
+ pandn xmm1,xmm0
+ pandn xmm7,xmm6
+ psrld xmm1,4
+ psrld xmm7,4
+ pand xmm0,xmm9
+ pand xmm6,xmm9
+DB 102,15,56,0,232
+DB 102,68,15,56,0,238
+ movdqa xmm3,xmm10
+ movdqa xmm11,xmm10
+ pxor xmm0,xmm1
+ pxor xmm6,xmm7
+DB 102,15,56,0,217
+DB 102,68,15,56,0,223
+ movdqa xmm4,xmm10
+ movdqa xmm12,xmm10
+ pxor xmm3,xmm5
+ pxor xmm11,xmm13
+DB 102,15,56,0,224
+DB 102,68,15,56,0,230
+ movdqa xmm2,xmm10
+ movdqa xmm8,xmm10
+ pxor xmm4,xmm5
+ pxor xmm12,xmm13
+DB 102,15,56,0,211
+DB 102,69,15,56,0,195
+ movdqa xmm3,xmm10
+ movdqa xmm11,xmm10
+ pxor xmm2,xmm0
+ pxor xmm8,xmm6
+DB 102,15,56,0,220
+DB 102,69,15,56,0,220
+ movdqu xmm5,XMMWORD[r9]
+
+ pxor xmm3,xmm1
+ pxor xmm11,xmm7
+ jnz NEAR $L$enc2x_loop
+
+
+ movdqa xmm4,XMMWORD[((-96))+r10]
+ movdqa xmm0,XMMWORD[((-80))+r10]
+ movdqa xmm12,xmm4
+ movdqa xmm6,xmm0
+DB 102,15,56,0,226
+DB 102,69,15,56,0,224
+ pxor xmm4,xmm5
+ pxor xmm12,xmm5
+DB 102,15,56,0,195
+DB 102,65,15,56,0,243
+ movdqa xmm1,XMMWORD[64+r10*1+r11]
+
+ pxor xmm0,xmm4
+ pxor xmm6,xmm12
+DB 102,15,56,0,193
+DB 102,15,56,0,241
+ ret
+
+
+
+
+
+
+
+
+
+ALIGN 16
+_vpaes_decrypt_core:
+
+ mov r9,rdx
+ mov eax,DWORD[240+rdx]
+ movdqa xmm1,xmm9
+ movdqa xmm2,XMMWORD[$L$k_dipt]
+ pandn xmm1,xmm0
+ mov r11,rax
+ psrld xmm1,4
+ movdqu xmm5,XMMWORD[r9]
+ shl r11,4
+ pand xmm0,xmm9
+DB 102,15,56,0,208
+ movdqa xmm0,XMMWORD[(($L$k_dipt+16))]
+ xor r11,0x30
+ lea r10,[$L$k_dsbd]
+DB 102,15,56,0,193
+ and r11,0x30
+ pxor xmm2,xmm5
+ movdqa xmm5,XMMWORD[(($L$k_mc_forward+48))]
+ pxor xmm0,xmm2
+ add r9,16
+ add r11,r10
+ jmp NEAR $L$dec_entry
+
+ALIGN 16
+$L$dec_loop:
+
+
+
+ movdqa xmm4,XMMWORD[((-32))+r10]
+ movdqa xmm1,XMMWORD[((-16))+r10]
+DB 102,15,56,0,226
+DB 102,15,56,0,203
+ pxor xmm0,xmm4
+ movdqa xmm4,XMMWORD[r10]
+ pxor xmm0,xmm1
+ movdqa xmm1,XMMWORD[16+r10]
+
+DB 102,15,56,0,226
+DB 102,15,56,0,197
+DB 102,15,56,0,203
+ pxor xmm0,xmm4
+ movdqa xmm4,XMMWORD[32+r10]
+ pxor xmm0,xmm1
+ movdqa xmm1,XMMWORD[48+r10]
+
+DB 102,15,56,0,226
+DB 102,15,56,0,197
+DB 102,15,56,0,203
+ pxor xmm0,xmm4
+ movdqa xmm4,XMMWORD[64+r10]
+ pxor xmm0,xmm1
+ movdqa xmm1,XMMWORD[80+r10]
+
+DB 102,15,56,0,226
+DB 102,15,56,0,197
+DB 102,15,56,0,203
+ pxor xmm0,xmm4
+ add r9,16
+DB 102,15,58,15,237,12
+ pxor xmm0,xmm1
+ sub rax,1
+
+$L$dec_entry:
+
+ movdqa xmm1,xmm9
+ pandn xmm1,xmm0
+ movdqa xmm2,xmm11
+ psrld xmm1,4
+ pand xmm0,xmm9
+DB 102,15,56,0,208
+ movdqa xmm3,xmm10
+ pxor xmm0,xmm1
+DB 102,15,56,0,217
+ movdqa xmm4,xmm10
+ pxor xmm3,xmm2
+DB 102,15,56,0,224
+ pxor xmm4,xmm2
+ movdqa xmm2,xmm10
+DB 102,15,56,0,211
+ movdqa xmm3,xmm10
+ pxor xmm2,xmm0
+DB 102,15,56,0,220
+ movdqu xmm0,XMMWORD[r9]
+ pxor xmm3,xmm1
+ jnz NEAR $L$dec_loop
+
+
+ movdqa xmm4,XMMWORD[96+r10]
+DB 102,15,56,0,226
+ pxor xmm4,xmm0
+ movdqa xmm0,XMMWORD[112+r10]
+ movdqa xmm2,XMMWORD[((-352))+r11]
+DB 102,15,56,0,195
+ pxor xmm0,xmm4
+DB 102,15,56,0,194
+ ret
+
+
+
+
+
+
+
+
+
+ALIGN 16
+_vpaes_schedule_core:
+
+
+
+
+
+
+ call _vpaes_preheat
+ movdqa xmm8,XMMWORD[$L$k_rcon]
+ movdqu xmm0,XMMWORD[rdi]
+
+
+ movdqa xmm3,xmm0
+ lea r11,[$L$k_ipt]
+ call _vpaes_schedule_transform
+ movdqa xmm7,xmm0
+
+ lea r10,[$L$k_sr]
+ test rcx,rcx
+ jnz NEAR $L$schedule_am_decrypting
+
+
+ movdqu XMMWORD[rdx],xmm0
+ jmp NEAR $L$schedule_go
+
+$L$schedule_am_decrypting:
+
+ movdqa xmm1,XMMWORD[r10*1+r8]
+DB 102,15,56,0,217
+ movdqu XMMWORD[rdx],xmm3
+ xor r8,0x30
+
+$L$schedule_go:
+ cmp esi,192
+ ja NEAR $L$schedule_256
+ je NEAR $L$schedule_192
+
+
+
+
+
+
+
+
+
+
+$L$schedule_128:
+ mov esi,10
+
+$L$oop_schedule_128:
+ call _vpaes_schedule_round
+ dec rsi
+ jz NEAR $L$schedule_mangle_last
+ call _vpaes_schedule_mangle
+ jmp NEAR $L$oop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN 16
+$L$schedule_192:
+ movdqu xmm0,XMMWORD[8+rdi]
+ call _vpaes_schedule_transform
+ movdqa xmm6,xmm0
+ pxor xmm4,xmm4
+ movhlps xmm6,xmm4
+ mov esi,4
+
+$L$oop_schedule_192:
+ call _vpaes_schedule_round
+DB 102,15,58,15,198,8
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_round
+ dec rsi
+ jz NEAR $L$schedule_mangle_last
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ jmp NEAR $L$oop_schedule_192
+
+
+
+
+
+
+
+
+
+
+
+ALIGN 16
+$L$schedule_256:
+ movdqu xmm0,XMMWORD[16+rdi]
+ call _vpaes_schedule_transform
+ mov esi,7
+
+$L$oop_schedule_256:
+ call _vpaes_schedule_mangle
+ movdqa xmm6,xmm0
+
+
+ call _vpaes_schedule_round
+ dec rsi
+ jz NEAR $L$schedule_mangle_last
+ call _vpaes_schedule_mangle
+
+
+ pshufd xmm0,xmm0,0xFF
+ movdqa xmm5,xmm7
+ movdqa xmm7,xmm6
+ call _vpaes_schedule_low_round
+ movdqa xmm7,xmm5
+
+ jmp NEAR $L$oop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN 16
+$L$schedule_mangle_last:
+
+ lea r11,[$L$k_deskew]
+ test rcx,rcx
+ jnz NEAR $L$schedule_mangle_last_dec
+
+
+ movdqa xmm1,XMMWORD[r10*1+r8]
+DB 102,15,56,0,193
+ lea r11,[$L$k_opt]
+ add rdx,32
+
+$L$schedule_mangle_last_dec:
+ add rdx,-16
+ pxor xmm0,XMMWORD[$L$k_s63]
+ call _vpaes_schedule_transform
+ movdqu XMMWORD[rdx],xmm0
+
+
+ pxor xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ pxor xmm6,xmm6
+ pxor xmm7,xmm7
+ ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN 16
+_vpaes_schedule_192_smear:
+
+ pshufd xmm1,xmm6,0x80
+ pshufd xmm0,xmm7,0xFE
+ pxor xmm6,xmm1
+ pxor xmm1,xmm1
+ pxor xmm6,xmm0
+ movdqa xmm0,xmm6
+ movhlps xmm6,xmm1
+ ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN 16
+_vpaes_schedule_round:
+
+
+ pxor xmm1,xmm1
+DB 102,65,15,58,15,200,15
+DB 102,69,15,58,15,192,15
+ pxor xmm7,xmm1
+
+
+ pshufd xmm0,xmm0,0xFF
+DB 102,15,58,15,192,1
+
+
+
+
+_vpaes_schedule_low_round:
+
+ movdqa xmm1,xmm7
+ pslldq xmm7,4
+ pxor xmm7,xmm1
+ movdqa xmm1,xmm7
+ pslldq xmm7,8
+ pxor xmm7,xmm1
+ pxor xmm7,XMMWORD[$L$k_s63]
+
+
+ movdqa xmm1,xmm9
+ pandn xmm1,xmm0
+ psrld xmm1,4
+ pand xmm0,xmm9
+ movdqa xmm2,xmm11
+DB 102,15,56,0,208
+ pxor xmm0,xmm1
+ movdqa xmm3,xmm10
+DB 102,15,56,0,217
+ pxor xmm3,xmm2
+ movdqa xmm4,xmm10
+DB 102,15,56,0,224
+ pxor xmm4,xmm2
+ movdqa xmm2,xmm10
+DB 102,15,56,0,211
+ pxor xmm2,xmm0
+ movdqa xmm3,xmm10
+DB 102,15,56,0,220
+ pxor xmm3,xmm1
+ movdqa xmm4,xmm13
+DB 102,15,56,0,226
+ movdqa xmm0,xmm12
+DB 102,15,56,0,195
+ pxor xmm0,xmm4
+
+
+ pxor xmm0,xmm7
+ movdqa xmm7,xmm0
+ ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN 16
+_vpaes_schedule_transform:
+
+ movdqa xmm1,xmm9
+ pandn xmm1,xmm0
+ psrld xmm1,4
+ pand xmm0,xmm9
+ movdqa xmm2,XMMWORD[r11]
+DB 102,15,56,0,208
+ movdqa xmm0,XMMWORD[16+r11]
+DB 102,15,56,0,193
+ pxor xmm0,xmm2
+ ret
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ALIGN 16
+_vpaes_schedule_mangle:
+
+ movdqa xmm4,xmm0
+ movdqa xmm5,XMMWORD[$L$k_mc_forward]
+ test rcx,rcx
+ jnz NEAR $L$schedule_mangle_dec
+
+
+ add rdx,16
+ pxor xmm4,XMMWORD[$L$k_s63]
+DB 102,15,56,0,229
+ movdqa xmm3,xmm4
+DB 102,15,56,0,229
+ pxor xmm3,xmm4
+DB 102,15,56,0,229
+ pxor xmm3,xmm4
+
+ jmp NEAR $L$schedule_mangle_both
+ALIGN 16
+$L$schedule_mangle_dec:
+
+ lea r11,[$L$k_dksd]
+ movdqa xmm1,xmm9
+ pandn xmm1,xmm4
+ psrld xmm1,4
+ pand xmm4,xmm9
+
+ movdqa xmm2,XMMWORD[r11]
+DB 102,15,56,0,212
+ movdqa xmm3,XMMWORD[16+r11]
+DB 102,15,56,0,217
+ pxor xmm3,xmm2
+DB 102,15,56,0,221
+
+ movdqa xmm2,XMMWORD[32+r11]
+DB 102,15,56,0,212
+ pxor xmm2,xmm3
+ movdqa xmm3,XMMWORD[48+r11]
+DB 102,15,56,0,217
+ pxor xmm3,xmm2
+DB 102,15,56,0,221
+
+ movdqa xmm2,XMMWORD[64+r11]
+DB 102,15,56,0,212
+ pxor xmm2,xmm3
+ movdqa xmm3,XMMWORD[80+r11]
+DB 102,15,56,0,217
+ pxor xmm3,xmm2
+DB 102,15,56,0,221
+
+ movdqa xmm2,XMMWORD[96+r11]
+DB 102,15,56,0,212
+ pxor xmm2,xmm3
+ movdqa xmm3,XMMWORD[112+r11]
+DB 102,15,56,0,217
+ pxor xmm3,xmm2
+
+ add rdx,-16
+
+$L$schedule_mangle_both:
+ movdqa xmm1,XMMWORD[r10*1+r8]
+DB 102,15,56,0,217
+ add r8,-16
+ and r8,0x30
+ movdqu XMMWORD[rdx],xmm3
+ ret
+
+
+
+
+
+
+global vpaes_set_encrypt_key
+
+ALIGN 16
+vpaes_set_encrypt_key:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_vpaes_set_encrypt_key:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+%ifdef BORINGSSL_DISPATCH_TEST
+EXTERN BORINGSSL_function_hit
+ mov BYTE[((BORINGSSL_function_hit+5))],1
+%endif
+
+ lea rsp,[((-184))+rsp]
+ movaps XMMWORD[16+rsp],xmm6
+ movaps XMMWORD[32+rsp],xmm7
+ movaps XMMWORD[48+rsp],xmm8
+ movaps XMMWORD[64+rsp],xmm9
+ movaps XMMWORD[80+rsp],xmm10
+ movaps XMMWORD[96+rsp],xmm11
+ movaps XMMWORD[112+rsp],xmm12
+ movaps XMMWORD[128+rsp],xmm13
+ movaps XMMWORD[144+rsp],xmm14
+ movaps XMMWORD[160+rsp],xmm15
+$L$enc_key_body:
+ mov eax,esi
+ shr eax,5
+ add eax,5
+ mov DWORD[240+rdx],eax
+
+ mov ecx,0
+ mov r8d,0x30
+ call _vpaes_schedule_core
+ movaps xmm6,XMMWORD[16+rsp]
+ movaps xmm7,XMMWORD[32+rsp]
+ movaps xmm8,XMMWORD[48+rsp]
+ movaps xmm9,XMMWORD[64+rsp]
+ movaps xmm10,XMMWORD[80+rsp]
+ movaps xmm11,XMMWORD[96+rsp]
+ movaps xmm12,XMMWORD[112+rsp]
+ movaps xmm13,XMMWORD[128+rsp]
+ movaps xmm14,XMMWORD[144+rsp]
+ movaps xmm15,XMMWORD[160+rsp]
+ lea rsp,[184+rsp]
+$L$enc_key_epilogue:
+ xor eax,eax
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_vpaes_set_encrypt_key:
+
+global vpaes_set_decrypt_key
+
+ALIGN 16
+vpaes_set_decrypt_key:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_vpaes_set_decrypt_key:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ lea rsp,[((-184))+rsp]
+ movaps XMMWORD[16+rsp],xmm6
+ movaps XMMWORD[32+rsp],xmm7
+ movaps XMMWORD[48+rsp],xmm8
+ movaps XMMWORD[64+rsp],xmm9
+ movaps XMMWORD[80+rsp],xmm10
+ movaps XMMWORD[96+rsp],xmm11
+ movaps XMMWORD[112+rsp],xmm12
+ movaps XMMWORD[128+rsp],xmm13
+ movaps XMMWORD[144+rsp],xmm14
+ movaps XMMWORD[160+rsp],xmm15
+$L$dec_key_body:
+ mov eax,esi
+ shr eax,5
+ add eax,5
+ mov DWORD[240+rdx],eax
+ shl eax,4
+ lea rdx,[16+rax*1+rdx]
+
+ mov ecx,1
+ mov r8d,esi
+ shr r8d,1
+ and r8d,32
+ xor r8d,32
+ call _vpaes_schedule_core
+ movaps xmm6,XMMWORD[16+rsp]
+ movaps xmm7,XMMWORD[32+rsp]
+ movaps xmm8,XMMWORD[48+rsp]
+ movaps xmm9,XMMWORD[64+rsp]
+ movaps xmm10,XMMWORD[80+rsp]
+ movaps xmm11,XMMWORD[96+rsp]
+ movaps xmm12,XMMWORD[112+rsp]
+ movaps xmm13,XMMWORD[128+rsp]
+ movaps xmm14,XMMWORD[144+rsp]
+ movaps xmm15,XMMWORD[160+rsp]
+ lea rsp,[184+rsp]
+$L$dec_key_epilogue:
+ xor eax,eax
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_vpaes_set_decrypt_key:
+
+global vpaes_encrypt
+
+ALIGN 16
+vpaes_encrypt:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_vpaes_encrypt:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+%ifdef BORINGSSL_DISPATCH_TEST
+EXTERN BORINGSSL_function_hit
+ mov BYTE[((BORINGSSL_function_hit+4))],1
+%endif
+ lea rsp,[((-184))+rsp]
+ movaps XMMWORD[16+rsp],xmm6
+ movaps XMMWORD[32+rsp],xmm7
+ movaps XMMWORD[48+rsp],xmm8
+ movaps XMMWORD[64+rsp],xmm9
+ movaps XMMWORD[80+rsp],xmm10
+ movaps XMMWORD[96+rsp],xmm11
+ movaps XMMWORD[112+rsp],xmm12
+ movaps XMMWORD[128+rsp],xmm13
+ movaps XMMWORD[144+rsp],xmm14
+ movaps XMMWORD[160+rsp],xmm15
+$L$enc_body:
+ movdqu xmm0,XMMWORD[rdi]
+ call _vpaes_preheat
+ call _vpaes_encrypt_core
+ movdqu XMMWORD[rsi],xmm0
+ movaps xmm6,XMMWORD[16+rsp]
+ movaps xmm7,XMMWORD[32+rsp]
+ movaps xmm8,XMMWORD[48+rsp]
+ movaps xmm9,XMMWORD[64+rsp]
+ movaps xmm10,XMMWORD[80+rsp]
+ movaps xmm11,XMMWORD[96+rsp]
+ movaps xmm12,XMMWORD[112+rsp]
+ movaps xmm13,XMMWORD[128+rsp]
+ movaps xmm14,XMMWORD[144+rsp]
+ movaps xmm15,XMMWORD[160+rsp]
+ lea rsp,[184+rsp]
+$L$enc_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_vpaes_encrypt:
+
+global vpaes_decrypt
+
+ALIGN 16
+vpaes_decrypt:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_vpaes_decrypt:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ lea rsp,[((-184))+rsp]
+ movaps XMMWORD[16+rsp],xmm6
+ movaps XMMWORD[32+rsp],xmm7
+ movaps XMMWORD[48+rsp],xmm8
+ movaps XMMWORD[64+rsp],xmm9
+ movaps XMMWORD[80+rsp],xmm10
+ movaps XMMWORD[96+rsp],xmm11
+ movaps XMMWORD[112+rsp],xmm12
+ movaps XMMWORD[128+rsp],xmm13
+ movaps XMMWORD[144+rsp],xmm14
+ movaps XMMWORD[160+rsp],xmm15
+$L$dec_body:
+ movdqu xmm0,XMMWORD[rdi]
+ call _vpaes_preheat
+ call _vpaes_decrypt_core
+ movdqu XMMWORD[rsi],xmm0
+ movaps xmm6,XMMWORD[16+rsp]
+ movaps xmm7,XMMWORD[32+rsp]
+ movaps xmm8,XMMWORD[48+rsp]
+ movaps xmm9,XMMWORD[64+rsp]
+ movaps xmm10,XMMWORD[80+rsp]
+ movaps xmm11,XMMWORD[96+rsp]
+ movaps xmm12,XMMWORD[112+rsp]
+ movaps xmm13,XMMWORD[128+rsp]
+ movaps xmm14,XMMWORD[144+rsp]
+ movaps xmm15,XMMWORD[160+rsp]
+ lea rsp,[184+rsp]
+$L$dec_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_vpaes_decrypt:
+global vpaes_cbc_encrypt
+
+ALIGN 16
+vpaes_cbc_encrypt:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_vpaes_cbc_encrypt:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+ mov r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+ xchg rdx,rcx
+ sub rcx,16
+ jc NEAR $L$cbc_abort
+ lea rsp,[((-184))+rsp]
+ movaps XMMWORD[16+rsp],xmm6
+ movaps XMMWORD[32+rsp],xmm7
+ movaps XMMWORD[48+rsp],xmm8
+ movaps XMMWORD[64+rsp],xmm9
+ movaps XMMWORD[80+rsp],xmm10
+ movaps XMMWORD[96+rsp],xmm11
+ movaps XMMWORD[112+rsp],xmm12
+ movaps XMMWORD[128+rsp],xmm13
+ movaps XMMWORD[144+rsp],xmm14
+ movaps XMMWORD[160+rsp],xmm15
+$L$cbc_body:
+ movdqu xmm6,XMMWORD[r8]
+ sub rsi,rdi
+ call _vpaes_preheat
+ cmp r9d,0
+ je NEAR $L$cbc_dec_loop
+ jmp NEAR $L$cbc_enc_loop
+ALIGN 16
+$L$cbc_enc_loop:
+ movdqu xmm0,XMMWORD[rdi]
+ pxor xmm0,xmm6
+ call _vpaes_encrypt_core
+ movdqa xmm6,xmm0
+ movdqu XMMWORD[rdi*1+rsi],xmm0
+ lea rdi,[16+rdi]
+ sub rcx,16
+ jnc NEAR $L$cbc_enc_loop
+ jmp NEAR $L$cbc_done
+ALIGN 16
+$L$cbc_dec_loop:
+ movdqu xmm0,XMMWORD[rdi]
+ movdqa xmm7,xmm0
+ call _vpaes_decrypt_core
+ pxor xmm0,xmm6
+ movdqa xmm6,xmm7
+ movdqu XMMWORD[rdi*1+rsi],xmm0
+ lea rdi,[16+rdi]
+ sub rcx,16
+ jnc NEAR $L$cbc_dec_loop
+$L$cbc_done:
+ movdqu XMMWORD[r8],xmm6
+ movaps xmm6,XMMWORD[16+rsp]
+ movaps xmm7,XMMWORD[32+rsp]
+ movaps xmm8,XMMWORD[48+rsp]
+ movaps xmm9,XMMWORD[64+rsp]
+ movaps xmm10,XMMWORD[80+rsp]
+ movaps xmm11,XMMWORD[96+rsp]
+ movaps xmm12,XMMWORD[112+rsp]
+ movaps xmm13,XMMWORD[128+rsp]
+ movaps xmm14,XMMWORD[144+rsp]
+ movaps xmm15,XMMWORD[160+rsp]
+ lea rsp,[184+rsp]
+$L$cbc_epilogue:
+$L$cbc_abort:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_vpaes_cbc_encrypt:
+global vpaes_ctr32_encrypt_blocks
+
+ALIGN 16
+vpaes_ctr32_encrypt_blocks:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_vpaes_ctr32_encrypt_blocks:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+
+ xchg rdx,rcx
+ test rcx,rcx
+ jz NEAR $L$ctr32_abort
+ lea rsp,[((-184))+rsp]
+ movaps XMMWORD[16+rsp],xmm6
+ movaps XMMWORD[32+rsp],xmm7
+ movaps XMMWORD[48+rsp],xmm8
+ movaps XMMWORD[64+rsp],xmm9
+ movaps XMMWORD[80+rsp],xmm10
+ movaps XMMWORD[96+rsp],xmm11
+ movaps XMMWORD[112+rsp],xmm12
+ movaps XMMWORD[128+rsp],xmm13
+ movaps XMMWORD[144+rsp],xmm14
+ movaps XMMWORD[160+rsp],xmm15
+$L$ctr32_body:
+ movdqu xmm0,XMMWORD[r8]
+ movdqa xmm8,XMMWORD[$L$ctr_add_one]
+ sub rsi,rdi
+ call _vpaes_preheat
+ movdqa xmm6,xmm0
+ pshufb xmm6,XMMWORD[$L$rev_ctr]
+
+ test rcx,1
+ jz NEAR $L$ctr32_prep_loop
+
+
+
+ movdqu xmm7,XMMWORD[rdi]
+ call _vpaes_encrypt_core
+ pxor xmm0,xmm7
+ paddd xmm6,xmm8
+ movdqu XMMWORD[rdi*1+rsi],xmm0
+ sub rcx,1
+ lea rdi,[16+rdi]
+ jz NEAR $L$ctr32_done
+
+$L$ctr32_prep_loop:
+
+
+ movdqa xmm14,xmm6
+ movdqa xmm15,xmm6
+ paddd xmm15,xmm8
+
+$L$ctr32_loop:
+ movdqa xmm1,XMMWORD[$L$rev_ctr]
+ movdqa xmm0,xmm14
+ movdqa xmm6,xmm15
+DB 102,15,56,0,193
+DB 102,15,56,0,241
+ call _vpaes_encrypt_core_2x
+ movdqu xmm1,XMMWORD[rdi]
+ movdqu xmm2,XMMWORD[16+rdi]
+ movdqa xmm3,XMMWORD[$L$ctr_add_two]
+ pxor xmm0,xmm1
+ pxor xmm6,xmm2
+ paddd xmm14,xmm3
+ paddd xmm15,xmm3
+ movdqu XMMWORD[rdi*1+rsi],xmm0
+ movdqu XMMWORD[16+rdi*1+rsi],xmm6
+ sub rcx,2
+ lea rdi,[32+rdi]
+ jnz NEAR $L$ctr32_loop
+
+$L$ctr32_done:
+ movaps xmm6,XMMWORD[16+rsp]
+ movaps xmm7,XMMWORD[32+rsp]
+ movaps xmm8,XMMWORD[48+rsp]
+ movaps xmm9,XMMWORD[64+rsp]
+ movaps xmm10,XMMWORD[80+rsp]
+ movaps xmm11,XMMWORD[96+rsp]
+ movaps xmm12,XMMWORD[112+rsp]
+ movaps xmm13,XMMWORD[128+rsp]
+ movaps xmm14,XMMWORD[144+rsp]
+ movaps xmm15,XMMWORD[160+rsp]
+ lea rsp,[184+rsp]
+$L$ctr32_epilogue:
+$L$ctr32_abort:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_vpaes_ctr32_encrypt_blocks:
+
+
+
+
+
+
+
+ALIGN 16
+_vpaes_preheat:
+
+ lea r10,[$L$k_s0F]
+ movdqa xmm10,XMMWORD[((-32))+r10]
+ movdqa xmm11,XMMWORD[((-16))+r10]
+ movdqa xmm9,XMMWORD[r10]
+ movdqa xmm13,XMMWORD[48+r10]
+ movdqa xmm12,XMMWORD[64+r10]
+ movdqa xmm15,XMMWORD[80+r10]
+ movdqa xmm14,XMMWORD[96+r10]
+ ret
+
+
+
+
+
+
+
+
+section .rdata rdata align=8
+ALIGN 64
+_vpaes_consts:
+$L$k_inv:
+ DQ 0x0E05060F0D080180,0x040703090A0B0C02
+ DQ 0x01040A060F0B0780,0x030D0E0C02050809
+
+$L$k_s0F:
+ DQ 0x0F0F0F0F0F0F0F0F,0x0F0F0F0F0F0F0F0F
+
+$L$k_ipt:
+ DQ 0xC2B2E8985A2A7000,0xCABAE09052227808
+ DQ 0x4C01307D317C4D00,0xCD80B1FCB0FDCC81
+
+$L$k_sb1:
+ DQ 0xB19BE18FCB503E00,0xA5DF7A6E142AF544
+ DQ 0x3618D415FAE22300,0x3BF7CCC10D2ED9EF
+$L$k_sb2:
+ DQ 0xE27A93C60B712400,0x5EB7E955BC982FCD
+ DQ 0x69EB88400AE12900,0xC2A163C8AB82234A
+$L$k_sbo:
+ DQ 0xD0D26D176FBDC700,0x15AABF7AC502A878
+ DQ 0xCFE474A55FBB6A00,0x8E1E90D1412B35FA
+
+$L$k_mc_forward:
+ DQ 0x0407060500030201,0x0C0F0E0D080B0A09
+ DQ 0x080B0A0904070605,0x000302010C0F0E0D
+ DQ 0x0C0F0E0D080B0A09,0x0407060500030201
+ DQ 0x000302010C0F0E0D,0x080B0A0904070605
+
+$L$k_mc_backward:
+ DQ 0x0605040702010003,0x0E0D0C0F0A09080B
+ DQ 0x020100030E0D0C0F,0x0A09080B06050407
+ DQ 0x0E0D0C0F0A09080B,0x0605040702010003
+ DQ 0x0A09080B06050407,0x020100030E0D0C0F
+
+$L$k_sr:
+ DQ 0x0706050403020100,0x0F0E0D0C0B0A0908
+ DQ 0x030E09040F0A0500,0x0B06010C07020D08
+ DQ 0x0F060D040B020900,0x070E050C030A0108
+ DQ 0x0B0E0104070A0D00,0x0306090C0F020508
+
+$L$k_rcon:
+ DQ 0x1F8391B9AF9DEEB6,0x702A98084D7C7D81
+
+$L$k_s63:
+ DQ 0x5B5B5B5B5B5B5B5B,0x5B5B5B5B5B5B5B5B
+
+$L$k_opt:
+ DQ 0xFF9F4929D6B66000,0xF7974121DEBE6808
+ DQ 0x01EDBD5150BCEC00,0xE10D5DB1B05C0CE0
+
+$L$k_deskew:
+ DQ 0x07E4A34047A4E300,0x1DFEB95A5DBEF91A
+ DQ 0x5F36B5DC83EA6900,0x2841C2ABF49D1E77
+
+
+
+
+
+$L$k_dksd:
+ DQ 0xFEB91A5DA3E44700,0x0740E3A45A1DBEF9
+ DQ 0x41C277F4B5368300,0x5FDC69EAAB289D1E
+$L$k_dksb:
+ DQ 0x9A4FCA1F8550D500,0x03D653861CC94C99
+ DQ 0x115BEDA7B6FC4A00,0xD993256F7E3482C8
+$L$k_dkse:
+ DQ 0xD5031CCA1FC9D600,0x53859A4C994F5086
+ DQ 0xA23196054FDC7BE8,0xCD5EF96A20B31487
+$L$k_dks9:
+ DQ 0xB6116FC87ED9A700,0x4AED933482255BFC
+ DQ 0x4576516227143300,0x8BB89FACE9DAFDCE
+
+
+
+
+
+$L$k_dipt:
+ DQ 0x0F505B040B545F00,0x154A411E114E451A
+ DQ 0x86E383E660056500,0x12771772F491F194
+
+$L$k_dsb9:
+ DQ 0x851C03539A86D600,0xCAD51F504F994CC9
+ DQ 0xC03B1789ECD74900,0x725E2C9EB2FBA565
+$L$k_dsbd:
+ DQ 0x7D57CCDFE6B1A200,0xF56E9B13882A4439
+ DQ 0x3CE2FAF724C6CB00,0x2931180D15DEEFD3
+$L$k_dsbb:
+ DQ 0xD022649296B44200,0x602646F6B0F2D404
+ DQ 0xC19498A6CD596700,0xF3FF0C3E3255AA6B
+$L$k_dsbe:
+ DQ 0x46F2929626D4D000,0x2242600464B4F6B0
+ DQ 0x0C55A6CDFFAAC100,0x9467F36B98593E32
+$L$k_dsbo:
+ DQ 0x1387EA537EF94000,0xC7AA6DB9D4943E2D
+ DQ 0x12D7560F93441D00,0xCA4B8159D8C58E9C
+
+
+$L$rev_ctr:
+ DQ 0x0706050403020100,0x0c0d0e0f0b0a0908
+
+
+$L$ctr_add_one:
+ DQ 0x0000000000000000,0x0000000100000000
+$L$ctr_add_two:
+ DQ 0x0000000000000000,0x0000000200000000
+
+ DB 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
+ DB 111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54
+ DB 52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97
+ DB 109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32
+ DB 85,110,105,118,101,114,115,105,116,121,41,0
+ALIGN 64
+
+section .text
+
+EXTERN __imp_RtlVirtualUnwind
+
+ALIGN 16
+se_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ mov rsi,QWORD[8+r9]
+ mov r11,QWORD[56+r9]
+
+ mov r10d,DWORD[r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$in_prologue
+
+ mov rax,QWORD[152+r8]
+
+ mov r10d,DWORD[4+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$in_prologue
+
+ lea rsi,[16+rax]
+ lea rdi,[512+r8]
+ mov ecx,20
+ DD 0xa548f3fc
+ lea rax,[184+rax]
+
+$L$in_prologue:
+ mov rdi,QWORD[8+rax]
+ mov rsi,QWORD[16+rax]
+ mov QWORD[152+r8],rax
+ mov QWORD[168+r8],rsi
+ mov QWORD[176+r8],rdi
+
+ mov rdi,QWORD[40+r9]
+ mov rsi,r8
+ mov ecx,154
+ DD 0xa548f3fc
+
+ mov rsi,r9
+ xor rcx,rcx
+ mov rdx,QWORD[8+rsi]
+ mov r8,QWORD[rsi]
+ mov r9,QWORD[16+rsi]
+ mov r10,QWORD[40+rsi]
+ lea r11,[56+rsi]
+ lea r12,[24+rsi]
+ mov QWORD[32+rsp],r10
+ mov QWORD[40+rsp],r11
+ mov QWORD[48+rsp],r12
+ mov QWORD[56+rsp],rcx
+ call QWORD[__imp_RtlVirtualUnwind]
+
+ mov eax,1
+ add rsp,64
+ popfq
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ ret
+
+
+section .pdata rdata align=4
+ALIGN 4
+ DD $L$SEH_begin_vpaes_set_encrypt_key wrt ..imagebase
+ DD $L$SEH_end_vpaes_set_encrypt_key wrt ..imagebase
+ DD $L$SEH_info_vpaes_set_encrypt_key wrt ..imagebase
+
+ DD $L$SEH_begin_vpaes_set_decrypt_key wrt ..imagebase
+ DD $L$SEH_end_vpaes_set_decrypt_key wrt ..imagebase
+ DD $L$SEH_info_vpaes_set_decrypt_key wrt ..imagebase
+
+ DD $L$SEH_begin_vpaes_encrypt wrt ..imagebase
+ DD $L$SEH_end_vpaes_encrypt wrt ..imagebase
+ DD $L$SEH_info_vpaes_encrypt wrt ..imagebase
+
+ DD $L$SEH_begin_vpaes_decrypt wrt ..imagebase
+ DD $L$SEH_end_vpaes_decrypt wrt ..imagebase
+ DD $L$SEH_info_vpaes_decrypt wrt ..imagebase
+
+ DD $L$SEH_begin_vpaes_cbc_encrypt wrt ..imagebase
+ DD $L$SEH_end_vpaes_cbc_encrypt wrt ..imagebase
+ DD $L$SEH_info_vpaes_cbc_encrypt wrt ..imagebase
+
+ DD $L$SEH_begin_vpaes_ctr32_encrypt_blocks wrt ..imagebase
+ DD $L$SEH_end_vpaes_ctr32_encrypt_blocks wrt ..imagebase
+ DD $L$SEH_info_vpaes_ctr32_encrypt_blocks wrt ..imagebase
+
+section .xdata rdata align=8
+ALIGN 8
+$L$SEH_info_vpaes_set_encrypt_key:
+ DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$enc_key_body wrt ..imagebase,$L$enc_key_epilogue wrt ..imagebase
+$L$SEH_info_vpaes_set_decrypt_key:
+ DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$dec_key_body wrt ..imagebase,$L$dec_key_epilogue wrt ..imagebase
+$L$SEH_info_vpaes_encrypt:
+ DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$enc_body wrt ..imagebase,$L$enc_epilogue wrt ..imagebase
+$L$SEH_info_vpaes_decrypt:
+ DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$dec_body wrt ..imagebase,$L$dec_epilogue wrt ..imagebase
+$L$SEH_info_vpaes_cbc_encrypt:
+ DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$cbc_body wrt ..imagebase,$L$cbc_epilogue wrt ..imagebase
+$L$SEH_info_vpaes_ctr32_encrypt_blocks:
+ DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/x86-mont-apple.S b/gen/bcm/x86-mont-apple.S
new file mode 100644
index 0000000..f991f6c
--- /dev/null
+++ b/gen/bcm/x86-mont-apple.S
@@ -0,0 +1,484 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl _bn_mul_mont
+.private_extern _bn_mul_mont
+.align 4
+_bn_mul_mont:
+L_bn_mul_mont_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ xorl %eax,%eax
+ movl 40(%esp),%edi
+ cmpl $4,%edi
+ jl L000just_leave
+ leal 20(%esp),%esi
+ leal 24(%esp),%edx
+ addl $2,%edi
+ negl %edi
+ leal -32(%esp,%edi,4),%ebp
+ negl %edi
+ movl %ebp,%eax
+ subl %edx,%eax
+ andl $2047,%eax
+ subl %eax,%ebp
+ xorl %ebp,%edx
+ andl $2048,%edx
+ xorl $2048,%edx
+ subl %edx,%ebp
+ andl $-64,%ebp
+ movl %esp,%eax
+ subl %ebp,%eax
+ andl $-4096,%eax
+ movl %esp,%edx
+ leal (%ebp,%eax,1),%esp
+ movl (%esp),%eax
+ cmpl %ebp,%esp
+ ja L001page_walk
+ jmp L002page_walk_done
+.align 4,0x90
+L001page_walk:
+ leal -4096(%esp),%esp
+ movl (%esp),%eax
+ cmpl %ebp,%esp
+ ja L001page_walk
+L002page_walk_done:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%ebp
+ movl 16(%esi),%esi
+ movl (%esi),%esi
+ movl %eax,4(%esp)
+ movl %ebx,8(%esp)
+ movl %ecx,12(%esp)
+ movl %ebp,16(%esp)
+ movl %esi,20(%esp)
+ leal -3(%edi),%ebx
+ movl %edx,24(%esp)
+ call L003PIC_me_up
+L003PIC_me_up:
+ popl %eax
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L003PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc L004non_sse2
+ movl $-1,%eax
+ movd %eax,%mm7
+ movl 8(%esp),%esi
+ movl 12(%esp),%edi
+ movl 16(%esp),%ebp
+ xorl %edx,%edx
+ xorl %ecx,%ecx
+ movd (%edi),%mm4
+ movd (%esi),%mm5
+ movd (%ebp),%mm3
+ pmuludq %mm4,%mm5
+ movq %mm5,%mm2
+ movq %mm5,%mm0
+ pand %mm7,%mm0
+ pmuludq 20(%esp),%mm5
+ pmuludq %mm5,%mm3
+ paddq %mm0,%mm3
+ movd 4(%ebp),%mm1
+ movd 4(%esi),%mm0
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ incl %ecx
+.align 4,0x90
+L0051st:
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ movd 4(%ebp,%ecx,4),%mm1
+ paddq %mm0,%mm3
+ movd 4(%esi,%ecx,4),%mm0
+ psrlq $32,%mm2
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm3
+ leal 1(%ecx),%ecx
+ cmpl %ebx,%ecx
+ jl L0051st
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ paddq %mm0,%mm3
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ paddq %mm2,%mm3
+ movq %mm3,32(%esp,%ebx,4)
+ incl %edx
+L006outer:
+ xorl %ecx,%ecx
+ movd (%edi,%edx,4),%mm4
+ movd (%esi),%mm5
+ movd 32(%esp),%mm6
+ movd (%ebp),%mm3
+ pmuludq %mm4,%mm5
+ paddq %mm6,%mm5
+ movq %mm5,%mm0
+ movq %mm5,%mm2
+ pand %mm7,%mm0
+ pmuludq 20(%esp),%mm5
+ pmuludq %mm5,%mm3
+ paddq %mm0,%mm3
+ movd 36(%esp),%mm6
+ movd 4(%ebp),%mm1
+ movd 4(%esi),%mm0
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ paddq %mm6,%mm2
+ incl %ecx
+ decl %ebx
+L007inner:
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ movd 36(%esp,%ecx,4),%mm6
+ pand %mm7,%mm0
+ movd 4(%ebp,%ecx,4),%mm1
+ paddq %mm0,%mm3
+ movd 4(%esi,%ecx,4),%mm0
+ psrlq $32,%mm2
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm3
+ paddq %mm6,%mm2
+ decl %ebx
+ leal 1(%ecx),%ecx
+ jnz L007inner
+ movl %ecx,%ebx
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ paddq %mm0,%mm3
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ movd 36(%esp,%ebx,4),%mm6
+ paddq %mm2,%mm3
+ paddq %mm6,%mm3
+ movq %mm3,32(%esp,%ebx,4)
+ leal 1(%edx),%edx
+ cmpl %ebx,%edx
+ jle L006outer
+ emms
+ jmp L008common_tail
+.align 4,0x90
+L004non_sse2:
+ movl 8(%esp),%esi
+ leal 1(%ebx),%ebp
+ movl 12(%esp),%edi
+ xorl %ecx,%ecx
+ movl %esi,%edx
+ andl $1,%ebp
+ subl %edi,%edx
+ leal 4(%edi,%ebx,4),%eax
+ orl %edx,%ebp
+ movl (%edi),%edi
+ jz L009bn_sqr_mont
+ movl %eax,28(%esp)
+ movl (%esi),%eax
+ xorl %edx,%edx
+.align 4,0x90
+L010mull:
+ movl %edx,%ebp
+ mull %edi
+ addl %eax,%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ movl (%esi,%ecx,4),%eax
+ cmpl %ebx,%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ jl L010mull
+ movl %edx,%ebp
+ mull %edi
+ movl 20(%esp),%edi
+ addl %ebp,%eax
+ movl 16(%esp),%esi
+ adcl $0,%edx
+ imull 32(%esp),%edi
+ movl %eax,32(%esp,%ebx,4)
+ xorl %ecx,%ecx
+ movl %edx,36(%esp,%ebx,4)
+ movl %ecx,40(%esp,%ebx,4)
+ movl (%esi),%eax
+ mull %edi
+ addl 32(%esp),%eax
+ movl 4(%esi),%eax
+ adcl $0,%edx
+ incl %ecx
+ jmp L0112ndmadd
+.align 4,0x90
+L0121stmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ jl L0121stmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%eax
+ movl 20(%esp),%edi
+ adcl $0,%edx
+ movl 16(%esp),%esi
+ addl %eax,%ebp
+ adcl $0,%edx
+ imull 32(%esp),%edi
+ xorl %ecx,%ecx
+ addl 36(%esp,%ebx,4),%edx
+ movl %ebp,32(%esp,%ebx,4)
+ adcl $0,%ecx
+ movl (%esi),%eax
+ movl %edx,36(%esp,%ebx,4)
+ movl %ecx,40(%esp,%ebx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ movl 4(%esi),%eax
+ adcl $0,%edx
+ movl $1,%ecx
+.align 4,0x90
+L0112ndmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,24(%esp,%ecx,4)
+ jl L0112ndmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ebx,4)
+ xorl %eax,%eax
+ movl 12(%esp),%ecx
+ addl 36(%esp,%ebx,4),%edx
+ adcl 40(%esp,%ebx,4),%eax
+ leal 4(%ecx),%ecx
+ movl %edx,32(%esp,%ebx,4)
+ cmpl 28(%esp),%ecx
+ movl %eax,36(%esp,%ebx,4)
+ je L008common_tail
+ movl (%ecx),%edi
+ movl 8(%esp),%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%ecx
+ xorl %edx,%edx
+ movl (%esi),%eax
+ jmp L0121stmadd
+.align 4,0x90
+L009bn_sqr_mont:
+ movl %ebx,(%esp)
+ movl %ecx,12(%esp)
+ movl %edi,%eax
+ mull %edi
+ movl %eax,32(%esp)
+ movl %edx,%ebx
+ shrl $1,%edx
+ andl $1,%ebx
+ incl %ecx
+.align 4,0x90
+L013sqr:
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ leal (%ebx,%eax,2),%ebp
+ shrl $31,%eax
+ cmpl (%esp),%ecx
+ movl %eax,%ebx
+ movl %ebp,28(%esp,%ecx,4)
+ jl L013sqr
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ movl 20(%esp),%edi
+ adcl $0,%edx
+ movl 16(%esp),%esi
+ leal (%ebx,%eax,2),%ebp
+ imull 32(%esp),%edi
+ shrl $31,%eax
+ movl %ebp,32(%esp,%ecx,4)
+ leal (%eax,%edx,2),%ebp
+ movl (%esi),%eax
+ shrl $31,%edx
+ movl %ebp,36(%esp,%ecx,4)
+ movl %edx,40(%esp,%ecx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ movl %ecx,%ebx
+ adcl $0,%edx
+ movl 4(%esi),%eax
+ movl $1,%ecx
+.align 4,0x90
+L0143rdmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl 4(%esi,%ecx,4),%eax
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ecx,4)
+ movl %edx,%ebp
+ mull %edi
+ addl 36(%esp,%ecx,4),%ebp
+ leal 2(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,24(%esp,%ecx,4)
+ jl L0143rdmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ebx,4)
+ movl 12(%esp),%ecx
+ xorl %eax,%eax
+ movl 8(%esp),%esi
+ addl 36(%esp,%ebx,4),%edx
+ adcl 40(%esp,%ebx,4),%eax
+ movl %edx,32(%esp,%ebx,4)
+ cmpl %ebx,%ecx
+ movl %eax,36(%esp,%ebx,4)
+ je L008common_tail
+ movl 4(%esi,%ecx,4),%edi
+ leal 1(%ecx),%ecx
+ movl %edi,%eax
+ movl %ecx,12(%esp)
+ mull %edi
+ addl 32(%esp,%ecx,4),%eax
+ adcl $0,%edx
+ movl %eax,32(%esp,%ecx,4)
+ xorl %ebp,%ebp
+ cmpl %ebx,%ecx
+ leal 1(%ecx),%ecx
+ je L015sqrlast
+ movl %edx,%ebx
+ shrl $1,%edx
+ andl $1,%ebx
+.align 4,0x90
+L016sqradd:
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ leal (%eax,%eax,1),%ebp
+ adcl $0,%edx
+ shrl $31,%eax
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%eax
+ addl %ebx,%ebp
+ adcl $0,%eax
+ cmpl (%esp),%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ movl %eax,%ebx
+ jle L016sqradd
+ movl %edx,%ebp
+ addl %edx,%edx
+ shrl $31,%ebp
+ addl %ebx,%edx
+ adcl $0,%ebp
+L015sqrlast:
+ movl 20(%esp),%edi
+ movl 16(%esp),%esi
+ imull 32(%esp),%edi
+ addl 32(%esp,%ecx,4),%edx
+ movl (%esi),%eax
+ adcl $0,%ebp
+ movl %edx,32(%esp,%ecx,4)
+ movl %ebp,36(%esp,%ecx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ leal -1(%ecx),%ebx
+ adcl $0,%edx
+ movl $1,%ecx
+ movl 4(%esi),%eax
+ jmp L0143rdmadd
+.align 4,0x90
+L008common_tail:
+ movl 16(%esp),%ebp
+ movl 4(%esp),%edi
+ leal 32(%esp),%esi
+ movl (%esi),%eax
+ movl %ebx,%ecx
+ xorl %edx,%edx
+.align 4,0x90
+L017sub:
+ sbbl (%ebp,%edx,4),%eax
+ movl %eax,(%edi,%edx,4)
+ decl %ecx
+ movl 4(%esi,%edx,4),%eax
+ leal 1(%edx),%edx
+ jge L017sub
+ sbbl $0,%eax
+ movl $-1,%edx
+ xorl %eax,%edx
+ jmp L018copy
+.align 4,0x90
+L018copy:
+ movl 32(%esp,%ebx,4),%esi
+ movl (%edi,%ebx,4),%ebp
+ movl %ecx,32(%esp,%ebx,4)
+ andl %eax,%esi
+ andl %edx,%ebp
+ orl %esi,%ebp
+ movl %ebp,(%edi,%ebx,4)
+ decl %ebx
+ jge L018copy
+ movl 24(%esp),%esp
+ movl $1,%eax
+L000just_leave:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
+.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
+.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+.byte 111,114,103,62,0
+.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:
+.indirect_symbol _OPENSSL_ia32cap_P
+.long 0
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/bcm/x86-mont-linux.S b/gen/bcm/x86-mont-linux.S
new file mode 100644
index 0000000..e6b4ef5
--- /dev/null
+++ b/gen/bcm/x86-mont-linux.S
@@ -0,0 +1,482 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl bn_mul_mont
+.hidden bn_mul_mont
+.type bn_mul_mont,@function
+.align 16
+bn_mul_mont:
+.L_bn_mul_mont_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ xorl %eax,%eax
+ movl 40(%esp),%edi
+ cmpl $4,%edi
+ jl .L000just_leave
+ leal 20(%esp),%esi
+ leal 24(%esp),%edx
+ addl $2,%edi
+ negl %edi
+ leal -32(%esp,%edi,4),%ebp
+ negl %edi
+ movl %ebp,%eax
+ subl %edx,%eax
+ andl $2047,%eax
+ subl %eax,%ebp
+ xorl %ebp,%edx
+ andl $2048,%edx
+ xorl $2048,%edx
+ subl %edx,%ebp
+ andl $-64,%ebp
+ movl %esp,%eax
+ subl %ebp,%eax
+ andl $-4096,%eax
+ movl %esp,%edx
+ leal (%ebp,%eax,1),%esp
+ movl (%esp),%eax
+ cmpl %ebp,%esp
+ ja .L001page_walk
+ jmp .L002page_walk_done
+.align 16
+.L001page_walk:
+ leal -4096(%esp),%esp
+ movl (%esp),%eax
+ cmpl %ebp,%esp
+ ja .L001page_walk
+.L002page_walk_done:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%ebp
+ movl 16(%esi),%esi
+ movl (%esi),%esi
+ movl %eax,4(%esp)
+ movl %ebx,8(%esp)
+ movl %ecx,12(%esp)
+ movl %ebp,16(%esp)
+ movl %esi,20(%esp)
+ leal -3(%edi),%ebx
+ movl %edx,24(%esp)
+ call .L003PIC_me_up
+.L003PIC_me_up:
+ popl %eax
+ leal OPENSSL_ia32cap_P-.L003PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc .L004non_sse2
+ movl $-1,%eax
+ movd %eax,%mm7
+ movl 8(%esp),%esi
+ movl 12(%esp),%edi
+ movl 16(%esp),%ebp
+ xorl %edx,%edx
+ xorl %ecx,%ecx
+ movd (%edi),%mm4
+ movd (%esi),%mm5
+ movd (%ebp),%mm3
+ pmuludq %mm4,%mm5
+ movq %mm5,%mm2
+ movq %mm5,%mm0
+ pand %mm7,%mm0
+ pmuludq 20(%esp),%mm5
+ pmuludq %mm5,%mm3
+ paddq %mm0,%mm3
+ movd 4(%ebp),%mm1
+ movd 4(%esi),%mm0
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ incl %ecx
+.align 16
+.L0051st:
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ movd 4(%ebp,%ecx,4),%mm1
+ paddq %mm0,%mm3
+ movd 4(%esi,%ecx,4),%mm0
+ psrlq $32,%mm2
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm3
+ leal 1(%ecx),%ecx
+ cmpl %ebx,%ecx
+ jl .L0051st
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ paddq %mm0,%mm3
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ paddq %mm2,%mm3
+ movq %mm3,32(%esp,%ebx,4)
+ incl %edx
+.L006outer:
+ xorl %ecx,%ecx
+ movd (%edi,%edx,4),%mm4
+ movd (%esi),%mm5
+ movd 32(%esp),%mm6
+ movd (%ebp),%mm3
+ pmuludq %mm4,%mm5
+ paddq %mm6,%mm5
+ movq %mm5,%mm0
+ movq %mm5,%mm2
+ pand %mm7,%mm0
+ pmuludq 20(%esp),%mm5
+ pmuludq %mm5,%mm3
+ paddq %mm0,%mm3
+ movd 36(%esp),%mm6
+ movd 4(%ebp),%mm1
+ movd 4(%esi),%mm0
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ paddq %mm6,%mm2
+ incl %ecx
+ decl %ebx
+.L007inner:
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ movd 36(%esp,%ecx,4),%mm6
+ pand %mm7,%mm0
+ movd 4(%ebp,%ecx,4),%mm1
+ paddq %mm0,%mm3
+ movd 4(%esi,%ecx,4),%mm0
+ psrlq $32,%mm2
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm3
+ paddq %mm6,%mm2
+ decl %ebx
+ leal 1(%ecx),%ecx
+ jnz .L007inner
+ movl %ecx,%ebx
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ paddq %mm0,%mm3
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ movd 36(%esp,%ebx,4),%mm6
+ paddq %mm2,%mm3
+ paddq %mm6,%mm3
+ movq %mm3,32(%esp,%ebx,4)
+ leal 1(%edx),%edx
+ cmpl %ebx,%edx
+ jle .L006outer
+ emms
+ jmp .L008common_tail
+.align 16
+.L004non_sse2:
+ movl 8(%esp),%esi
+ leal 1(%ebx),%ebp
+ movl 12(%esp),%edi
+ xorl %ecx,%ecx
+ movl %esi,%edx
+ andl $1,%ebp
+ subl %edi,%edx
+ leal 4(%edi,%ebx,4),%eax
+ orl %edx,%ebp
+ movl (%edi),%edi
+ jz .L009bn_sqr_mont
+ movl %eax,28(%esp)
+ movl (%esi),%eax
+ xorl %edx,%edx
+.align 16
+.L010mull:
+ movl %edx,%ebp
+ mull %edi
+ addl %eax,%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ movl (%esi,%ecx,4),%eax
+ cmpl %ebx,%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ jl .L010mull
+ movl %edx,%ebp
+ mull %edi
+ movl 20(%esp),%edi
+ addl %ebp,%eax
+ movl 16(%esp),%esi
+ adcl $0,%edx
+ imull 32(%esp),%edi
+ movl %eax,32(%esp,%ebx,4)
+ xorl %ecx,%ecx
+ movl %edx,36(%esp,%ebx,4)
+ movl %ecx,40(%esp,%ebx,4)
+ movl (%esi),%eax
+ mull %edi
+ addl 32(%esp),%eax
+ movl 4(%esi),%eax
+ adcl $0,%edx
+ incl %ecx
+ jmp .L0112ndmadd
+.align 16
+.L0121stmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ jl .L0121stmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%eax
+ movl 20(%esp),%edi
+ adcl $0,%edx
+ movl 16(%esp),%esi
+ addl %eax,%ebp
+ adcl $0,%edx
+ imull 32(%esp),%edi
+ xorl %ecx,%ecx
+ addl 36(%esp,%ebx,4),%edx
+ movl %ebp,32(%esp,%ebx,4)
+ adcl $0,%ecx
+ movl (%esi),%eax
+ movl %edx,36(%esp,%ebx,4)
+ movl %ecx,40(%esp,%ebx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ movl 4(%esi),%eax
+ adcl $0,%edx
+ movl $1,%ecx
+.align 16
+.L0112ndmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,24(%esp,%ecx,4)
+ jl .L0112ndmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ebx,4)
+ xorl %eax,%eax
+ movl 12(%esp),%ecx
+ addl 36(%esp,%ebx,4),%edx
+ adcl 40(%esp,%ebx,4),%eax
+ leal 4(%ecx),%ecx
+ movl %edx,32(%esp,%ebx,4)
+ cmpl 28(%esp),%ecx
+ movl %eax,36(%esp,%ebx,4)
+ je .L008common_tail
+ movl (%ecx),%edi
+ movl 8(%esp),%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%ecx
+ xorl %edx,%edx
+ movl (%esi),%eax
+ jmp .L0121stmadd
+.align 16
+.L009bn_sqr_mont:
+ movl %ebx,(%esp)
+ movl %ecx,12(%esp)
+ movl %edi,%eax
+ mull %edi
+ movl %eax,32(%esp)
+ movl %edx,%ebx
+ shrl $1,%edx
+ andl $1,%ebx
+ incl %ecx
+.align 16
+.L013sqr:
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ leal (%ebx,%eax,2),%ebp
+ shrl $31,%eax
+ cmpl (%esp),%ecx
+ movl %eax,%ebx
+ movl %ebp,28(%esp,%ecx,4)
+ jl .L013sqr
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ movl 20(%esp),%edi
+ adcl $0,%edx
+ movl 16(%esp),%esi
+ leal (%ebx,%eax,2),%ebp
+ imull 32(%esp),%edi
+ shrl $31,%eax
+ movl %ebp,32(%esp,%ecx,4)
+ leal (%eax,%edx,2),%ebp
+ movl (%esi),%eax
+ shrl $31,%edx
+ movl %ebp,36(%esp,%ecx,4)
+ movl %edx,40(%esp,%ecx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ movl %ecx,%ebx
+ adcl $0,%edx
+ movl 4(%esi),%eax
+ movl $1,%ecx
+.align 16
+.L0143rdmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl 4(%esi,%ecx,4),%eax
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ecx,4)
+ movl %edx,%ebp
+ mull %edi
+ addl 36(%esp,%ecx,4),%ebp
+ leal 2(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,24(%esp,%ecx,4)
+ jl .L0143rdmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ebx,4)
+ movl 12(%esp),%ecx
+ xorl %eax,%eax
+ movl 8(%esp),%esi
+ addl 36(%esp,%ebx,4),%edx
+ adcl 40(%esp,%ebx,4),%eax
+ movl %edx,32(%esp,%ebx,4)
+ cmpl %ebx,%ecx
+ movl %eax,36(%esp,%ebx,4)
+ je .L008common_tail
+ movl 4(%esi,%ecx,4),%edi
+ leal 1(%ecx),%ecx
+ movl %edi,%eax
+ movl %ecx,12(%esp)
+ mull %edi
+ addl 32(%esp,%ecx,4),%eax
+ adcl $0,%edx
+ movl %eax,32(%esp,%ecx,4)
+ xorl %ebp,%ebp
+ cmpl %ebx,%ecx
+ leal 1(%ecx),%ecx
+ je .L015sqrlast
+ movl %edx,%ebx
+ shrl $1,%edx
+ andl $1,%ebx
+.align 16
+.L016sqradd:
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ leal (%eax,%eax,1),%ebp
+ adcl $0,%edx
+ shrl $31,%eax
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%eax
+ addl %ebx,%ebp
+ adcl $0,%eax
+ cmpl (%esp),%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ movl %eax,%ebx
+ jle .L016sqradd
+ movl %edx,%ebp
+ addl %edx,%edx
+ shrl $31,%ebp
+ addl %ebx,%edx
+ adcl $0,%ebp
+.L015sqrlast:
+ movl 20(%esp),%edi
+ movl 16(%esp),%esi
+ imull 32(%esp),%edi
+ addl 32(%esp,%ecx,4),%edx
+ movl (%esi),%eax
+ adcl $0,%ebp
+ movl %edx,32(%esp,%ecx,4)
+ movl %ebp,36(%esp,%ecx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ leal -1(%ecx),%ebx
+ adcl $0,%edx
+ movl $1,%ecx
+ movl 4(%esi),%eax
+ jmp .L0143rdmadd
+.align 16
+.L008common_tail:
+ movl 16(%esp),%ebp
+ movl 4(%esp),%edi
+ leal 32(%esp),%esi
+ movl (%esi),%eax
+ movl %ebx,%ecx
+ xorl %edx,%edx
+.align 16
+.L017sub:
+ sbbl (%ebp,%edx,4),%eax
+ movl %eax,(%edi,%edx,4)
+ decl %ecx
+ movl 4(%esi,%edx,4),%eax
+ leal 1(%edx),%edx
+ jge .L017sub
+ sbbl $0,%eax
+ movl $-1,%edx
+ xorl %eax,%edx
+ jmp .L018copy
+.align 16
+.L018copy:
+ movl 32(%esp,%ebx,4),%esi
+ movl (%edi,%ebx,4),%ebp
+ movl %ecx,32(%esp,%ebx,4)
+ andl %eax,%esi
+ andl %edx,%ebp
+ orl %esi,%ebp
+ movl %ebp,(%edi,%ebx,4)
+ decl %ebx
+ jge .L018copy
+ movl 24(%esp),%esp
+ movl $1,%eax
+.L000just_leave:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_mul_mont,.-.L_bn_mul_mont_begin
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
+.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
+.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+.byte 111,114,103,62,0
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/bcm/x86-mont-win.asm b/gen/bcm/x86-mont-win.asm
new file mode 100644
index 0000000..cd77529
--- /dev/null
+++ b/gen/bcm/x86-mont-win.asm
@@ -0,0 +1,490 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section code use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section .text code align=64
+%else
+section .text code
+%endif
+;extern _OPENSSL_ia32cap_P
+global _bn_mul_mont
+align 16
+_bn_mul_mont:
+L$_bn_mul_mont_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ xor eax,eax
+ mov edi,DWORD [40+esp]
+ cmp edi,4
+ jl NEAR L$000just_leave
+ lea esi,[20+esp]
+ lea edx,[24+esp]
+ add edi,2
+ neg edi
+ lea ebp,[edi*4+esp-32]
+ neg edi
+ mov eax,ebp
+ sub eax,edx
+ and eax,2047
+ sub ebp,eax
+ xor edx,ebp
+ and edx,2048
+ xor edx,2048
+ sub ebp,edx
+ and ebp,-64
+ mov eax,esp
+ sub eax,ebp
+ and eax,-4096
+ mov edx,esp
+ lea esp,[eax*1+ebp]
+ mov eax,DWORD [esp]
+ cmp esp,ebp
+ ja NEAR L$001page_walk
+ jmp NEAR L$002page_walk_done
+align 16
+L$001page_walk:
+ lea esp,[esp-4096]
+ mov eax,DWORD [esp]
+ cmp esp,ebp
+ ja NEAR L$001page_walk
+L$002page_walk_done:
+ mov eax,DWORD [esi]
+ mov ebx,DWORD [4+esi]
+ mov ecx,DWORD [8+esi]
+ mov ebp,DWORD [12+esi]
+ mov esi,DWORD [16+esi]
+ mov esi,DWORD [esi]
+ mov DWORD [4+esp],eax
+ mov DWORD [8+esp],ebx
+ mov DWORD [12+esp],ecx
+ mov DWORD [16+esp],ebp
+ mov DWORD [20+esp],esi
+ lea ebx,[edi-3]
+ mov DWORD [24+esp],edx
+ lea eax,[_OPENSSL_ia32cap_P]
+ bt DWORD [eax],26
+ jnc NEAR L$003non_sse2
+ mov eax,-1
+ movd mm7,eax
+ mov esi,DWORD [8+esp]
+ mov edi,DWORD [12+esp]
+ mov ebp,DWORD [16+esp]
+ xor edx,edx
+ xor ecx,ecx
+ movd mm4,DWORD [edi]
+ movd mm5,DWORD [esi]
+ movd mm3,DWORD [ebp]
+ pmuludq mm5,mm4
+ movq mm2,mm5
+ movq mm0,mm5
+ pand mm0,mm7
+ pmuludq mm5,[20+esp]
+ pmuludq mm3,mm5
+ paddq mm3,mm0
+ movd mm1,DWORD [4+ebp]
+ movd mm0,DWORD [4+esi]
+ psrlq mm2,32
+ psrlq mm3,32
+ inc ecx
+align 16
+L$0041st:
+ pmuludq mm0,mm4
+ pmuludq mm1,mm5
+ paddq mm2,mm0
+ paddq mm3,mm1
+ movq mm0,mm2
+ pand mm0,mm7
+ movd mm1,DWORD [4+ecx*4+ebp]
+ paddq mm3,mm0
+ movd mm0,DWORD [4+ecx*4+esi]
+ psrlq mm2,32
+ movd DWORD [28+ecx*4+esp],mm3
+ psrlq mm3,32
+ lea ecx,[1+ecx]
+ cmp ecx,ebx
+ jl NEAR L$0041st
+ pmuludq mm0,mm4
+ pmuludq mm1,mm5
+ paddq mm2,mm0
+ paddq mm3,mm1
+ movq mm0,mm2
+ pand mm0,mm7
+ paddq mm3,mm0
+ movd DWORD [28+ecx*4+esp],mm3
+ psrlq mm2,32
+ psrlq mm3,32
+ paddq mm3,mm2
+ movq [32+ebx*4+esp],mm3
+ inc edx
+L$005outer:
+ xor ecx,ecx
+ movd mm4,DWORD [edx*4+edi]
+ movd mm5,DWORD [esi]
+ movd mm6,DWORD [32+esp]
+ movd mm3,DWORD [ebp]
+ pmuludq mm5,mm4
+ paddq mm5,mm6
+ movq mm0,mm5
+ movq mm2,mm5
+ pand mm0,mm7
+ pmuludq mm5,[20+esp]
+ pmuludq mm3,mm5
+ paddq mm3,mm0
+ movd mm6,DWORD [36+esp]
+ movd mm1,DWORD [4+ebp]
+ movd mm0,DWORD [4+esi]
+ psrlq mm2,32
+ psrlq mm3,32
+ paddq mm2,mm6
+ inc ecx
+ dec ebx
+L$006inner:
+ pmuludq mm0,mm4
+ pmuludq mm1,mm5
+ paddq mm2,mm0
+ paddq mm3,mm1
+ movq mm0,mm2
+ movd mm6,DWORD [36+ecx*4+esp]
+ pand mm0,mm7
+ movd mm1,DWORD [4+ecx*4+ebp]
+ paddq mm3,mm0
+ movd mm0,DWORD [4+ecx*4+esi]
+ psrlq mm2,32
+ movd DWORD [28+ecx*4+esp],mm3
+ psrlq mm3,32
+ paddq mm2,mm6
+ dec ebx
+ lea ecx,[1+ecx]
+ jnz NEAR L$006inner
+ mov ebx,ecx
+ pmuludq mm0,mm4
+ pmuludq mm1,mm5
+ paddq mm2,mm0
+ paddq mm3,mm1
+ movq mm0,mm2
+ pand mm0,mm7
+ paddq mm3,mm0
+ movd DWORD [28+ecx*4+esp],mm3
+ psrlq mm2,32
+ psrlq mm3,32
+ movd mm6,DWORD [36+ebx*4+esp]
+ paddq mm3,mm2
+ paddq mm3,mm6
+ movq [32+ebx*4+esp],mm3
+ lea edx,[1+edx]
+ cmp edx,ebx
+ jle NEAR L$005outer
+ emms
+ jmp NEAR L$007common_tail
+align 16
+L$003non_sse2:
+ mov esi,DWORD [8+esp]
+ lea ebp,[1+ebx]
+ mov edi,DWORD [12+esp]
+ xor ecx,ecx
+ mov edx,esi
+ and ebp,1
+ sub edx,edi
+ lea eax,[4+ebx*4+edi]
+ or ebp,edx
+ mov edi,DWORD [edi]
+ jz NEAR L$008bn_sqr_mont
+ mov DWORD [28+esp],eax
+ mov eax,DWORD [esi]
+ xor edx,edx
+align 16
+L$009mull:
+ mov ebp,edx
+ mul edi
+ add ebp,eax
+ lea ecx,[1+ecx]
+ adc edx,0
+ mov eax,DWORD [ecx*4+esi]
+ cmp ecx,ebx
+ mov DWORD [28+ecx*4+esp],ebp
+ jl NEAR L$009mull
+ mov ebp,edx
+ mul edi
+ mov edi,DWORD [20+esp]
+ add eax,ebp
+ mov esi,DWORD [16+esp]
+ adc edx,0
+ imul edi,DWORD [32+esp]
+ mov DWORD [32+ebx*4+esp],eax
+ xor ecx,ecx
+ mov DWORD [36+ebx*4+esp],edx
+ mov DWORD [40+ebx*4+esp],ecx
+ mov eax,DWORD [esi]
+ mul edi
+ add eax,DWORD [32+esp]
+ mov eax,DWORD [4+esi]
+ adc edx,0
+ inc ecx
+ jmp NEAR L$0102ndmadd
+align 16
+L$0111stmadd:
+ mov ebp,edx
+ mul edi
+ add ebp,DWORD [32+ecx*4+esp]
+ lea ecx,[1+ecx]
+ adc edx,0
+ add ebp,eax
+ mov eax,DWORD [ecx*4+esi]
+ adc edx,0
+ cmp ecx,ebx
+ mov DWORD [28+ecx*4+esp],ebp
+ jl NEAR L$0111stmadd
+ mov ebp,edx
+ mul edi
+ add eax,DWORD [32+ebx*4+esp]
+ mov edi,DWORD [20+esp]
+ adc edx,0
+ mov esi,DWORD [16+esp]
+ add ebp,eax
+ adc edx,0
+ imul edi,DWORD [32+esp]
+ xor ecx,ecx
+ add edx,DWORD [36+ebx*4+esp]
+ mov DWORD [32+ebx*4+esp],ebp
+ adc ecx,0
+ mov eax,DWORD [esi]
+ mov DWORD [36+ebx*4+esp],edx
+ mov DWORD [40+ebx*4+esp],ecx
+ mul edi
+ add eax,DWORD [32+esp]
+ mov eax,DWORD [4+esi]
+ adc edx,0
+ mov ecx,1
+align 16
+L$0102ndmadd:
+ mov ebp,edx
+ mul edi
+ add ebp,DWORD [32+ecx*4+esp]
+ lea ecx,[1+ecx]
+ adc edx,0
+ add ebp,eax
+ mov eax,DWORD [ecx*4+esi]
+ adc edx,0
+ cmp ecx,ebx
+ mov DWORD [24+ecx*4+esp],ebp
+ jl NEAR L$0102ndmadd
+ mov ebp,edx
+ mul edi
+ add ebp,DWORD [32+ebx*4+esp]
+ adc edx,0
+ add ebp,eax
+ adc edx,0
+ mov DWORD [28+ebx*4+esp],ebp
+ xor eax,eax
+ mov ecx,DWORD [12+esp]
+ add edx,DWORD [36+ebx*4+esp]
+ adc eax,DWORD [40+ebx*4+esp]
+ lea ecx,[4+ecx]
+ mov DWORD [32+ebx*4+esp],edx
+ cmp ecx,DWORD [28+esp]
+ mov DWORD [36+ebx*4+esp],eax
+ je NEAR L$007common_tail
+ mov edi,DWORD [ecx]
+ mov esi,DWORD [8+esp]
+ mov DWORD [12+esp],ecx
+ xor ecx,ecx
+ xor edx,edx
+ mov eax,DWORD [esi]
+ jmp NEAR L$0111stmadd
+align 16
+L$008bn_sqr_mont:
+ mov DWORD [esp],ebx
+ mov DWORD [12+esp],ecx
+ mov eax,edi
+ mul edi
+ mov DWORD [32+esp],eax
+ mov ebx,edx
+ shr edx,1
+ and ebx,1
+ inc ecx
+align 16
+L$012sqr:
+ mov eax,DWORD [ecx*4+esi]
+ mov ebp,edx
+ mul edi
+ add eax,ebp
+ lea ecx,[1+ecx]
+ adc edx,0
+ lea ebp,[eax*2+ebx]
+ shr eax,31
+ cmp ecx,DWORD [esp]
+ mov ebx,eax
+ mov DWORD [28+ecx*4+esp],ebp
+ jl NEAR L$012sqr
+ mov eax,DWORD [ecx*4+esi]
+ mov ebp,edx
+ mul edi
+ add eax,ebp
+ mov edi,DWORD [20+esp]
+ adc edx,0
+ mov esi,DWORD [16+esp]
+ lea ebp,[eax*2+ebx]
+ imul edi,DWORD [32+esp]
+ shr eax,31
+ mov DWORD [32+ecx*4+esp],ebp
+ lea ebp,[edx*2+eax]
+ mov eax,DWORD [esi]
+ shr edx,31
+ mov DWORD [36+ecx*4+esp],ebp
+ mov DWORD [40+ecx*4+esp],edx
+ mul edi
+ add eax,DWORD [32+esp]
+ mov ebx,ecx
+ adc edx,0
+ mov eax,DWORD [4+esi]
+ mov ecx,1
+align 16
+L$0133rdmadd:
+ mov ebp,edx
+ mul edi
+ add ebp,DWORD [32+ecx*4+esp]
+ adc edx,0
+ add ebp,eax
+ mov eax,DWORD [4+ecx*4+esi]
+ adc edx,0
+ mov DWORD [28+ecx*4+esp],ebp
+ mov ebp,edx
+ mul edi
+ add ebp,DWORD [36+ecx*4+esp]
+ lea ecx,[2+ecx]
+ adc edx,0
+ add ebp,eax
+ mov eax,DWORD [ecx*4+esi]
+ adc edx,0
+ cmp ecx,ebx
+ mov DWORD [24+ecx*4+esp],ebp
+ jl NEAR L$0133rdmadd
+ mov ebp,edx
+ mul edi
+ add ebp,DWORD [32+ebx*4+esp]
+ adc edx,0
+ add ebp,eax
+ adc edx,0
+ mov DWORD [28+ebx*4+esp],ebp
+ mov ecx,DWORD [12+esp]
+ xor eax,eax
+ mov esi,DWORD [8+esp]
+ add edx,DWORD [36+ebx*4+esp]
+ adc eax,DWORD [40+ebx*4+esp]
+ mov DWORD [32+ebx*4+esp],edx
+ cmp ecx,ebx
+ mov DWORD [36+ebx*4+esp],eax
+ je NEAR L$007common_tail
+ mov edi,DWORD [4+ecx*4+esi]
+ lea ecx,[1+ecx]
+ mov eax,edi
+ mov DWORD [12+esp],ecx
+ mul edi
+ add eax,DWORD [32+ecx*4+esp]
+ adc edx,0
+ mov DWORD [32+ecx*4+esp],eax
+ xor ebp,ebp
+ cmp ecx,ebx
+ lea ecx,[1+ecx]
+ je NEAR L$014sqrlast
+ mov ebx,edx
+ shr edx,1
+ and ebx,1
+align 16
+L$015sqradd:
+ mov eax,DWORD [ecx*4+esi]
+ mov ebp,edx
+ mul edi
+ add eax,ebp
+ lea ebp,[eax*1+eax]
+ adc edx,0
+ shr eax,31
+ add ebp,DWORD [32+ecx*4+esp]
+ lea ecx,[1+ecx]
+ adc eax,0
+ add ebp,ebx
+ adc eax,0
+ cmp ecx,DWORD [esp]
+ mov DWORD [28+ecx*4+esp],ebp
+ mov ebx,eax
+ jle NEAR L$015sqradd
+ mov ebp,edx
+ add edx,edx
+ shr ebp,31
+ add edx,ebx
+ adc ebp,0
+L$014sqrlast:
+ mov edi,DWORD [20+esp]
+ mov esi,DWORD [16+esp]
+ imul edi,DWORD [32+esp]
+ add edx,DWORD [32+ecx*4+esp]
+ mov eax,DWORD [esi]
+ adc ebp,0
+ mov DWORD [32+ecx*4+esp],edx
+ mov DWORD [36+ecx*4+esp],ebp
+ mul edi
+ add eax,DWORD [32+esp]
+ lea ebx,[ecx-1]
+ adc edx,0
+ mov ecx,1
+ mov eax,DWORD [4+esi]
+ jmp NEAR L$0133rdmadd
+align 16
+L$007common_tail:
+ mov ebp,DWORD [16+esp]
+ mov edi,DWORD [4+esp]
+ lea esi,[32+esp]
+ mov eax,DWORD [esi]
+ mov ecx,ebx
+ xor edx,edx
+align 16
+L$016sub:
+ sbb eax,DWORD [edx*4+ebp]
+ mov DWORD [edx*4+edi],eax
+ dec ecx
+ mov eax,DWORD [4+edx*4+esi]
+ lea edx,[1+edx]
+ jge NEAR L$016sub
+ sbb eax,0
+ mov edx,-1
+ xor edx,eax
+ jmp NEAR L$017copy
+align 16
+L$017copy:
+ mov esi,DWORD [32+ebx*4+esp]
+ mov ebp,DWORD [ebx*4+edi]
+ mov DWORD [32+ebx*4+esp],ecx
+ and esi,eax
+ and ebp,edx
+ or ebp,esi
+ mov DWORD [ebx*4+edi],ebp
+ dec ebx
+ jge NEAR L$017copy
+ mov esp,DWORD [24+esp]
+ mov eax,1
+L$000just_leave:
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+db 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
+db 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
+db 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+db 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+db 111,114,103,62,0
+segment .bss
+common _OPENSSL_ia32cap_P 16
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/x86_64-mont-apple.S b/gen/bcm/x86_64-mont-apple.S
new file mode 100644
index 0000000..4bf0c6d
--- /dev/null
+++ b/gen/bcm/x86_64-mont-apple.S
@@ -0,0 +1,1235 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text
+
+.globl _bn_mul_mont_nohw
+.private_extern _bn_mul_mont_nohw
+
+.p2align 4
+_bn_mul_mont_nohw:
+
+_CET_ENDBR
+ movl %r9d,%r9d
+ movq %rsp,%rax
+
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+
+ negq %r9
+ movq %rsp,%r11
+ leaq -16(%rsp,%r9,8),%r10
+ negq %r9
+ andq $-1024,%r10
+
+
+
+
+
+
+
+
+
+ subq %r10,%r11
+ andq $-4096,%r11
+ leaq (%r10,%r11,1),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja L$mul_page_walk
+ jmp L$mul_page_walk_done
+
+.p2align 4
+L$mul_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja L$mul_page_walk
+L$mul_page_walk_done:
+
+ movq %rax,8(%rsp,%r9,8)
+
+L$mul_body:
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp L$1st_enter
+
+.p2align 4
+L$1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+L$1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne L$1st
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp L$outer
+.p2align 4
+L$outer:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp L$inner_enter
+
+.p2align 4
+L$inner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+L$inner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne L$inner
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jb L$outer
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ movq %r9,%r15
+
+.p2align 4
+L$sub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsp,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz L$sub
+
+ sbbq $0,%rax
+ movq $-1,%rbx
+ xorq %rax,%rbx
+ xorq %r14,%r14
+ movq %r9,%r15
+
+L$copy:
+ movq (%rdi,%r14,8),%rcx
+ movq (%rsp,%r14,8),%rdx
+ andq %rbx,%rcx
+ andq %rax,%rdx
+ movq %r9,(%rsp,%r14,8)
+ orq %rcx,%rdx
+ movq %rdx,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz L$copy
+
+ movq 8(%rsp,%r9,8),%rsi
+
+ movq $1,%rax
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
+L$mul_epilogue:
+ ret
+
+
+.globl _bn_mul4x_mont
+.private_extern _bn_mul4x_mont
+
+.p2align 4
+_bn_mul4x_mont:
+
+_CET_ENDBR
+ movl %r9d,%r9d
+ movq %rsp,%rax
+
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+
+ negq %r9
+ movq %rsp,%r11
+ leaq -32(%rsp,%r9,8),%r10
+ negq %r9
+ andq $-1024,%r10
+
+ subq %r10,%r11
+ andq $-4096,%r11
+ leaq (%r10,%r11,1),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja L$mul4x_page_walk
+ jmp L$mul4x_page_walk_done
+
+L$mul4x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja L$mul4x_page_walk
+L$mul4x_page_walk_done:
+
+ movq %rax,8(%rsp,%r9,8)
+
+L$mul4x_body:
+ movq %rdi,16(%rsp,%r9,8)
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp L$1st4x
+.p2align 4
+L$1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jb L$1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ leaq 1(%r14),%r14
+.p2align 2
+L$outer4x:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq (%rsp),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%rsp),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp L$inner4x
+.p2align 4
+L$inner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq 8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jb L$inner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 1(%r14),%r14
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%rsp,%r9,8),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ cmpq %r9,%r14
+ jb L$outer4x
+ movq 16(%rsp,%r9,8),%rdi
+ leaq -4(%r9),%r15
+ movq 0(%rsp),%rax
+ movq 8(%rsp),%rdx
+ shrq $2,%r15
+ leaq (%rsp),%rsi
+ xorq %r14,%r14
+
+ subq 0(%rcx),%rax
+ movq 16(%rsi),%rbx
+ movq 24(%rsi),%rbp
+ sbbq 8(%rcx),%rdx
+
+L$sub4x:
+ movq %rax,0(%rdi,%r14,8)
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq 32(%rsi,%r14,8),%rax
+ movq 40(%rsi,%r14,8),%rdx
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+ movq %rbp,24(%rdi,%r14,8)
+ sbbq 32(%rcx,%r14,8),%rax
+ movq 48(%rsi,%r14,8),%rbx
+ movq 56(%rsi,%r14,8),%rbp
+ sbbq 40(%rcx,%r14,8),%rdx
+ leaq 4(%r14),%r14
+ decq %r15
+ jnz L$sub4x
+
+ movq %rax,0(%rdi,%r14,8)
+ movq 32(%rsi,%r14,8),%rax
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+
+ sbbq $0,%rax
+ movq %rbp,24(%rdi,%r14,8)
+ pxor %xmm0,%xmm0
+.byte 102,72,15,110,224
+ pcmpeqd %xmm5,%xmm5
+ pshufd $0,%xmm4,%xmm4
+ movq %r9,%r15
+ pxor %xmm4,%xmm5
+ shrq $2,%r15
+ xorl %eax,%eax
+
+ jmp L$copy4x
+.p2align 4
+L$copy4x:
+ movdqa (%rsp,%rax,1),%xmm1
+ movdqu (%rdi,%rax,1),%xmm2
+ pand %xmm4,%xmm1
+ pand %xmm5,%xmm2
+ movdqa 16(%rsp,%rax,1),%xmm3
+ movdqa %xmm0,(%rsp,%rax,1)
+ por %xmm2,%xmm1
+ movdqu 16(%rdi,%rax,1),%xmm2
+ movdqu %xmm1,(%rdi,%rax,1)
+ pand %xmm4,%xmm3
+ pand %xmm5,%xmm2
+ movdqa %xmm0,16(%rsp,%rax,1)
+ por %xmm2,%xmm3
+ movdqu %xmm3,16(%rdi,%rax,1)
+ leaq 32(%rax),%rax
+ decq %r15
+ jnz L$copy4x
+ movq 8(%rsp,%r9,8),%rsi
+
+ movq $1,%rax
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
+L$mul4x_epilogue:
+ ret
+
+
+
+
+
+.globl _bn_sqr8x_mont
+.private_extern _bn_sqr8x_mont
+
+.p2align 5
+_bn_sqr8x_mont:
+
+_CET_ENDBR
+ movl %r9d,%r9d
+ movq %rsp,%rax
+
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+L$sqr8x_prologue:
+
+ movl %r9d,%r10d
+ shll $3,%r9d
+ shlq $3+2,%r10
+ negq %r9
+
+
+
+
+
+
+ leaq -64(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
+ movq (%r8),%r8
+ subq %rsi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb L$sqr8x_sp_alt
+ subq %r11,%rbp
+ leaq -64(%rbp,%r9,2),%rbp
+ jmp L$sqr8x_sp_done
+
+.p2align 5
+L$sqr8x_sp_alt:
+ leaq 4096-64(,%r9,2),%r10
+ leaq -64(%rbp,%r9,2),%rbp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rbp
+L$sqr8x_sp_done:
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja L$sqr8x_page_walk
+ jmp L$sqr8x_page_walk_done
+
+.p2align 4
+L$sqr8x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja L$sqr8x_page_walk
+L$sqr8x_page_walk_done:
+
+ movq %r9,%r10
+ negq %r9
+
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+
+L$sqr8x_body:
+
+.byte 102,72,15,110,209
+ pxor %xmm0,%xmm0
+.byte 102,72,15,110,207
+.byte 102,73,15,110,218
+ testq %rdx,%rdx
+ jz L$sqr8x_nox
+
+ call _bn_sqrx8x_internal
+
+
+
+
+ leaq (%r8,%rcx,1),%rbx
+ movq %rcx,%r9
+ movq %rcx,%rdx
+.byte 102,72,15,126,207
+ sarq $3+2,%rcx
+ jmp L$sqr8x_sub
+
+.p2align 5
+L$sqr8x_nox:
+ call _bn_sqr8x_internal
+
+
+
+
+ leaq (%rdi,%r9,1),%rbx
+ movq %r9,%rcx
+ movq %r9,%rdx
+.byte 102,72,15,126,207
+ sarq $3+2,%rcx
+ jmp L$sqr8x_sub
+
+.p2align 5
+L$sqr8x_sub:
+ movq 0(%rbx),%r12
+ movq 8(%rbx),%r13
+ movq 16(%rbx),%r14
+ movq 24(%rbx),%r15
+ leaq 32(%rbx),%rbx
+ sbbq 0(%rbp),%r12
+ sbbq 8(%rbp),%r13
+ sbbq 16(%rbp),%r14
+ sbbq 24(%rbp),%r15
+ leaq 32(%rbp),%rbp
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ leaq 32(%rdi),%rdi
+ incq %rcx
+ jnz L$sqr8x_sub
+
+ sbbq $0,%rax
+ leaq (%rbx,%r9,1),%rbx
+ leaq (%rdi,%r9,1),%rdi
+
+.byte 102,72,15,110,200
+ pxor %xmm0,%xmm0
+ pshufd $0,%xmm1,%xmm1
+ movq 40(%rsp),%rsi
+
+ jmp L$sqr8x_cond_copy
+
+.p2align 5
+L$sqr8x_cond_copy:
+ movdqa 0(%rbx),%xmm2
+ movdqa 16(%rbx),%xmm3
+ leaq 32(%rbx),%rbx
+ movdqu 0(%rdi),%xmm4
+ movdqu 16(%rdi),%xmm5
+ leaq 32(%rdi),%rdi
+ movdqa %xmm0,-32(%rbx)
+ movdqa %xmm0,-16(%rbx)
+ movdqa %xmm0,-32(%rbx,%rdx,1)
+ movdqa %xmm0,-16(%rbx,%rdx,1)
+ pcmpeqd %xmm1,%xmm0
+ pand %xmm1,%xmm2
+ pand %xmm1,%xmm3
+ pand %xmm0,%xmm4
+ pand %xmm0,%xmm5
+ pxor %xmm0,%xmm0
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqu %xmm4,-32(%rdi)
+ movdqu %xmm5,-16(%rdi)
+ addq $32,%r9
+ jnz L$sqr8x_cond_copy
+
+ movq $1,%rax
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
+L$sqr8x_epilogue:
+ ret
+
+
+.globl _bn_mulx4x_mont
+.private_extern _bn_mulx4x_mont
+
+.p2align 5
+_bn_mulx4x_mont:
+
+_CET_ENDBR
+ movq %rsp,%rax
+
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+L$mulx4x_prologue:
+
+ shll $3,%r9d
+ xorq %r10,%r10
+ subq %r9,%r10
+ movq (%r8),%r8
+ leaq -72(%rsp,%r10,1),%rbp
+ andq $-128,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja L$mulx4x_page_walk
+ jmp L$mulx4x_page_walk_done
+
+.p2align 4
+L$mulx4x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja L$mulx4x_page_walk
+L$mulx4x_page_walk_done:
+
+ leaq (%rdx,%r9,1),%r10
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %r9,0(%rsp)
+ shrq $5,%r9
+ movq %r10,16(%rsp)
+ subq $1,%r9
+ movq %r8,24(%rsp)
+ movq %rdi,32(%rsp)
+ movq %rax,40(%rsp)
+
+ movq %r9,48(%rsp)
+ jmp L$mulx4x_body
+
+.p2align 5
+L$mulx4x_body:
+ leaq 8(%rdx),%rdi
+ movq (%rdx),%rdx
+ leaq 64+32(%rsp),%rbx
+ movq %rdx,%r9
+
+ mulxq 0(%rsi),%r8,%rax
+ mulxq 8(%rsi),%r11,%r14
+ addq %rax,%r11
+ movq %rdi,8(%rsp)
+ mulxq 16(%rsi),%r12,%r13
+ adcq %r14,%r12
+ adcq $0,%r13
+
+ movq %r8,%rdi
+ imulq 24(%rsp),%r8
+ xorq %rbp,%rbp
+
+ mulxq 24(%rsi),%rax,%r14
+ movq %r8,%rdx
+ leaq 32(%rsi),%rsi
+ adcxq %rax,%r13
+ adcxq %rbp,%r14
+
+ mulxq 0(%rcx),%rax,%r10
+ adcxq %rax,%rdi
+ adoxq %r11,%r10
+ mulxq 8(%rcx),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
+ movq 48(%rsp),%rdi
+ movq %r10,-32(%rbx)
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r11,-24(%rbx)
+ adcxq %rax,%r12
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r12,-16(%rbx)
+
+ jmp L$mulx4x_1st
+
+.p2align 5
+L$mulx4x_1st:
+ adcxq %rbp,%r15
+ mulxq 0(%rsi),%r10,%rax
+ adcxq %r14,%r10
+ mulxq 8(%rsi),%r11,%r14
+ adcxq %rax,%r11
+ mulxq 16(%rsi),%r12,%rax
+ adcxq %r14,%r12
+ mulxq 24(%rsi),%r13,%r14
+.byte 0x67,0x67
+ movq %r8,%rdx
+ adcxq %rax,%r13
+ adcxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ leaq 32(%rbx),%rbx
+
+ adoxq %r15,%r10
+ mulxq 0(%rcx),%rax,%r15
+ adcxq %rax,%r10
+ adoxq %r15,%r11
+ mulxq 8(%rcx),%rax,%r15
+ adcxq %rax,%r11
+ adoxq %r15,%r12
+ mulxq 16(%rcx),%rax,%r15
+ movq %r10,-40(%rbx)
+ adcxq %rax,%r12
+ movq %r11,-32(%rbx)
+ adoxq %r15,%r13
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r12,-24(%rbx)
+ adcxq %rax,%r13
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r13,-16(%rbx)
+
+ decq %rdi
+ jnz L$mulx4x_1st
+
+ movq 0(%rsp),%rax
+ movq 8(%rsp),%rdi
+ adcq %rbp,%r15
+ addq %r15,%r14
+ sbbq %r15,%r15
+ movq %r14,-8(%rbx)
+ jmp L$mulx4x_outer
+
+.p2align 5
+L$mulx4x_outer:
+ movq (%rdi),%rdx
+ leaq 8(%rdi),%rdi
+ subq %rax,%rsi
+ movq %r15,(%rbx)
+ leaq 64+32(%rsp),%rbx
+ subq %rax,%rcx
+
+ mulxq 0(%rsi),%r8,%r11
+ xorl %ebp,%ebp
+ movq %rdx,%r9
+ mulxq 8(%rsi),%r14,%r12
+ adoxq -32(%rbx),%r8
+ adcxq %r14,%r11
+ mulxq 16(%rsi),%r15,%r13
+ adoxq -24(%rbx),%r11
+ adcxq %r15,%r12
+ adoxq -16(%rbx),%r12
+ adcxq %rbp,%r13
+ adoxq %rbp,%r13
+
+ movq %rdi,8(%rsp)
+ movq %r8,%r15
+ imulq 24(%rsp),%r8
+ xorl %ebp,%ebp
+
+ mulxq 24(%rsi),%rax,%r14
+ movq %r8,%rdx
+ adcxq %rax,%r13
+ adoxq -8(%rbx),%r13
+ adcxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ adoxq %rbp,%r14
+
+ mulxq 0(%rcx),%rax,%r10
+ adcxq %rax,%r15
+ adoxq %r11,%r10
+ mulxq 8(%rcx),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+ mulxq 16(%rcx),%rax,%r12
+ movq %r10,-32(%rbx)
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r11,-24(%rbx)
+ leaq 32(%rcx),%rcx
+ adcxq %rax,%r12
+ adoxq %rbp,%r15
+ movq 48(%rsp),%rdi
+ movq %r12,-16(%rbx)
+
+ jmp L$mulx4x_inner
+
+.p2align 5
+L$mulx4x_inner:
+ mulxq 0(%rsi),%r10,%rax
+ adcxq %rbp,%r15
+ adoxq %r14,%r10
+ mulxq 8(%rsi),%r11,%r14
+ adcxq 0(%rbx),%r10
+ adoxq %rax,%r11
+ mulxq 16(%rsi),%r12,%rax
+ adcxq 8(%rbx),%r11
+ adoxq %r14,%r12
+ mulxq 24(%rsi),%r13,%r14
+ movq %r8,%rdx
+ adcxq 16(%rbx),%r12
+ adoxq %rax,%r13
+ adcxq 24(%rbx),%r13
+ adoxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ leaq 32(%rbx),%rbx
+ adcxq %rbp,%r14
+
+ adoxq %r15,%r10
+ mulxq 0(%rcx),%rax,%r15
+ adcxq %rax,%r10
+ adoxq %r15,%r11
+ mulxq 8(%rcx),%rax,%r15
+ adcxq %rax,%r11
+ adoxq %r15,%r12
+ mulxq 16(%rcx),%rax,%r15
+ movq %r10,-40(%rbx)
+ adcxq %rax,%r12
+ adoxq %r15,%r13
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r11,-32(%rbx)
+ movq %r12,-24(%rbx)
+ adcxq %rax,%r13
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r13,-16(%rbx)
+
+ decq %rdi
+ jnz L$mulx4x_inner
+
+ movq 0(%rsp),%rax
+ movq 8(%rsp),%rdi
+ adcq %rbp,%r15
+ subq 0(%rbx),%rbp
+ adcq %r15,%r14
+ sbbq %r15,%r15
+ movq %r14,-8(%rbx)
+
+ cmpq 16(%rsp),%rdi
+ jne L$mulx4x_outer
+
+ leaq 64(%rsp),%rbx
+ subq %rax,%rcx
+ negq %r15
+ movq %rax,%rdx
+ shrq $3+2,%rax
+ movq 32(%rsp),%rdi
+ jmp L$mulx4x_sub
+
+.p2align 5
+L$mulx4x_sub:
+ movq 0(%rbx),%r11
+ movq 8(%rbx),%r12
+ movq 16(%rbx),%r13
+ movq 24(%rbx),%r14
+ leaq 32(%rbx),%rbx
+ sbbq 0(%rcx),%r11
+ sbbq 8(%rcx),%r12
+ sbbq 16(%rcx),%r13
+ sbbq 24(%rcx),%r14
+ leaq 32(%rcx),%rcx
+ movq %r11,0(%rdi)
+ movq %r12,8(%rdi)
+ movq %r13,16(%rdi)
+ movq %r14,24(%rdi)
+ leaq 32(%rdi),%rdi
+ decq %rax
+ jnz L$mulx4x_sub
+
+ sbbq $0,%r15
+ leaq 64(%rsp),%rbx
+ subq %rdx,%rdi
+
+.byte 102,73,15,110,207
+ pxor %xmm0,%xmm0
+ pshufd $0,%xmm1,%xmm1
+ movq 40(%rsp),%rsi
+
+ jmp L$mulx4x_cond_copy
+
+.p2align 5
+L$mulx4x_cond_copy:
+ movdqa 0(%rbx),%xmm2
+ movdqa 16(%rbx),%xmm3
+ leaq 32(%rbx),%rbx
+ movdqu 0(%rdi),%xmm4
+ movdqu 16(%rdi),%xmm5
+ leaq 32(%rdi),%rdi
+ movdqa %xmm0,-32(%rbx)
+ movdqa %xmm0,-16(%rbx)
+ pcmpeqd %xmm1,%xmm0
+ pand %xmm1,%xmm2
+ pand %xmm1,%xmm3
+ pand %xmm0,%xmm4
+ pand %xmm0,%xmm5
+ pxor %xmm0,%xmm0
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqu %xmm4,-32(%rdi)
+ movdqu %xmm5,-16(%rdi)
+ subq $32,%rdx
+ jnz L$mulx4x_cond_copy
+
+ movq %rdx,(%rbx)
+
+ movq $1,%rax
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
+L$mulx4x_epilogue:
+ ret
+
+
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align 4
+#endif
diff --git a/gen/bcm/x86_64-mont-linux.S b/gen/bcm/x86_64-mont-linux.S
new file mode 100644
index 0000000..02b282d
--- /dev/null
+++ b/gen/bcm/x86_64-mont-linux.S
@@ -0,0 +1,1237 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text
+
+.globl bn_mul_mont_nohw
+.hidden bn_mul_mont_nohw
+.type bn_mul_mont_nohw,@function
+.align 16
+bn_mul_mont_nohw:
+.cfi_startproc
+_CET_ENDBR
+ movl %r9d,%r9d
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+
+ negq %r9
+ movq %rsp,%r11
+ leaq -16(%rsp,%r9,8),%r10
+ negq %r9
+ andq $-1024,%r10
+
+
+
+
+
+
+
+
+
+ subq %r10,%r11
+ andq $-4096,%r11
+ leaq (%r10,%r11,1),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul_page_walk
+ jmp .Lmul_page_walk_done
+
+.align 16
+.Lmul_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul_page_walk
+.Lmul_page_walk_done:
+
+ movq %rax,8(%rsp,%r9,8)
+.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
+.Lmul_body:
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.L1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .L1st
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp .Louter
+.align 16
+.Louter:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.Linner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .Linner
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jb .Louter
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ movq %r9,%r15
+
+.align 16
+.Lsub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsp,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz .Lsub
+
+ sbbq $0,%rax
+ movq $-1,%rbx
+ xorq %rax,%rbx
+ xorq %r14,%r14
+ movq %r9,%r15
+
+.Lcopy:
+ movq (%rdi,%r14,8),%rcx
+ movq (%rsp,%r14,8),%rdx
+ andq %rbx,%rcx
+ andq %rax,%rdx
+ movq %r9,(%rsp,%r14,8)
+ orq %rcx,%rdx
+ movq %rdx,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz .Lcopy
+
+ movq 8(%rsp,%r9,8),%rsi
+.cfi_def_cfa %rsi,8
+ movq $1,%rax
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lmul_epilogue:
+ ret
+.cfi_endproc
+.size bn_mul_mont_nohw,.-bn_mul_mont_nohw
+.globl bn_mul4x_mont
+.hidden bn_mul4x_mont
+.type bn_mul4x_mont,@function
+.align 16
+bn_mul4x_mont:
+.cfi_startproc
+_CET_ENDBR
+ movl %r9d,%r9d
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+
+ negq %r9
+ movq %rsp,%r11
+ leaq -32(%rsp,%r9,8),%r10
+ negq %r9
+ andq $-1024,%r10
+
+ subq %r10,%r11
+ andq $-4096,%r11
+ leaq (%r10,%r11,1),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul4x_page_walk
+ jmp .Lmul4x_page_walk_done
+
+.Lmul4x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul4x_page_walk
+.Lmul4x_page_walk_done:
+
+ movq %rax,8(%rsp,%r9,8)
+.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
+.Lmul4x_body:
+ movq %rdi,16(%rsp,%r9,8)
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jb .L1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ leaq 1(%r14),%r14
+.align 4
+.Louter4x:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq (%rsp),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%rsp),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq 8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jb .Linner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 1(%r14),%r14
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%rsp,%r9,8),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ cmpq %r9,%r14
+ jb .Louter4x
+ movq 16(%rsp,%r9,8),%rdi
+ leaq -4(%r9),%r15
+ movq 0(%rsp),%rax
+ movq 8(%rsp),%rdx
+ shrq $2,%r15
+ leaq (%rsp),%rsi
+ xorq %r14,%r14
+
+ subq 0(%rcx),%rax
+ movq 16(%rsi),%rbx
+ movq 24(%rsi),%rbp
+ sbbq 8(%rcx),%rdx
+
+.Lsub4x:
+ movq %rax,0(%rdi,%r14,8)
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq 32(%rsi,%r14,8),%rax
+ movq 40(%rsi,%r14,8),%rdx
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+ movq %rbp,24(%rdi,%r14,8)
+ sbbq 32(%rcx,%r14,8),%rax
+ movq 48(%rsi,%r14,8),%rbx
+ movq 56(%rsi,%r14,8),%rbp
+ sbbq 40(%rcx,%r14,8),%rdx
+ leaq 4(%r14),%r14
+ decq %r15
+ jnz .Lsub4x
+
+ movq %rax,0(%rdi,%r14,8)
+ movq 32(%rsi,%r14,8),%rax
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+
+ sbbq $0,%rax
+ movq %rbp,24(%rdi,%r14,8)
+ pxor %xmm0,%xmm0
+.byte 102,72,15,110,224
+ pcmpeqd %xmm5,%xmm5
+ pshufd $0,%xmm4,%xmm4
+ movq %r9,%r15
+ pxor %xmm4,%xmm5
+ shrq $2,%r15
+ xorl %eax,%eax
+
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x:
+ movdqa (%rsp,%rax,1),%xmm1
+ movdqu (%rdi,%rax,1),%xmm2
+ pand %xmm4,%xmm1
+ pand %xmm5,%xmm2
+ movdqa 16(%rsp,%rax,1),%xmm3
+ movdqa %xmm0,(%rsp,%rax,1)
+ por %xmm2,%xmm1
+ movdqu 16(%rdi,%rax,1),%xmm2
+ movdqu %xmm1,(%rdi,%rax,1)
+ pand %xmm4,%xmm3
+ pand %xmm5,%xmm2
+ movdqa %xmm0,16(%rsp,%rax,1)
+ por %xmm2,%xmm3
+ movdqu %xmm3,16(%rdi,%rax,1)
+ leaq 32(%rax),%rax
+ decq %r15
+ jnz .Lcopy4x
+ movq 8(%rsp,%r9,8),%rsi
+.cfi_def_cfa %rsi, 8
+ movq $1,%rax
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lmul4x_epilogue:
+ ret
+.cfi_endproc
+.size bn_mul4x_mont,.-bn_mul4x_mont
+.extern bn_sqrx8x_internal
+.hidden bn_sqrx8x_internal
+.extern bn_sqr8x_internal
+.hidden bn_sqr8x_internal
+
+.globl bn_sqr8x_mont
+.hidden bn_sqr8x_mont
+.type bn_sqr8x_mont,@function
+.align 32
+bn_sqr8x_mont:
+.cfi_startproc
+_CET_ENDBR
+ movl %r9d,%r9d
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+.Lsqr8x_prologue:
+
+ movl %r9d,%r10d
+ shll $3,%r9d
+ shlq $3+2,%r10
+ negq %r9
+
+
+
+
+
+
+ leaq -64(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
+ movq (%r8),%r8
+ subq %rsi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb .Lsqr8x_sp_alt
+ subq %r11,%rbp
+ leaq -64(%rbp,%r9,2),%rbp
+ jmp .Lsqr8x_sp_done
+
+.align 32
+.Lsqr8x_sp_alt:
+ leaq 4096-64(,%r9,2),%r10
+ leaq -64(%rbp,%r9,2),%rbp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rbp
+.Lsqr8x_sp_done:
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lsqr8x_page_walk
+ jmp .Lsqr8x_page_walk_done
+
+.align 16
+.Lsqr8x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lsqr8x_page_walk
+.Lsqr8x_page_walk_done:
+
+ movq %r9,%r10
+ negq %r9
+
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lsqr8x_body:
+
+.byte 102,72,15,110,209
+ pxor %xmm0,%xmm0
+.byte 102,72,15,110,207
+.byte 102,73,15,110,218
+ testq %rdx,%rdx
+ jz .Lsqr8x_nox
+
+ call bn_sqrx8x_internal
+
+
+
+
+ leaq (%r8,%rcx,1),%rbx
+ movq %rcx,%r9
+ movq %rcx,%rdx
+.byte 102,72,15,126,207
+ sarq $3+2,%rcx
+ jmp .Lsqr8x_sub
+
+.align 32
+.Lsqr8x_nox:
+ call bn_sqr8x_internal
+
+
+
+
+ leaq (%rdi,%r9,1),%rbx
+ movq %r9,%rcx
+ movq %r9,%rdx
+.byte 102,72,15,126,207
+ sarq $3+2,%rcx
+ jmp .Lsqr8x_sub
+
+.align 32
+.Lsqr8x_sub:
+ movq 0(%rbx),%r12
+ movq 8(%rbx),%r13
+ movq 16(%rbx),%r14
+ movq 24(%rbx),%r15
+ leaq 32(%rbx),%rbx
+ sbbq 0(%rbp),%r12
+ sbbq 8(%rbp),%r13
+ sbbq 16(%rbp),%r14
+ sbbq 24(%rbp),%r15
+ leaq 32(%rbp),%rbp
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ leaq 32(%rdi),%rdi
+ incq %rcx
+ jnz .Lsqr8x_sub
+
+ sbbq $0,%rax
+ leaq (%rbx,%r9,1),%rbx
+ leaq (%rdi,%r9,1),%rdi
+
+.byte 102,72,15,110,200
+ pxor %xmm0,%xmm0
+ pshufd $0,%xmm1,%xmm1
+ movq 40(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ jmp .Lsqr8x_cond_copy
+
+.align 32
+.Lsqr8x_cond_copy:
+ movdqa 0(%rbx),%xmm2
+ movdqa 16(%rbx),%xmm3
+ leaq 32(%rbx),%rbx
+ movdqu 0(%rdi),%xmm4
+ movdqu 16(%rdi),%xmm5
+ leaq 32(%rdi),%rdi
+ movdqa %xmm0,-32(%rbx)
+ movdqa %xmm0,-16(%rbx)
+ movdqa %xmm0,-32(%rbx,%rdx,1)
+ movdqa %xmm0,-16(%rbx,%rdx,1)
+ pcmpeqd %xmm1,%xmm0
+ pand %xmm1,%xmm2
+ pand %xmm1,%xmm3
+ pand %xmm0,%xmm4
+ pand %xmm0,%xmm5
+ pxor %xmm0,%xmm0
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqu %xmm4,-32(%rdi)
+ movdqu %xmm5,-16(%rdi)
+ addq $32,%r9
+ jnz .Lsqr8x_cond_copy
+
+ movq $1,%rax
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lsqr8x_epilogue:
+ ret
+.cfi_endproc
+.size bn_sqr8x_mont,.-bn_sqr8x_mont
+.globl bn_mulx4x_mont
+.hidden bn_mulx4x_mont
+.type bn_mulx4x_mont,@function
+.align 32
+bn_mulx4x_mont:
+.cfi_startproc
+_CET_ENDBR
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+.Lmulx4x_prologue:
+
+ shll $3,%r9d
+ xorq %r10,%r10
+ subq %r9,%r10
+ movq (%r8),%r8
+ leaq -72(%rsp,%r10,1),%rbp
+ andq $-128,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmulx4x_page_walk
+ jmp .Lmulx4x_page_walk_done
+
+.align 16
+.Lmulx4x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
+
+ leaq (%rdx,%r9,1),%r10
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %r9,0(%rsp)
+ shrq $5,%r9
+ movq %r10,16(%rsp)
+ subq $1,%r9
+ movq %r8,24(%rsp)
+ movq %rdi,32(%rsp)
+ movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+ movq %r9,48(%rsp)
+ jmp .Lmulx4x_body
+
+.align 32
+.Lmulx4x_body:
+ leaq 8(%rdx),%rdi
+ movq (%rdx),%rdx
+ leaq 64+32(%rsp),%rbx
+ movq %rdx,%r9
+
+ mulxq 0(%rsi),%r8,%rax
+ mulxq 8(%rsi),%r11,%r14
+ addq %rax,%r11
+ movq %rdi,8(%rsp)
+ mulxq 16(%rsi),%r12,%r13
+ adcq %r14,%r12
+ adcq $0,%r13
+
+ movq %r8,%rdi
+ imulq 24(%rsp),%r8
+ xorq %rbp,%rbp
+
+ mulxq 24(%rsi),%rax,%r14
+ movq %r8,%rdx
+ leaq 32(%rsi),%rsi
+ adcxq %rax,%r13
+ adcxq %rbp,%r14
+
+ mulxq 0(%rcx),%rax,%r10
+ adcxq %rax,%rdi
+ adoxq %r11,%r10
+ mulxq 8(%rcx),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
+ movq 48(%rsp),%rdi
+ movq %r10,-32(%rbx)
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r11,-24(%rbx)
+ adcxq %rax,%r12
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r12,-16(%rbx)
+
+ jmp .Lmulx4x_1st
+
+.align 32
+.Lmulx4x_1st:
+ adcxq %rbp,%r15
+ mulxq 0(%rsi),%r10,%rax
+ adcxq %r14,%r10
+ mulxq 8(%rsi),%r11,%r14
+ adcxq %rax,%r11
+ mulxq 16(%rsi),%r12,%rax
+ adcxq %r14,%r12
+ mulxq 24(%rsi),%r13,%r14
+.byte 0x67,0x67
+ movq %r8,%rdx
+ adcxq %rax,%r13
+ adcxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ leaq 32(%rbx),%rbx
+
+ adoxq %r15,%r10
+ mulxq 0(%rcx),%rax,%r15
+ adcxq %rax,%r10
+ adoxq %r15,%r11
+ mulxq 8(%rcx),%rax,%r15
+ adcxq %rax,%r11
+ adoxq %r15,%r12
+ mulxq 16(%rcx),%rax,%r15
+ movq %r10,-40(%rbx)
+ adcxq %rax,%r12
+ movq %r11,-32(%rbx)
+ adoxq %r15,%r13
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r12,-24(%rbx)
+ adcxq %rax,%r13
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r13,-16(%rbx)
+
+ decq %rdi
+ jnz .Lmulx4x_1st
+
+ movq 0(%rsp),%rax
+ movq 8(%rsp),%rdi
+ adcq %rbp,%r15
+ addq %r15,%r14
+ sbbq %r15,%r15
+ movq %r14,-8(%rbx)
+ jmp .Lmulx4x_outer
+
+.align 32
+.Lmulx4x_outer:
+ movq (%rdi),%rdx
+ leaq 8(%rdi),%rdi
+ subq %rax,%rsi
+ movq %r15,(%rbx)
+ leaq 64+32(%rsp),%rbx
+ subq %rax,%rcx
+
+ mulxq 0(%rsi),%r8,%r11
+ xorl %ebp,%ebp
+ movq %rdx,%r9
+ mulxq 8(%rsi),%r14,%r12
+ adoxq -32(%rbx),%r8
+ adcxq %r14,%r11
+ mulxq 16(%rsi),%r15,%r13
+ adoxq -24(%rbx),%r11
+ adcxq %r15,%r12
+ adoxq -16(%rbx),%r12
+ adcxq %rbp,%r13
+ adoxq %rbp,%r13
+
+ movq %rdi,8(%rsp)
+ movq %r8,%r15
+ imulq 24(%rsp),%r8
+ xorl %ebp,%ebp
+
+ mulxq 24(%rsi),%rax,%r14
+ movq %r8,%rdx
+ adcxq %rax,%r13
+ adoxq -8(%rbx),%r13
+ adcxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ adoxq %rbp,%r14
+
+ mulxq 0(%rcx),%rax,%r10
+ adcxq %rax,%r15
+ adoxq %r11,%r10
+ mulxq 8(%rcx),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+ mulxq 16(%rcx),%rax,%r12
+ movq %r10,-32(%rbx)
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r11,-24(%rbx)
+ leaq 32(%rcx),%rcx
+ adcxq %rax,%r12
+ adoxq %rbp,%r15
+ movq 48(%rsp),%rdi
+ movq %r12,-16(%rbx)
+
+ jmp .Lmulx4x_inner
+
+.align 32
+.Lmulx4x_inner:
+ mulxq 0(%rsi),%r10,%rax
+ adcxq %rbp,%r15
+ adoxq %r14,%r10
+ mulxq 8(%rsi),%r11,%r14
+ adcxq 0(%rbx),%r10
+ adoxq %rax,%r11
+ mulxq 16(%rsi),%r12,%rax
+ adcxq 8(%rbx),%r11
+ adoxq %r14,%r12
+ mulxq 24(%rsi),%r13,%r14
+ movq %r8,%rdx
+ adcxq 16(%rbx),%r12
+ adoxq %rax,%r13
+ adcxq 24(%rbx),%r13
+ adoxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ leaq 32(%rbx),%rbx
+ adcxq %rbp,%r14
+
+ adoxq %r15,%r10
+ mulxq 0(%rcx),%rax,%r15
+ adcxq %rax,%r10
+ adoxq %r15,%r11
+ mulxq 8(%rcx),%rax,%r15
+ adcxq %rax,%r11
+ adoxq %r15,%r12
+ mulxq 16(%rcx),%rax,%r15
+ movq %r10,-40(%rbx)
+ adcxq %rax,%r12
+ adoxq %r15,%r13
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r11,-32(%rbx)
+ movq %r12,-24(%rbx)
+ adcxq %rax,%r13
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r13,-16(%rbx)
+
+ decq %rdi
+ jnz .Lmulx4x_inner
+
+ movq 0(%rsp),%rax
+ movq 8(%rsp),%rdi
+ adcq %rbp,%r15
+ subq 0(%rbx),%rbp
+ adcq %r15,%r14
+ sbbq %r15,%r15
+ movq %r14,-8(%rbx)
+
+ cmpq 16(%rsp),%rdi
+ jne .Lmulx4x_outer
+
+ leaq 64(%rsp),%rbx
+ subq %rax,%rcx
+ negq %r15
+ movq %rax,%rdx
+ shrq $3+2,%rax
+ movq 32(%rsp),%rdi
+ jmp .Lmulx4x_sub
+
+.align 32
+.Lmulx4x_sub:
+ movq 0(%rbx),%r11
+ movq 8(%rbx),%r12
+ movq 16(%rbx),%r13
+ movq 24(%rbx),%r14
+ leaq 32(%rbx),%rbx
+ sbbq 0(%rcx),%r11
+ sbbq 8(%rcx),%r12
+ sbbq 16(%rcx),%r13
+ sbbq 24(%rcx),%r14
+ leaq 32(%rcx),%rcx
+ movq %r11,0(%rdi)
+ movq %r12,8(%rdi)
+ movq %r13,16(%rdi)
+ movq %r14,24(%rdi)
+ leaq 32(%rdi),%rdi
+ decq %rax
+ jnz .Lmulx4x_sub
+
+ sbbq $0,%r15
+ leaq 64(%rsp),%rbx
+ subq %rdx,%rdi
+
+.byte 102,73,15,110,207
+ pxor %xmm0,%xmm0
+ pshufd $0,%xmm1,%xmm1
+ movq 40(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ jmp .Lmulx4x_cond_copy
+
+.align 32
+.Lmulx4x_cond_copy:
+ movdqa 0(%rbx),%xmm2
+ movdqa 16(%rbx),%xmm3
+ leaq 32(%rbx),%rbx
+ movdqu 0(%rdi),%xmm4
+ movdqu 16(%rdi),%xmm5
+ leaq 32(%rdi),%rdi
+ movdqa %xmm0,-32(%rbx)
+ movdqa %xmm0,-16(%rbx)
+ pcmpeqd %xmm1,%xmm0
+ pand %xmm1,%xmm2
+ pand %xmm1,%xmm3
+ pand %xmm0,%xmm4
+ pand %xmm0,%xmm5
+ pxor %xmm0,%xmm0
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqu %xmm4,-32(%rdi)
+ movdqu %xmm5,-16(%rdi)
+ subq $32,%rdx
+ jnz .Lmulx4x_cond_copy
+
+ movq %rdx,(%rbx)
+
+ movq $1,%rax
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lmulx4x_epilogue:
+ ret
+.cfi_endproc
+.size bn_mulx4x_mont,.-bn_mulx4x_mont
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 16
+#endif
diff --git a/gen/bcm/x86_64-mont-win.asm b/gen/bcm/x86_64-mont-win.asm
new file mode 100644
index 0000000..b0611fc
--- /dev/null
+++ b/gen/bcm/x86_64-mont-win.asm
@@ -0,0 +1,1470 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .text code align=64
+
+
+global bn_mul_mont_nohw
+
+ALIGN 16
+bn_mul_mont_nohw:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_bn_mul_mont_nohw:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+ mov r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+ mov r9d,r9d
+ mov rax,rsp
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+
+ neg r9
+ mov r11,rsp
+ lea r10,[((-16))+r9*8+rsp]
+ neg r9
+ and r10,-1024
+
+
+
+
+
+
+
+
+
+ sub r11,r10
+ and r11,-4096
+ lea rsp,[r11*1+r10]
+ mov r11,QWORD[rsp]
+ cmp rsp,r10
+ ja NEAR $L$mul_page_walk
+ jmp NEAR $L$mul_page_walk_done
+
+ALIGN 16
+$L$mul_page_walk:
+ lea rsp,[((-4096))+rsp]
+ mov r11,QWORD[rsp]
+ cmp rsp,r10
+ ja NEAR $L$mul_page_walk
+$L$mul_page_walk_done:
+
+ mov QWORD[8+r9*8+rsp],rax
+
+$L$mul_body:
+ mov r12,rdx
+ mov r8,QWORD[r8]
+ mov rbx,QWORD[r12]
+ mov rax,QWORD[rsi]
+
+ xor r14,r14
+ xor r15,r15
+
+ mov rbp,r8
+ mul rbx
+ mov r10,rax
+ mov rax,QWORD[rcx]
+
+ imul rbp,r10
+ mov r11,rdx
+
+ mul rbp
+ add r10,rax
+ mov rax,QWORD[8+rsi]
+ adc rdx,0
+ mov r13,rdx
+
+ lea r15,[1+r15]
+ jmp NEAR $L$1st_enter
+
+ALIGN 16
+$L$1st:
+ add r13,rax
+ mov rax,QWORD[r15*8+rsi]
+ adc rdx,0
+ add r13,r11
+ mov r11,r10
+ adc rdx,0
+ mov QWORD[((-16))+r15*8+rsp],r13
+ mov r13,rdx
+
+$L$1st_enter:
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[r15*8+rcx]
+ adc rdx,0
+ lea r15,[1+r15]
+ mov r10,rdx
+
+ mul rbp
+ cmp r15,r9
+ jne NEAR $L$1st
+
+ add r13,rax
+ mov rax,QWORD[rsi]
+ adc rdx,0
+ add r13,r11
+ adc rdx,0
+ mov QWORD[((-16))+r15*8+rsp],r13
+ mov r13,rdx
+ mov r11,r10
+
+ xor rdx,rdx
+ add r13,r11
+ adc rdx,0
+ mov QWORD[((-8))+r9*8+rsp],r13
+ mov QWORD[r9*8+rsp],rdx
+
+ lea r14,[1+r14]
+ jmp NEAR $L$outer
+ALIGN 16
+$L$outer:
+ mov rbx,QWORD[r14*8+r12]
+ xor r15,r15
+ mov rbp,r8
+ mov r10,QWORD[rsp]
+ mul rbx
+ add r10,rax
+ mov rax,QWORD[rcx]
+ adc rdx,0
+
+ imul rbp,r10
+ mov r11,rdx
+
+ mul rbp
+ add r10,rax
+ mov rax,QWORD[8+rsi]
+ adc rdx,0
+ mov r10,QWORD[8+rsp]
+ mov r13,rdx
+
+ lea r15,[1+r15]
+ jmp NEAR $L$inner_enter
+
+ALIGN 16
+$L$inner:
+ add r13,rax
+ mov rax,QWORD[r15*8+rsi]
+ adc rdx,0
+ add r13,r10
+ mov r10,QWORD[r15*8+rsp]
+ adc rdx,0
+ mov QWORD[((-16))+r15*8+rsp],r13
+ mov r13,rdx
+
+$L$inner_enter:
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[r15*8+rcx]
+ adc rdx,0
+ add r10,r11
+ mov r11,rdx
+ adc r11,0
+ lea r15,[1+r15]
+
+ mul rbp
+ cmp r15,r9
+ jne NEAR $L$inner
+
+ add r13,rax
+ mov rax,QWORD[rsi]
+ adc rdx,0
+ add r13,r10
+ mov r10,QWORD[r15*8+rsp]
+ adc rdx,0
+ mov QWORD[((-16))+r15*8+rsp],r13
+ mov r13,rdx
+
+ xor rdx,rdx
+ add r13,r11
+ adc rdx,0
+ add r13,r10
+ adc rdx,0
+ mov QWORD[((-8))+r9*8+rsp],r13
+ mov QWORD[r9*8+rsp],rdx
+
+ lea r14,[1+r14]
+ cmp r14,r9
+ jb NEAR $L$outer
+
+ xor r14,r14
+ mov rax,QWORD[rsp]
+ mov r15,r9
+
+ALIGN 16
+$L$sub: sbb rax,QWORD[r14*8+rcx]
+ mov QWORD[r14*8+rdi],rax
+ mov rax,QWORD[8+r14*8+rsp]
+ lea r14,[1+r14]
+ dec r15
+ jnz NEAR $L$sub
+
+ sbb rax,0
+ mov rbx,-1
+ xor rbx,rax
+ xor r14,r14
+ mov r15,r9
+
+$L$copy:
+ mov rcx,QWORD[r14*8+rdi]
+ mov rdx,QWORD[r14*8+rsp]
+ and rcx,rbx
+ and rdx,rax
+ mov QWORD[r14*8+rsp],r9
+ or rdx,rcx
+ mov QWORD[r14*8+rdi],rdx
+ lea r14,[1+r14]
+ sub r15,1
+ jnz NEAR $L$copy
+
+ mov rsi,QWORD[8+r9*8+rsp]
+
+ mov rax,1
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$mul_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_bn_mul_mont_nohw:
+global bn_mul4x_mont
+
+ALIGN 16
+bn_mul4x_mont:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_bn_mul4x_mont:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+ mov r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+ mov r9d,r9d
+ mov rax,rsp
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+
+ neg r9
+ mov r11,rsp
+ lea r10,[((-32))+r9*8+rsp]
+ neg r9
+ and r10,-1024
+
+ sub r11,r10
+ and r11,-4096
+ lea rsp,[r11*1+r10]
+ mov r11,QWORD[rsp]
+ cmp rsp,r10
+ ja NEAR $L$mul4x_page_walk
+ jmp NEAR $L$mul4x_page_walk_done
+
+$L$mul4x_page_walk:
+ lea rsp,[((-4096))+rsp]
+ mov r11,QWORD[rsp]
+ cmp rsp,r10
+ ja NEAR $L$mul4x_page_walk
+$L$mul4x_page_walk_done:
+
+ mov QWORD[8+r9*8+rsp],rax
+
+$L$mul4x_body:
+ mov QWORD[16+r9*8+rsp],rdi
+ mov r12,rdx
+ mov r8,QWORD[r8]
+ mov rbx,QWORD[r12]
+ mov rax,QWORD[rsi]
+
+ xor r14,r14
+ xor r15,r15
+
+ mov rbp,r8
+ mul rbx
+ mov r10,rax
+ mov rax,QWORD[rcx]
+
+ imul rbp,r10
+ mov r11,rdx
+
+ mul rbp
+ add r10,rax
+ mov rax,QWORD[8+rsi]
+ adc rdx,0
+ mov rdi,rdx
+
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[8+rcx]
+ adc rdx,0
+ mov r10,rdx
+
+ mul rbp
+ add rdi,rax
+ mov rax,QWORD[16+rsi]
+ adc rdx,0
+ add rdi,r11
+ lea r15,[4+r15]
+ adc rdx,0
+ mov QWORD[rsp],rdi
+ mov r13,rdx
+ jmp NEAR $L$1st4x
+ALIGN 16
+$L$1st4x:
+ mul rbx
+ add r10,rax
+ mov rax,QWORD[((-16))+r15*8+rcx]
+ adc rdx,0
+ mov r11,rdx
+
+ mul rbp
+ add r13,rax
+ mov rax,QWORD[((-8))+r15*8+rsi]
+ adc rdx,0
+ add r13,r10
+ adc rdx,0
+ mov QWORD[((-24))+r15*8+rsp],r13
+ mov rdi,rdx
+
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[((-8))+r15*8+rcx]
+ adc rdx,0
+ mov r10,rdx
+
+ mul rbp
+ add rdi,rax
+ mov rax,QWORD[r15*8+rsi]
+ adc rdx,0
+ add rdi,r11
+ adc rdx,0
+ mov QWORD[((-16))+r15*8+rsp],rdi
+ mov r13,rdx
+
+ mul rbx
+ add r10,rax
+ mov rax,QWORD[r15*8+rcx]
+ adc rdx,0
+ mov r11,rdx
+
+ mul rbp
+ add r13,rax
+ mov rax,QWORD[8+r15*8+rsi]
+ adc rdx,0
+ add r13,r10
+ adc rdx,0
+ mov QWORD[((-8))+r15*8+rsp],r13
+ mov rdi,rdx
+
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[8+r15*8+rcx]
+ adc rdx,0
+ lea r15,[4+r15]
+ mov r10,rdx
+
+ mul rbp
+ add rdi,rax
+ mov rax,QWORD[((-16))+r15*8+rsi]
+ adc rdx,0
+ add rdi,r11
+ adc rdx,0
+ mov QWORD[((-32))+r15*8+rsp],rdi
+ mov r13,rdx
+ cmp r15,r9
+ jb NEAR $L$1st4x
+
+ mul rbx
+ add r10,rax
+ mov rax,QWORD[((-16))+r15*8+rcx]
+ adc rdx,0
+ mov r11,rdx
+
+ mul rbp
+ add r13,rax
+ mov rax,QWORD[((-8))+r15*8+rsi]
+ adc rdx,0
+ add r13,r10
+ adc rdx,0
+ mov QWORD[((-24))+r15*8+rsp],r13
+ mov rdi,rdx
+
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[((-8))+r15*8+rcx]
+ adc rdx,0
+ mov r10,rdx
+
+ mul rbp
+ add rdi,rax
+ mov rax,QWORD[rsi]
+ adc rdx,0
+ add rdi,r11
+ adc rdx,0
+ mov QWORD[((-16))+r15*8+rsp],rdi
+ mov r13,rdx
+
+ xor rdi,rdi
+ add r13,r10
+ adc rdi,0
+ mov QWORD[((-8))+r15*8+rsp],r13
+ mov QWORD[r15*8+rsp],rdi
+
+ lea r14,[1+r14]
+ALIGN 4
+$L$outer4x:
+ mov rbx,QWORD[r14*8+r12]
+ xor r15,r15
+ mov r10,QWORD[rsp]
+ mov rbp,r8
+ mul rbx
+ add r10,rax
+ mov rax,QWORD[rcx]
+ adc rdx,0
+
+ imul rbp,r10
+ mov r11,rdx
+
+ mul rbp
+ add r10,rax
+ mov rax,QWORD[8+rsi]
+ adc rdx,0
+ mov rdi,rdx
+
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[8+rcx]
+ adc rdx,0
+ add r11,QWORD[8+rsp]
+ adc rdx,0
+ mov r10,rdx
+
+ mul rbp
+ add rdi,rax
+ mov rax,QWORD[16+rsi]
+ adc rdx,0
+ add rdi,r11
+ lea r15,[4+r15]
+ adc rdx,0
+ mov QWORD[rsp],rdi
+ mov r13,rdx
+ jmp NEAR $L$inner4x
+ALIGN 16
+$L$inner4x:
+ mul rbx
+ add r10,rax
+ mov rax,QWORD[((-16))+r15*8+rcx]
+ adc rdx,0
+ add r10,QWORD[((-16))+r15*8+rsp]
+ adc rdx,0
+ mov r11,rdx
+
+ mul rbp
+ add r13,rax
+ mov rax,QWORD[((-8))+r15*8+rsi]
+ adc rdx,0
+ add r13,r10
+ adc rdx,0
+ mov QWORD[((-24))+r15*8+rsp],r13
+ mov rdi,rdx
+
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[((-8))+r15*8+rcx]
+ adc rdx,0
+ add r11,QWORD[((-8))+r15*8+rsp]
+ adc rdx,0
+ mov r10,rdx
+
+ mul rbp
+ add rdi,rax
+ mov rax,QWORD[r15*8+rsi]
+ adc rdx,0
+ add rdi,r11
+ adc rdx,0
+ mov QWORD[((-16))+r15*8+rsp],rdi
+ mov r13,rdx
+
+ mul rbx
+ add r10,rax
+ mov rax,QWORD[r15*8+rcx]
+ adc rdx,0
+ add r10,QWORD[r15*8+rsp]
+ adc rdx,0
+ mov r11,rdx
+
+ mul rbp
+ add r13,rax
+ mov rax,QWORD[8+r15*8+rsi]
+ adc rdx,0
+ add r13,r10
+ adc rdx,0
+ mov QWORD[((-8))+r15*8+rsp],r13
+ mov rdi,rdx
+
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[8+r15*8+rcx]
+ adc rdx,0
+ add r11,QWORD[8+r15*8+rsp]
+ adc rdx,0
+ lea r15,[4+r15]
+ mov r10,rdx
+
+ mul rbp
+ add rdi,rax
+ mov rax,QWORD[((-16))+r15*8+rsi]
+ adc rdx,0
+ add rdi,r11
+ adc rdx,0
+ mov QWORD[((-32))+r15*8+rsp],rdi
+ mov r13,rdx
+ cmp r15,r9
+ jb NEAR $L$inner4x
+
+ mul rbx
+ add r10,rax
+ mov rax,QWORD[((-16))+r15*8+rcx]
+ adc rdx,0
+ add r10,QWORD[((-16))+r15*8+rsp]
+ adc rdx,0
+ mov r11,rdx
+
+ mul rbp
+ add r13,rax
+ mov rax,QWORD[((-8))+r15*8+rsi]
+ adc rdx,0
+ add r13,r10
+ adc rdx,0
+ mov QWORD[((-24))+r15*8+rsp],r13
+ mov rdi,rdx
+
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[((-8))+r15*8+rcx]
+ adc rdx,0
+ add r11,QWORD[((-8))+r15*8+rsp]
+ adc rdx,0
+ lea r14,[1+r14]
+ mov r10,rdx
+
+ mul rbp
+ add rdi,rax
+ mov rax,QWORD[rsi]
+ adc rdx,0
+ add rdi,r11
+ adc rdx,0
+ mov QWORD[((-16))+r15*8+rsp],rdi
+ mov r13,rdx
+
+ xor rdi,rdi
+ add r13,r10
+ adc rdi,0
+ add r13,QWORD[r9*8+rsp]
+ adc rdi,0
+ mov QWORD[((-8))+r15*8+rsp],r13
+ mov QWORD[r15*8+rsp],rdi
+
+ cmp r14,r9
+ jb NEAR $L$outer4x
+ mov rdi,QWORD[16+r9*8+rsp]
+ lea r15,[((-4))+r9]
+ mov rax,QWORD[rsp]
+ mov rdx,QWORD[8+rsp]
+ shr r15,2
+ lea rsi,[rsp]
+ xor r14,r14
+
+ sub rax,QWORD[rcx]
+ mov rbx,QWORD[16+rsi]
+ mov rbp,QWORD[24+rsi]
+ sbb rdx,QWORD[8+rcx]
+
+$L$sub4x:
+ mov QWORD[r14*8+rdi],rax
+ mov QWORD[8+r14*8+rdi],rdx
+ sbb rbx,QWORD[16+r14*8+rcx]
+ mov rax,QWORD[32+r14*8+rsi]
+ mov rdx,QWORD[40+r14*8+rsi]
+ sbb rbp,QWORD[24+r14*8+rcx]
+ mov QWORD[16+r14*8+rdi],rbx
+ mov QWORD[24+r14*8+rdi],rbp
+ sbb rax,QWORD[32+r14*8+rcx]
+ mov rbx,QWORD[48+r14*8+rsi]
+ mov rbp,QWORD[56+r14*8+rsi]
+ sbb rdx,QWORD[40+r14*8+rcx]
+ lea r14,[4+r14]
+ dec r15
+ jnz NEAR $L$sub4x
+
+ mov QWORD[r14*8+rdi],rax
+ mov rax,QWORD[32+r14*8+rsi]
+ sbb rbx,QWORD[16+r14*8+rcx]
+ mov QWORD[8+r14*8+rdi],rdx
+ sbb rbp,QWORD[24+r14*8+rcx]
+ mov QWORD[16+r14*8+rdi],rbx
+
+ sbb rax,0
+ mov QWORD[24+r14*8+rdi],rbp
+ pxor xmm0,xmm0
+DB 102,72,15,110,224
+ pcmpeqd xmm5,xmm5
+ pshufd xmm4,xmm4,0
+ mov r15,r9
+ pxor xmm5,xmm4
+ shr r15,2
+ xor eax,eax
+
+ jmp NEAR $L$copy4x
+ALIGN 16
+$L$copy4x:
+ movdqa xmm1,XMMWORD[rax*1+rsp]
+ movdqu xmm2,XMMWORD[rax*1+rdi]
+ pand xmm1,xmm4
+ pand xmm2,xmm5
+ movdqa xmm3,XMMWORD[16+rax*1+rsp]
+ movdqa XMMWORD[rax*1+rsp],xmm0
+ por xmm1,xmm2
+ movdqu xmm2,XMMWORD[16+rax*1+rdi]
+ movdqu XMMWORD[rax*1+rdi],xmm1
+ pand xmm3,xmm4
+ pand xmm2,xmm5
+ movdqa XMMWORD[16+rax*1+rsp],xmm0
+ por xmm3,xmm2
+ movdqu XMMWORD[16+rax*1+rdi],xmm3
+ lea rax,[32+rax]
+ dec r15
+ jnz NEAR $L$copy4x
+ mov rsi,QWORD[8+r9*8+rsp]
+
+ mov rax,1
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$mul4x_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_bn_mul4x_mont:
+EXTERN bn_sqrx8x_internal
+EXTERN bn_sqr8x_internal
+
+global bn_sqr8x_mont
+
+ALIGN 32
+bn_sqr8x_mont:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_bn_sqr8x_mont:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+ mov r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+ mov r9d,r9d
+ mov rax,rsp
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+$L$sqr8x_prologue:
+
+ mov r10d,r9d
+ shl r9d,3
+ shl r10,3+2
+ neg r9
+
+
+
+
+
+
+ lea r11,[((-64))+r9*2+rsp]
+ mov rbp,rsp
+ mov r8,QWORD[r8]
+ sub r11,rsi
+ and r11,4095
+ cmp r10,r11
+ jb NEAR $L$sqr8x_sp_alt
+ sub rbp,r11
+ lea rbp,[((-64))+r9*2+rbp]
+ jmp NEAR $L$sqr8x_sp_done
+
+ALIGN 32
+$L$sqr8x_sp_alt:
+ lea r10,[((4096-64))+r9*2]
+ lea rbp,[((-64))+r9*2+rbp]
+ sub r11,r10
+ mov r10,0
+ cmovc r11,r10
+ sub rbp,r11
+$L$sqr8x_sp_done:
+ and rbp,-64
+ mov r11,rsp
+ sub r11,rbp
+ and r11,-4096
+ lea rsp,[rbp*1+r11]
+ mov r10,QWORD[rsp]
+ cmp rsp,rbp
+ ja NEAR $L$sqr8x_page_walk
+ jmp NEAR $L$sqr8x_page_walk_done
+
+ALIGN 16
+$L$sqr8x_page_walk:
+ lea rsp,[((-4096))+rsp]
+ mov r10,QWORD[rsp]
+ cmp rsp,rbp
+ ja NEAR $L$sqr8x_page_walk
+$L$sqr8x_page_walk_done:
+
+ mov r10,r9
+ neg r9
+
+ mov QWORD[32+rsp],r8
+ mov QWORD[40+rsp],rax
+
+$L$sqr8x_body:
+
+DB 102,72,15,110,209
+ pxor xmm0,xmm0
+DB 102,72,15,110,207
+DB 102,73,15,110,218
+ test rdx,rdx
+ jz NEAR $L$sqr8x_nox
+
+ call bn_sqrx8x_internal
+
+
+
+
+ lea rbx,[rcx*1+r8]
+ mov r9,rcx
+ mov rdx,rcx
+DB 102,72,15,126,207
+ sar rcx,3+2
+ jmp NEAR $L$sqr8x_sub
+
+ALIGN 32
+$L$sqr8x_nox:
+ call bn_sqr8x_internal
+
+
+
+
+ lea rbx,[r9*1+rdi]
+ mov rcx,r9
+ mov rdx,r9
+DB 102,72,15,126,207
+ sar rcx,3+2
+ jmp NEAR $L$sqr8x_sub
+
+ALIGN 32
+$L$sqr8x_sub:
+ mov r12,QWORD[rbx]
+ mov r13,QWORD[8+rbx]
+ mov r14,QWORD[16+rbx]
+ mov r15,QWORD[24+rbx]
+ lea rbx,[32+rbx]
+ sbb r12,QWORD[rbp]
+ sbb r13,QWORD[8+rbp]
+ sbb r14,QWORD[16+rbp]
+ sbb r15,QWORD[24+rbp]
+ lea rbp,[32+rbp]
+ mov QWORD[rdi],r12
+ mov QWORD[8+rdi],r13
+ mov QWORD[16+rdi],r14
+ mov QWORD[24+rdi],r15
+ lea rdi,[32+rdi]
+ inc rcx
+ jnz NEAR $L$sqr8x_sub
+
+ sbb rax,0
+ lea rbx,[r9*1+rbx]
+ lea rdi,[r9*1+rdi]
+
+DB 102,72,15,110,200
+ pxor xmm0,xmm0
+ pshufd xmm1,xmm1,0
+ mov rsi,QWORD[40+rsp]
+
+ jmp NEAR $L$sqr8x_cond_copy
+
+ALIGN 32
+$L$sqr8x_cond_copy:
+ movdqa xmm2,XMMWORD[rbx]
+ movdqa xmm3,XMMWORD[16+rbx]
+ lea rbx,[32+rbx]
+ movdqu xmm4,XMMWORD[rdi]
+ movdqu xmm5,XMMWORD[16+rdi]
+ lea rdi,[32+rdi]
+ movdqa XMMWORD[(-32)+rbx],xmm0
+ movdqa XMMWORD[(-16)+rbx],xmm0
+ movdqa XMMWORD[(-32)+rdx*1+rbx],xmm0
+ movdqa XMMWORD[(-16)+rdx*1+rbx],xmm0
+ pcmpeqd xmm0,xmm1
+ pand xmm2,xmm1
+ pand xmm3,xmm1
+ pand xmm4,xmm0
+ pand xmm5,xmm0
+ pxor xmm0,xmm0
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqu XMMWORD[(-32)+rdi],xmm4
+ movdqu XMMWORD[(-16)+rdi],xmm5
+ add r9,32
+ jnz NEAR $L$sqr8x_cond_copy
+
+ mov rax,1
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$sqr8x_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_bn_sqr8x_mont:
+global bn_mulx4x_mont
+
+ALIGN 32
+bn_mulx4x_mont:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_bn_mulx4x_mont:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+ mov r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+ mov rax,rsp
+
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+$L$mulx4x_prologue:
+
+ shl r9d,3
+ xor r10,r10
+ sub r10,r9
+ mov r8,QWORD[r8]
+ lea rbp,[((-72))+r10*1+rsp]
+ and rbp,-128
+ mov r11,rsp
+ sub r11,rbp
+ and r11,-4096
+ lea rsp,[rbp*1+r11]
+ mov r10,QWORD[rsp]
+ cmp rsp,rbp
+ ja NEAR $L$mulx4x_page_walk
+ jmp NEAR $L$mulx4x_page_walk_done
+
+ALIGN 16
+$L$mulx4x_page_walk:
+ lea rsp,[((-4096))+rsp]
+ mov r10,QWORD[rsp]
+ cmp rsp,rbp
+ ja NEAR $L$mulx4x_page_walk
+$L$mulx4x_page_walk_done:
+
+ lea r10,[r9*1+rdx]
+
+
+
+
+
+
+
+
+
+
+
+
+ mov QWORD[rsp],r9
+ shr r9,5
+ mov QWORD[16+rsp],r10
+ sub r9,1
+ mov QWORD[24+rsp],r8
+ mov QWORD[32+rsp],rdi
+ mov QWORD[40+rsp],rax
+
+ mov QWORD[48+rsp],r9
+ jmp NEAR $L$mulx4x_body
+
+ALIGN 32
+$L$mulx4x_body:
+ lea rdi,[8+rdx]
+ mov rdx,QWORD[rdx]
+ lea rbx,[((64+32))+rsp]
+ mov r9,rdx
+
+ mulx rax,r8,QWORD[rsi]
+ mulx r14,r11,QWORD[8+rsi]
+ add r11,rax
+ mov QWORD[8+rsp],rdi
+ mulx r13,r12,QWORD[16+rsi]
+ adc r12,r14
+ adc r13,0
+
+ mov rdi,r8
+ imul r8,QWORD[24+rsp]
+ xor rbp,rbp
+
+ mulx r14,rax,QWORD[24+rsi]
+ mov rdx,r8
+ lea rsi,[32+rsi]
+ adcx r13,rax
+ adcx r14,rbp
+
+ mulx r10,rax,QWORD[rcx]
+ adcx rdi,rax
+ adox r10,r11
+ mulx r11,rax,QWORD[8+rcx]
+ adcx r10,rax
+ adox r11,r12
+ DB 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
+ mov rdi,QWORD[48+rsp]
+ mov QWORD[((-32))+rbx],r10
+ adcx r11,rax
+ adox r12,r13
+ mulx r15,rax,QWORD[24+rcx]
+ mov rdx,r9
+ mov QWORD[((-24))+rbx],r11
+ adcx r12,rax
+ adox r15,rbp
+ lea rcx,[32+rcx]
+ mov QWORD[((-16))+rbx],r12
+
+ jmp NEAR $L$mulx4x_1st
+
+ALIGN 32
+$L$mulx4x_1st:
+ adcx r15,rbp
+ mulx rax,r10,QWORD[rsi]
+ adcx r10,r14
+ mulx r14,r11,QWORD[8+rsi]
+ adcx r11,rax
+ mulx rax,r12,QWORD[16+rsi]
+ adcx r12,r14
+ mulx r14,r13,QWORD[24+rsi]
+ DB 0x67,0x67
+ mov rdx,r8
+ adcx r13,rax
+ adcx r14,rbp
+ lea rsi,[32+rsi]
+ lea rbx,[32+rbx]
+
+ adox r10,r15
+ mulx r15,rax,QWORD[rcx]
+ adcx r10,rax
+ adox r11,r15
+ mulx r15,rax,QWORD[8+rcx]
+ adcx r11,rax
+ adox r12,r15
+ mulx r15,rax,QWORD[16+rcx]
+ mov QWORD[((-40))+rbx],r10
+ adcx r12,rax
+ mov QWORD[((-32))+rbx],r11
+ adox r13,r15
+ mulx r15,rax,QWORD[24+rcx]
+ mov rdx,r9
+ mov QWORD[((-24))+rbx],r12
+ adcx r13,rax
+ adox r15,rbp
+ lea rcx,[32+rcx]
+ mov QWORD[((-16))+rbx],r13
+
+ dec rdi
+ jnz NEAR $L$mulx4x_1st
+
+ mov rax,QWORD[rsp]
+ mov rdi,QWORD[8+rsp]
+ adc r15,rbp
+ add r14,r15
+ sbb r15,r15
+ mov QWORD[((-8))+rbx],r14
+ jmp NEAR $L$mulx4x_outer
+
+ALIGN 32
+$L$mulx4x_outer:
+ mov rdx,QWORD[rdi]
+ lea rdi,[8+rdi]
+ sub rsi,rax
+ mov QWORD[rbx],r15
+ lea rbx,[((64+32))+rsp]
+ sub rcx,rax
+
+ mulx r11,r8,QWORD[rsi]
+ xor ebp,ebp
+ mov r9,rdx
+ mulx r12,r14,QWORD[8+rsi]
+ adox r8,QWORD[((-32))+rbx]
+ adcx r11,r14
+ mulx r13,r15,QWORD[16+rsi]
+ adox r11,QWORD[((-24))+rbx]
+ adcx r12,r15
+ adox r12,QWORD[((-16))+rbx]
+ adcx r13,rbp
+ adox r13,rbp
+
+ mov QWORD[8+rsp],rdi
+ mov r15,r8
+ imul r8,QWORD[24+rsp]
+ xor ebp,ebp
+
+ mulx r14,rax,QWORD[24+rsi]
+ mov rdx,r8
+ adcx r13,rax
+ adox r13,QWORD[((-8))+rbx]
+ adcx r14,rbp
+ lea rsi,[32+rsi]
+ adox r14,rbp
+
+ mulx r10,rax,QWORD[rcx]
+ adcx r15,rax
+ adox r10,r11
+ mulx r11,rax,QWORD[8+rcx]
+ adcx r10,rax
+ adox r11,r12
+ mulx r12,rax,QWORD[16+rcx]
+ mov QWORD[((-32))+rbx],r10
+ adcx r11,rax
+ adox r12,r13
+ mulx r15,rax,QWORD[24+rcx]
+ mov rdx,r9
+ mov QWORD[((-24))+rbx],r11
+ lea rcx,[32+rcx]
+ adcx r12,rax
+ adox r15,rbp
+ mov rdi,QWORD[48+rsp]
+ mov QWORD[((-16))+rbx],r12
+
+ jmp NEAR $L$mulx4x_inner
+
+ALIGN 32
+$L$mulx4x_inner:
+ mulx rax,r10,QWORD[rsi]
+ adcx r15,rbp
+ adox r10,r14
+ mulx r14,r11,QWORD[8+rsi]
+ adcx r10,QWORD[rbx]
+ adox r11,rax
+ mulx rax,r12,QWORD[16+rsi]
+ adcx r11,QWORD[8+rbx]
+ adox r12,r14
+ mulx r14,r13,QWORD[24+rsi]
+ mov rdx,r8
+ adcx r12,QWORD[16+rbx]
+ adox r13,rax
+ adcx r13,QWORD[24+rbx]
+ adox r14,rbp
+ lea rsi,[32+rsi]
+ lea rbx,[32+rbx]
+ adcx r14,rbp
+
+ adox r10,r15
+ mulx r15,rax,QWORD[rcx]
+ adcx r10,rax
+ adox r11,r15
+ mulx r15,rax,QWORD[8+rcx]
+ adcx r11,rax
+ adox r12,r15
+ mulx r15,rax,QWORD[16+rcx]
+ mov QWORD[((-40))+rbx],r10
+ adcx r12,rax
+ adox r13,r15
+ mulx r15,rax,QWORD[24+rcx]
+ mov rdx,r9
+ mov QWORD[((-32))+rbx],r11
+ mov QWORD[((-24))+rbx],r12
+ adcx r13,rax
+ adox r15,rbp
+ lea rcx,[32+rcx]
+ mov QWORD[((-16))+rbx],r13
+
+ dec rdi
+ jnz NEAR $L$mulx4x_inner
+
+ mov rax,QWORD[rsp]
+ mov rdi,QWORD[8+rsp]
+ adc r15,rbp
+ sub rbp,QWORD[rbx]
+ adc r14,r15
+ sbb r15,r15
+ mov QWORD[((-8))+rbx],r14
+
+ cmp rdi,QWORD[16+rsp]
+ jne NEAR $L$mulx4x_outer
+
+ lea rbx,[64+rsp]
+ sub rcx,rax
+ neg r15
+ mov rdx,rax
+ shr rax,3+2
+ mov rdi,QWORD[32+rsp]
+ jmp NEAR $L$mulx4x_sub
+
+ALIGN 32
+$L$mulx4x_sub:
+ mov r11,QWORD[rbx]
+ mov r12,QWORD[8+rbx]
+ mov r13,QWORD[16+rbx]
+ mov r14,QWORD[24+rbx]
+ lea rbx,[32+rbx]
+ sbb r11,QWORD[rcx]
+ sbb r12,QWORD[8+rcx]
+ sbb r13,QWORD[16+rcx]
+ sbb r14,QWORD[24+rcx]
+ lea rcx,[32+rcx]
+ mov QWORD[rdi],r11
+ mov QWORD[8+rdi],r12
+ mov QWORD[16+rdi],r13
+ mov QWORD[24+rdi],r14
+ lea rdi,[32+rdi]
+ dec rax
+ jnz NEAR $L$mulx4x_sub
+
+ sbb r15,0
+ lea rbx,[64+rsp]
+ sub rdi,rdx
+
+DB 102,73,15,110,207
+ pxor xmm0,xmm0
+ pshufd xmm1,xmm1,0
+ mov rsi,QWORD[40+rsp]
+
+ jmp NEAR $L$mulx4x_cond_copy
+
+ALIGN 32
+$L$mulx4x_cond_copy:
+ movdqa xmm2,XMMWORD[rbx]
+ movdqa xmm3,XMMWORD[16+rbx]
+ lea rbx,[32+rbx]
+ movdqu xmm4,XMMWORD[rdi]
+ movdqu xmm5,XMMWORD[16+rdi]
+ lea rdi,[32+rdi]
+ movdqa XMMWORD[(-32)+rbx],xmm0
+ movdqa XMMWORD[(-16)+rbx],xmm0
+ pcmpeqd xmm0,xmm1
+ pand xmm2,xmm1
+ pand xmm3,xmm1
+ pand xmm4,xmm0
+ pand xmm5,xmm0
+ pxor xmm0,xmm0
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqu XMMWORD[(-32)+rdi],xmm4
+ movdqu XMMWORD[(-16)+rdi],xmm5
+ sub rdx,32
+ jnz NEAR $L$mulx4x_cond_copy
+
+ mov QWORD[rbx],rdx
+
+ mov rax,1
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$mulx4x_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_bn_mulx4x_mont:
+ DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
+ DB 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
+ DB 54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83
+ DB 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
+ DB 115,108,46,111,114,103,62,0
+ALIGN 16
+EXTERN __imp_RtlVirtualUnwind
+
+ALIGN 16
+mul_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ mov rsi,QWORD[8+r9]
+ mov r11,QWORD[56+r9]
+
+ mov r10d,DWORD[r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$common_seh_tail
+
+ mov rax,QWORD[152+r8]
+
+ mov r10d,DWORD[4+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$common_seh_tail
+
+ mov r10,QWORD[192+r8]
+ mov rax,QWORD[8+r10*8+rax]
+
+ jmp NEAR $L$common_pop_regs
+
+
+
+ALIGN 16
+sqr_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ mov rsi,QWORD[8+r9]
+ mov r11,QWORD[56+r9]
+
+ mov r10d,DWORD[r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$common_seh_tail
+
+ mov r10d,DWORD[4+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$common_pop_regs
+
+ mov rax,QWORD[152+r8]
+
+ mov r10d,DWORD[8+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$common_seh_tail
+
+ mov rax,QWORD[40+rax]
+
+$L$common_pop_regs:
+ mov rbx,QWORD[((-8))+rax]
+ mov rbp,QWORD[((-16))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov r15,QWORD[((-48))+rax]
+ mov QWORD[144+r8],rbx
+ mov QWORD[160+r8],rbp
+ mov QWORD[216+r8],r12
+ mov QWORD[224+r8],r13
+ mov QWORD[232+r8],r14
+ mov QWORD[240+r8],r15
+
+$L$common_seh_tail:
+ mov rdi,QWORD[8+rax]
+ mov rsi,QWORD[16+rax]
+ mov QWORD[152+r8],rax
+ mov QWORD[168+r8],rsi
+ mov QWORD[176+r8],rdi
+
+ mov rdi,QWORD[40+r9]
+ mov rsi,r8
+ mov ecx,154
+ DD 0xa548f3fc
+
+ mov rsi,r9
+ xor rcx,rcx
+ mov rdx,QWORD[8+rsi]
+ mov r8,QWORD[rsi]
+ mov r9,QWORD[16+rsi]
+ mov r10,QWORD[40+rsi]
+ lea r11,[56+rsi]
+ lea r12,[24+rsi]
+ mov QWORD[32+rsp],r10
+ mov QWORD[40+rsp],r11
+ mov QWORD[48+rsp],r12
+ mov QWORD[56+rsp],rcx
+ call QWORD[__imp_RtlVirtualUnwind]
+
+ mov eax,1
+ add rsp,64
+ popfq
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ ret
+
+
+section .pdata rdata align=4
+ALIGN 4
+ DD $L$SEH_begin_bn_mul_mont_nohw wrt ..imagebase
+ DD $L$SEH_end_bn_mul_mont_nohw wrt ..imagebase
+ DD $L$SEH_info_bn_mul_mont_nohw wrt ..imagebase
+
+ DD $L$SEH_begin_bn_mul4x_mont wrt ..imagebase
+ DD $L$SEH_end_bn_mul4x_mont wrt ..imagebase
+ DD $L$SEH_info_bn_mul4x_mont wrt ..imagebase
+
+ DD $L$SEH_begin_bn_sqr8x_mont wrt ..imagebase
+ DD $L$SEH_end_bn_sqr8x_mont wrt ..imagebase
+ DD $L$SEH_info_bn_sqr8x_mont wrt ..imagebase
+ DD $L$SEH_begin_bn_mulx4x_mont wrt ..imagebase
+ DD $L$SEH_end_bn_mulx4x_mont wrt ..imagebase
+ DD $L$SEH_info_bn_mulx4x_mont wrt ..imagebase
+section .xdata rdata align=8
+ALIGN 8
+$L$SEH_info_bn_mul_mont_nohw:
+ DB 9,0,0,0
+ DD mul_handler wrt ..imagebase
+ DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
+$L$SEH_info_bn_mul4x_mont:
+ DB 9,0,0,0
+ DD mul_handler wrt ..imagebase
+ DD $L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
+$L$SEH_info_bn_sqr8x_mont:
+ DB 9,0,0,0
+ DD sqr_handler wrt ..imagebase
+ DD $L$sqr8x_prologue wrt ..imagebase,$L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase
+ALIGN 8
+$L$SEH_info_bn_mulx4x_mont:
+ DB 9,0,0,0
+ DD sqr_handler wrt ..imagebase
+ DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase
+ALIGN 8
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/bcm/x86_64-mont5-apple.S b/gen/bcm/x86_64-mont5-apple.S
new file mode 100644
index 0000000..bd63d91
--- /dev/null
+++ b/gen/bcm/x86_64-mont5-apple.S
@@ -0,0 +1,3624 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text
+
+
+
+.globl _bn_mul_mont_gather5
+.private_extern _bn_mul_mont_gather5
+
+.p2align 6
+_bn_mul_mont_gather5:
+
+_CET_ENDBR
+ movl %r9d,%r9d
+ movq %rsp,%rax
+
+ testl $7,%r9d
+ jnz L$mul_enter
+ leaq _OPENSSL_ia32cap_P(%rip),%r11
+ movl 8(%r11),%r11d
+ jmp L$mul4x_enter
+
+.p2align 4
+L$mul_enter:
+ movd 8(%rsp),%xmm5
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+
+ negq %r9
+ movq %rsp,%r11
+ leaq -280(%rsp,%r9,8),%r10
+ negq %r9
+ andq $-1024,%r10
+
+
+
+
+
+
+
+
+
+ subq %r10,%r11
+ andq $-4096,%r11
+ leaq (%r10,%r11,1),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja L$mul_page_walk
+ jmp L$mul_page_walk_done
+
+L$mul_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja L$mul_page_walk
+L$mul_page_walk_done:
+
+ leaq L$inc(%rip),%r10
+ movq %rax,8(%rsp,%r9,8)
+
+L$mul_body:
+
+ leaq 128(%rdx),%r12
+ movdqa 0(%r10),%xmm0
+ movdqa 16(%r10),%xmm1
+ leaq 24-112(%rsp,%r9,8),%r10
+ andq $-16,%r10
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+.byte 0x67
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
+.byte 0x67
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+ pand 64(%r12),%xmm0
+
+ pand 80(%r12),%xmm1
+ pand 96(%r12),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%r12),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%r12),%xmm4
+ movdqa -112(%r12),%xmm5
+ movdqa -96(%r12),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%r12),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%r12),%xmm4
+ movdqa -48(%r12),%xmm5
+ movdqa -32(%r12),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%r12),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%r12),%xmm4
+ movdqa 16(%r12),%xmm5
+ movdqa 32(%r12),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%r12),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ por %xmm1,%xmm0
+
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ leaq 256(%r12),%r12
+.byte 102,72,15,126,195
+
+ movq (%r8),%r8
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp L$1st_enter
+
+.p2align 4
+L$1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+L$1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne L$1st
+
+
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r9,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp L$outer
+.p2align 4
+L$outer:
+ leaq 24+128(%rsp,%r9,8),%rdx
+ andq $-16,%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r12),%xmm0
+ movdqa -112(%r12),%xmm1
+ movdqa -96(%r12),%xmm2
+ movdqa -80(%r12),%xmm3
+ pand -128(%rdx),%xmm0
+ pand -112(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r12),%xmm0
+ movdqa -48(%r12),%xmm1
+ movdqa -32(%r12),%xmm2
+ movdqa -16(%r12),%xmm3
+ pand -64(%rdx),%xmm0
+ pand -48(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r12),%xmm0
+ movdqa 16(%r12),%xmm1
+ movdqa 32(%r12),%xmm2
+ movdqa 48(%r12),%xmm3
+ pand 0(%rdx),%xmm0
+ pand 16(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r12),%xmm0
+ movdqa 80(%r12),%xmm1
+ movdqa 96(%r12),%xmm2
+ movdqa 112(%r12),%xmm3
+ pand 64(%rdx),%xmm0
+ pand 80(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%r12),%r12
+
+ movq (%rsi),%rax
+.byte 102,72,15,126,195
+
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp L$inner_enter
+
+.p2align 4
+L$inner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+L$inner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne L$inner
+
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r9,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r9,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jb L$outer
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ leaq (%rsp),%rsi
+ movq %r9,%r15
+ jmp L$sub
+.p2align 4
+L$sub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsi,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz L$sub
+
+ sbbq $0,%rax
+ movq $-1,%rbx
+ xorq %rax,%rbx
+ xorq %r14,%r14
+ movq %r9,%r15
+
+L$copy:
+ movq (%rdi,%r14,8),%rcx
+ movq (%rsp,%r14,8),%rdx
+ andq %rbx,%rcx
+ andq %rax,%rdx
+ movq %r14,(%rsp,%r14,8)
+ orq %rcx,%rdx
+ movq %rdx,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz L$copy
+
+ movq 8(%rsp,%r9,8),%rsi
+
+ movq $1,%rax
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
+L$mul_epilogue:
+ ret
+
+
+
+.p2align 5
+bn_mul4x_mont_gather5:
+
+.byte 0x67
+ movq %rsp,%rax
+
+L$mul4x_enter:
+ andl $0x80108,%r11d
+ cmpl $0x80108,%r11d
+ je L$mulx4x_enter
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+L$mul4x_prologue:
+
+.byte 0x67
+ shll $3,%r9d
+ leaq (%r9,%r9,2),%r10
+ negq %r9
+
+
+
+
+
+
+
+
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
+ subq %rdi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb L$mul4xsp_alt
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
+ jmp L$mul4xsp_done
+
+.p2align 5
+L$mul4xsp_alt:
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rbp,%r9,2),%rbp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rbp
+L$mul4xsp_done:
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja L$mul4x_page_walk
+ jmp L$mul4x_page_walk_done
+
+L$mul4x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja L$mul4x_page_walk
+L$mul4x_page_walk_done:
+
+ negq %r9
+
+ movq %rax,40(%rsp)
+
+L$mul4x_body:
+
+ call mul4x_internal
+
+ movq 40(%rsp),%rsi
+
+ movq $1,%rax
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
+L$mul4x_epilogue:
+ ret
+
+
+
+
+.p2align 5
+mul4x_internal:
+
+ shlq $5,%r9
+ movd 8(%rax),%xmm5
+ leaq L$inc(%rip),%rax
+ leaq 128(%rdx,%r9,1),%r13
+ shrq $5,%r9
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 88-112(%rsp,%r9,1),%r10
+ leaq 128(%rdx),%r12
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+.byte 0x67,0x67
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+.byte 0x67
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
+.byte 0x67
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+ pand 64(%r12),%xmm0
+
+ pand 80(%r12),%xmm1
+ pand 96(%r12),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%r12),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%r12),%xmm4
+ movdqa -112(%r12),%xmm5
+ movdqa -96(%r12),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%r12),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%r12),%xmm4
+ movdqa -48(%r12),%xmm5
+ movdqa -32(%r12),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%r12),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%r12),%xmm4
+ movdqa 16(%r12),%xmm5
+ movdqa 32(%r12),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%r12),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ por %xmm1,%xmm0
+
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ leaq 256(%r12),%r12
+.byte 102,72,15,126,195
+
+ movq %r13,16+8(%rsp)
+ movq %rdi,56+8(%rsp)
+
+ movq (%r8),%r8
+ movq (%rsi),%rax
+ leaq (%rsi,%r9,1),%rsi
+ negq %r9
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ leaq 64+8(%rsp),%r14
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 32(%r9),%r15
+ leaq 32(%rcx),%rcx
+ adcq $0,%rdx
+ movq %rdi,(%r14)
+ movq %rdx,%r13
+ jmp L$1st4x
+
+.p2align 5
+L$1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx),%rax
+ leaq 32(%r14),%r14
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%r14)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 0(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 32(%rcx),%rcx
+ adcq $0,%rdx
+ movq %rdi,(%r14)
+ movq %rdx,%r13
+
+ addq $32,%r15
+ jnz L$1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx),%rax
+ leaq 32(%r14),%r14
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%r14)
+ movq %rdx,%r13
+
+ leaq (%rcx,%r9,1),%rcx
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%r14)
+
+ jmp L$outer4x
+
+.p2align 5
+L$outer4x:
+ leaq 16+128(%r14),%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r12),%xmm0
+ movdqa -112(%r12),%xmm1
+ movdqa -96(%r12),%xmm2
+ movdqa -80(%r12),%xmm3
+ pand -128(%rdx),%xmm0
+ pand -112(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r12),%xmm0
+ movdqa -48(%r12),%xmm1
+ movdqa -32(%r12),%xmm2
+ movdqa -16(%r12),%xmm3
+ pand -64(%rdx),%xmm0
+ pand -48(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r12),%xmm0
+ movdqa 16(%r12),%xmm1
+ movdqa 32(%r12),%xmm2
+ movdqa 48(%r12),%xmm3
+ pand 0(%rdx),%xmm0
+ pand 16(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r12),%xmm0
+ movdqa 80(%r12),%xmm1
+ movdqa 96(%r12),%xmm2
+ movdqa 112(%r12),%xmm3
+ pand 64(%rdx),%xmm0
+ pand 80(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%r12),%r12
+.byte 102,72,15,126,195
+
+ movq (%r14,%r9,1),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+ movq %rdi,(%r14)
+
+ leaq (%r14,%r9,1),%r14
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%r14),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 32(%r9),%r15
+ leaq 32(%rcx),%rcx
+ adcq $0,%rdx
+ movq %rdx,%r13
+ jmp L$inner4x
+
+.p2align 5
+L$inner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx),%rax
+ adcq $0,%rdx
+ addq 16(%r14),%r10
+ leaq 32(%r14),%r14
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-32(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx),%rax
+ adcq $0,%rdx
+ addq -8(%r14),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-24(%r14)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 0(%rcx),%rax
+ adcq $0,%rdx
+ addq (%r14),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-16(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%r14),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 32(%rcx),%rcx
+ adcq $0,%rdx
+ movq %r13,-8(%r14)
+ movq %rdx,%r13
+
+ addq $32,%r15
+ jnz L$inner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx),%rax
+ adcq $0,%rdx
+ addq 16(%r14),%r10
+ leaq 32(%r14),%r14
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-32(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq %rbp,%rax
+ movq -8(%rcx),%rbp
+ adcq $0,%rdx
+ addq -8(%r14),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-24(%r14)
+ movq %rdx,%r13
+
+ movq %rdi,-16(%r14)
+ leaq (%rcx,%r9,1),%rcx
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%r14),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%r14)
+
+ cmpq 16+8(%rsp),%r12
+ jb L$outer4x
+ xorq %rax,%rax
+ subq %r13,%rbp
+ adcq %r15,%r15
+ orq %r15,%rdi
+ subq %rdi,%rax
+ leaq (%r14,%r9,1),%rbx
+ movq (%rcx),%r12
+ leaq (%rcx),%rbp
+ movq %r9,%rcx
+ sarq $3+2,%rcx
+ movq 56+8(%rsp),%rdi
+ decq %r12
+ xorq %r10,%r10
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp L$sqr4x_sub_entry
+
+
+.globl _bn_power5
+.private_extern _bn_power5
+
+.p2align 5
+_bn_power5:
+
+_CET_ENDBR
+ movq %rsp,%rax
+
+ leaq _OPENSSL_ia32cap_P(%rip),%r11
+ movl 8(%r11),%r11d
+ andl $0x80108,%r11d
+ cmpl $0x80108,%r11d
+ je L$powerx5_enter
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+L$power5_prologue:
+
+ shll $3,%r9d
+ leal (%r9,%r9,2),%r10d
+ negq %r9
+ movq (%r8),%r8
+
+
+
+
+
+
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
+ subq %rdi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb L$pwr_sp_alt
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
+ jmp L$pwr_sp_done
+
+.p2align 5
+L$pwr_sp_alt:
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rbp,%r9,2),%rbp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rbp
+L$pwr_sp_done:
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja L$pwr_page_walk
+ jmp L$pwr_page_walk_done
+
+L$pwr_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja L$pwr_page_walk
+L$pwr_page_walk_done:
+
+ movq %r9,%r10
+ negq %r9
+
+
+
+
+
+
+
+
+
+
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+
+L$power5_body:
+.byte 102,72,15,110,207
+.byte 102,72,15,110,209
+.byte 102,73,15,110,218
+.byte 102,72,15,110,226
+
+ call __bn_sqr8x_internal
+ call __bn_post4x_internal
+ call __bn_sqr8x_internal
+ call __bn_post4x_internal
+ call __bn_sqr8x_internal
+ call __bn_post4x_internal
+ call __bn_sqr8x_internal
+ call __bn_post4x_internal
+ call __bn_sqr8x_internal
+ call __bn_post4x_internal
+
+.byte 102,72,15,126,209
+.byte 102,72,15,126,226
+ movq %rsi,%rdi
+ movq 40(%rsp),%rax
+ leaq 32(%rsp),%r8
+
+ call mul4x_internal
+
+ movq 40(%rsp),%rsi
+
+ movq $1,%rax
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
+L$power5_epilogue:
+ ret
+
+
+
+.globl _bn_sqr8x_internal
+.private_extern _bn_sqr8x_internal
+.private_extern _bn_sqr8x_internal
+
+.p2align 5
+_bn_sqr8x_internal:
+__bn_sqr8x_internal:
+
+_CET_ENDBR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ leaq 32(%r10),%rbp
+ leaq (%rsi,%r9,1),%rsi
+
+ movq %r9,%rcx
+
+
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 48+8(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ movq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ movq %r10,-24(%rdi,%rbp,1)
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq $0,%rdx
+ movq %r11,-16(%rdi,%rbp,1)
+ movq %rdx,%r10
+
+
+ movq -8(%rsi,%rbp,1),%rbx
+ mulq %r15
+ movq %rax,%r12
+ movq %rbx,%rax
+ movq %rdx,%r13
+
+ leaq (%rbp),%rcx
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+ jmp L$sqr4x_1st
+
+.p2align 5
+L$sqr4x_1st:
+ movq (%rsi,%rcx,1),%rbx
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ movq 8(%rsi,%rcx,1),%rbx
+ movq %rdx,%r10
+ adcq $0,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+
+
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ movq %r11,(%rdi,%rcx,1)
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq 16(%rsi,%rcx,1),%rbx
+ movq %rdx,%r11
+ adcq $0,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ movq %r10,8(%rdi,%rcx,1)
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ movq 24(%rsi,%rcx,1),%rbx
+ movq %rdx,%r10
+ adcq $0,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+
+
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ movq %r11,16(%rdi,%rcx,1)
+ movq %rdx,%r13
+ adcq $0,%r13
+ leaq 32(%rcx),%rcx
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne L$sqr4x_1st
+
+ mulq %r15
+ addq %rax,%r13
+ leaq 16(%rbp),%rbp
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+
+ movq %r13,(%rdi)
+ movq %rdx,%r12
+ movq %rdx,8(%rdi)
+ jmp L$sqr4x_outer
+
+.p2align 5
+L$sqr4x_outer:
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 48+8(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ movq -24(%rdi,%rbp,1),%r10
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq $0,%rdx
+ movq %r10,-24(%rdi,%rbp,1)
+ movq %rdx,%r11
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq $0,%rdx
+ addq -16(%rdi,%rbp,1),%r11
+ movq %rdx,%r10
+ adcq $0,%r10
+ movq %r11,-16(%rdi,%rbp,1)
+
+ xorq %r12,%r12
+
+ movq -8(%rsi,%rbp,1),%rbx
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq $0,%rdx
+ addq -8(%rdi,%rbp,1),%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq $0,%rdx
+ addq %r12,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ movq %r10,-8(%rdi,%rbp,1)
+
+ leaq (%rbp),%rcx
+ jmp L$sqr4x_inner
+
+.p2align 5
+L$sqr4x_inner:
+ movq (%rsi,%rcx,1),%rbx
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ movq %rdx,%r12
+ adcq $0,%r12
+ addq (%rdi,%rcx,1),%r13
+ adcq $0,%r12
+
+.byte 0x67
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ movq 8(%rsi,%rcx,1),%rbx
+ movq %rdx,%r10
+ adcq $0,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+
+ mulq %r15
+ addq %rax,%r12
+ movq %r11,(%rdi,%rcx,1)
+ movq %rbx,%rax
+ movq %rdx,%r13
+ adcq $0,%r13
+ addq 8(%rdi,%rcx,1),%r12
+ leaq 16(%rcx),%rcx
+ adcq $0,%r13
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq $0,%rdx
+ addq %r12,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne L$sqr4x_inner
+
+.byte 0x67
+ mulq %r15
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+
+ movq %r13,(%rdi)
+ movq %rdx,%r12
+ movq %rdx,8(%rdi)
+
+ addq $16,%rbp
+ jnz L$sqr4x_outer
+
+
+ movq -32(%rsi),%r14
+ leaq 48+8(%rsp,%r9,2),%rdi
+ movq -24(%rsi),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ movq %r10,-24(%rdi)
+ movq %rdx,%r10
+ adcq $0,%r10
+ addq %r13,%r11
+ movq -8(%rsi),%rbx
+ adcq $0,%r10
+
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ movq %r11,-16(%rdi)
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ movq %r10,-8(%rdi)
+
+ mulq %r15
+ addq %rax,%r13
+ movq -16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+
+ movq %r13,(%rdi)
+ movq %rdx,%r12
+ movq %rdx,8(%rdi)
+
+ mulq %rbx
+ addq $16,%rbp
+ xorq %r14,%r14
+ subq %r9,%rbp
+ xorq %r15,%r15
+
+ addq %r12,%rax
+ adcq $0,%rdx
+ movq %rax,8(%rdi)
+ movq %rdx,16(%rdi)
+ movq %r15,24(%rdi)
+
+ movq -16(%rsi,%rbp,1),%rax
+ leaq 48+8(%rsp),%rdi
+ xorq %r10,%r10
+ movq 8(%rdi),%r11
+
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq 16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 24(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,8(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 32(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 40(%rdi),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,16(%rdi)
+ adcq %rdx,%r8
+ leaq 16(%rbp),%rbp
+ movq %r8,24(%rdi)
+ sbbq %r15,%r15
+ leaq 64(%rdi),%rdi
+ jmp L$sqr4x_shift_n_add
+
+.p2align 5
+L$sqr4x_shift_n_add:
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,-32(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 0(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 8(%rdi),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,-16(%rdi)
+ adcq %rdx,%r8
+
+ leaq (%r14,%r10,2),%r12
+ movq %r8,-8(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq 16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 24(%rdi),%r11
+ adcq %rax,%r12
+ movq 8(%rsi,%rbp,1),%rax
+ movq %r12,0(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,8(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 32(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 40(%rdi),%r11
+ adcq %rax,%rbx
+ movq 16(%rsi,%rbp,1),%rax
+ movq %rbx,16(%rdi)
+ adcq %rdx,%r8
+ movq %r8,24(%rdi)
+ sbbq %r15,%r15
+ leaq 64(%rdi),%rdi
+ addq $32,%rbp
+ jnz L$sqr4x_shift_n_add
+
+ leaq (%r14,%r10,2),%r12
+.byte 0x67
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi),%rax
+ movq %r12,-32(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ mulq %rax
+ negq %r15
+ adcq %rax,%rbx
+ adcq %rdx,%r8
+ movq %rbx,-16(%rdi)
+ movq %r8,-8(%rdi)
+.byte 102,72,15,126,213
+__bn_sqr8x_reduction:
+ xorq %rax,%rax
+ leaq (%r9,%rbp,1),%rcx
+ leaq 48+8(%rsp,%r9,2),%rdx
+ movq %rcx,0+8(%rsp)
+ leaq 48+8(%rsp,%r9,1),%rdi
+ movq %rdx,8+8(%rsp)
+ negq %r9
+ jmp L$8x_reduction_loop
+
+.p2align 5
+L$8x_reduction_loop:
+ leaq (%rdi,%r9,1),%rdi
+.byte 0x66
+ movq 0(%rdi),%rbx
+ movq 8(%rdi),%r9
+ movq 16(%rdi),%r10
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r12
+ movq 40(%rdi),%r13
+ movq 48(%rdi),%r14
+ movq 56(%rdi),%r15
+ movq %rax,(%rdx)
+ leaq 64(%rdi),%rdi
+
+.byte 0x67
+ movq %rbx,%r8
+ imulq 32+8(%rsp),%rbx
+ movq 0(%rbp),%rax
+ movl $8,%ecx
+ jmp L$8x_reduce
+
+.p2align 5
+L$8x_reduce:
+ mulq %rbx
+ movq 8(%rbp),%rax
+ negq %r8
+ movq %rdx,%r8
+ adcq $0,%r8
+
+ mulq %rbx
+ addq %rax,%r9
+ movq 16(%rbp),%rax
+ adcq $0,%rdx
+ addq %r9,%r8
+ movq %rbx,48-8+8(%rsp,%rcx,8)
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 24(%rbp),%rax
+ adcq $0,%rdx
+ addq %r10,%r9
+ movq 32+8(%rsp),%rsi
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 32(%rbp),%rax
+ adcq $0,%rdx
+ imulq %r8,%rsi
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %rbx
+ addq %rax,%r12
+ movq 40(%rbp),%rax
+ adcq $0,%rdx
+ addq %r12,%r11
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r13
+ movq 48(%rbp),%rax
+ adcq $0,%rdx
+ addq %r13,%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ addq %rax,%r14
+ movq 56(%rbp),%rax
+ adcq $0,%rdx
+ addq %r14,%r13
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ mulq %rbx
+ movq %rsi,%rbx
+ addq %rax,%r15
+ movq 0(%rbp),%rax
+ adcq $0,%rdx
+ addq %r15,%r14
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ decl %ecx
+ jnz L$8x_reduce
+
+ leaq 64(%rbp),%rbp
+ xorq %rax,%rax
+ movq 8+8(%rsp),%rdx
+ cmpq 0+8(%rsp),%rbp
+ jae L$8x_no_tail
+
+.byte 0x66
+ addq 0(%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ sbbq %rsi,%rsi
+
+ movq 48+56+8(%rsp),%rbx
+ movl $8,%ecx
+ movq 0(%rbp),%rax
+ jmp L$8x_tail
+
+.p2align 5
+L$8x_tail:
+ mulq %rbx
+ addq %rax,%r8
+ movq 8(%rbp),%rax
+ movq %r8,(%rdi)
+ movq %rdx,%r8
+ adcq $0,%r8
+
+ mulq %rbx
+ addq %rax,%r9
+ movq 16(%rbp),%rax
+ adcq $0,%rdx
+ addq %r9,%r8
+ leaq 8(%rdi),%rdi
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 24(%rbp),%rax
+ adcq $0,%rdx
+ addq %r10,%r9
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 32(%rbp),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %rbx
+ addq %rax,%r12
+ movq 40(%rbp),%rax
+ adcq $0,%rdx
+ addq %r12,%r11
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r13
+ movq 48(%rbp),%rax
+ adcq $0,%rdx
+ addq %r13,%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ addq %rax,%r14
+ movq 56(%rbp),%rax
+ adcq $0,%rdx
+ addq %r14,%r13
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ mulq %rbx
+ movq 48-16+8(%rsp,%rcx,8),%rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r15,%r14
+ movq 0(%rbp),%rax
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ decl %ecx
+ jnz L$8x_tail
+
+ leaq 64(%rbp),%rbp
+ movq 8+8(%rsp),%rdx
+ cmpq 0+8(%rsp),%rbp
+ jae L$8x_tail_done
+
+ movq 48+56+8(%rsp),%rbx
+ negq %rsi
+ movq 0(%rbp),%rax
+ adcq 0(%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ sbbq %rsi,%rsi
+
+ movl $8,%ecx
+ jmp L$8x_tail
+
+.p2align 5
+L$8x_tail_done:
+ xorq %rax,%rax
+ addq (%rdx),%r8
+ adcq $0,%r9
+ adcq $0,%r10
+ adcq $0,%r11
+ adcq $0,%r12
+ adcq $0,%r13
+ adcq $0,%r14
+ adcq $0,%r15
+ adcq $0,%rax
+
+ negq %rsi
+L$8x_no_tail:
+ adcq 0(%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ adcq $0,%rax
+ movq -8(%rbp),%rcx
+ xorq %rsi,%rsi
+
+.byte 102,72,15,126,213
+
+ movq %r8,0(%rdi)
+ movq %r9,8(%rdi)
+.byte 102,73,15,126,217
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r12,32(%rdi)
+ movq %r13,40(%rdi)
+ movq %r14,48(%rdi)
+ movq %r15,56(%rdi)
+ leaq 64(%rdi),%rdi
+
+ cmpq %rdx,%rdi
+ jb L$8x_reduction_loop
+ ret
+
+
+
+.p2align 5
+__bn_post4x_internal:
+
+ movq 0(%rbp),%r12
+ leaq (%rdi,%r9,1),%rbx
+ movq %r9,%rcx
+.byte 102,72,15,126,207
+ negq %rax
+.byte 102,72,15,126,206
+ sarq $3+2,%rcx
+ decq %r12
+ xorq %r10,%r10
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp L$sqr4x_sub_entry
+
+.p2align 4
+L$sqr4x_sub:
+ movq 0(%rbp),%r12
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+L$sqr4x_sub_entry:
+ leaq 32(%rbp),%rbp
+ notq %r12
+ notq %r13
+ notq %r14
+ notq %r15
+ andq %rax,%r12
+ andq %rax,%r13
+ andq %rax,%r14
+ andq %rax,%r15
+
+ negq %r10
+ adcq 0(%rbx),%r12
+ adcq 8(%rbx),%r13
+ adcq 16(%rbx),%r14
+ adcq 24(%rbx),%r15
+ movq %r12,0(%rdi)
+ leaq 32(%rbx),%rbx
+ movq %r13,8(%rdi)
+ sbbq %r10,%r10
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ leaq 32(%rdi),%rdi
+
+ incq %rcx
+ jnz L$sqr4x_sub
+
+ movq %r9,%r10
+ negq %r9
+ ret
+
+
+
+.p2align 5
+bn_mulx4x_mont_gather5:
+
+ movq %rsp,%rax
+
+L$mulx4x_enter:
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+L$mulx4x_prologue:
+
+ shll $3,%r9d
+ leaq (%r9,%r9,2),%r10
+ negq %r9
+ movq (%r8),%r8
+
+
+
+
+
+
+
+
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
+ subq %rdi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb L$mulx4xsp_alt
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
+ jmp L$mulx4xsp_done
+
+L$mulx4xsp_alt:
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rbp,%r9,2),%rbp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rbp
+L$mulx4xsp_done:
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja L$mulx4x_page_walk
+ jmp L$mulx4x_page_walk_done
+
+L$mulx4x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja L$mulx4x_page_walk
+L$mulx4x_page_walk_done:
+
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+
+L$mulx4x_body:
+ call mulx4x_internal
+
+ movq 40(%rsp),%rsi
+
+ movq $1,%rax
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
+L$mulx4x_epilogue:
+ ret
+
+
+
+
+.p2align 5
+mulx4x_internal:
+
+ movq %r9,8(%rsp)
+ movq %r9,%r10
+ negq %r9
+ shlq $5,%r9
+ negq %r10
+ leaq 128(%rdx,%r9,1),%r13
+ shrq $5+5,%r9
+ movd 8(%rax),%xmm5
+ subq $1,%r9
+ leaq L$inc(%rip),%rax
+ movq %r13,16+8(%rsp)
+ movq %r9,24+8(%rsp)
+ movq %rdi,56+8(%rsp)
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 88-112(%rsp,%r10,1),%r10
+ leaq 128(%rdx),%rdi
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+.byte 0x67
+ movdqa %xmm1,%xmm2
+.byte 0x67
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+.byte 0x67
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+
+ pand 64(%rdi),%xmm0
+ pand 80(%rdi),%xmm1
+ pand 96(%rdi),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%rdi),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%rdi),%xmm4
+ movdqa -112(%rdi),%xmm5
+ movdqa -96(%rdi),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%rdi),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%rdi),%xmm4
+ movdqa -48(%rdi),%xmm5
+ movdqa -32(%rdi),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%rdi),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%rdi),%xmm4
+ movdqa 16(%rdi),%xmm5
+ movdqa 32(%rdi),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%rdi),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ pxor %xmm1,%xmm0
+
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ leaq 256(%rdi),%rdi
+.byte 102,72,15,126,194
+ leaq 64+32+8(%rsp),%rbx
+
+ movq %rdx,%r9
+ mulxq 0(%rsi),%r8,%rax
+ mulxq 8(%rsi),%r11,%r12
+ addq %rax,%r11
+ mulxq 16(%rsi),%rax,%r13
+ adcq %rax,%r12
+ adcq $0,%r13
+ mulxq 24(%rsi),%rax,%r14
+
+ movq %r8,%r15
+ imulq 32+8(%rsp),%r8
+ xorq %rbp,%rbp
+ movq %r8,%rdx
+
+ movq %rdi,8+8(%rsp)
+
+ leaq 32(%rsi),%rsi
+ adcxq %rax,%r13
+ adcxq %rbp,%r14
+
+ mulxq 0(%rcx),%rax,%r10
+ adcxq %rax,%r15
+ adoxq %r11,%r10
+ mulxq 8(%rcx),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+ mulxq 16(%rcx),%rax,%r12
+ movq 24+8(%rsp),%rdi
+ movq %r10,-32(%rbx)
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r11,-24(%rbx)
+ adcxq %rax,%r12
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r12,-16(%rbx)
+ jmp L$mulx4x_1st
+
+.p2align 5
+L$mulx4x_1st:
+ adcxq %rbp,%r15
+ mulxq 0(%rsi),%r10,%rax
+ adcxq %r14,%r10
+ mulxq 8(%rsi),%r11,%r14
+ adcxq %rax,%r11
+ mulxq 16(%rsi),%r12,%rax
+ adcxq %r14,%r12
+ mulxq 24(%rsi),%r13,%r14
+.byte 0x67,0x67
+ movq %r8,%rdx
+ adcxq %rax,%r13
+ adcxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ leaq 32(%rbx),%rbx
+
+ adoxq %r15,%r10
+ mulxq 0(%rcx),%rax,%r15
+ adcxq %rax,%r10
+ adoxq %r15,%r11
+ mulxq 8(%rcx),%rax,%r15
+ adcxq %rax,%r11
+ adoxq %r15,%r12
+ mulxq 16(%rcx),%rax,%r15
+ movq %r10,-40(%rbx)
+ adcxq %rax,%r12
+ movq %r11,-32(%rbx)
+ adoxq %r15,%r13
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r12,-24(%rbx)
+ adcxq %rax,%r13
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r13,-16(%rbx)
+
+ decq %rdi
+ jnz L$mulx4x_1st
+
+ movq 8(%rsp),%rax
+ adcq %rbp,%r15
+ leaq (%rsi,%rax,1),%rsi
+ addq %r15,%r14
+ movq 8+8(%rsp),%rdi
+ adcq %rbp,%rbp
+ movq %r14,-8(%rbx)
+ jmp L$mulx4x_outer
+
+.p2align 5
+L$mulx4x_outer:
+ leaq 16-256(%rbx),%r10
+ pxor %xmm4,%xmm4
+.byte 0x67,0x67
+ pxor %xmm5,%xmm5
+ movdqa -128(%rdi),%xmm0
+ movdqa -112(%rdi),%xmm1
+ movdqa -96(%rdi),%xmm2
+ pand 256(%r10),%xmm0
+ movdqa -80(%rdi),%xmm3
+ pand 272(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand 288(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand 304(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%rdi),%xmm0
+ movdqa -48(%rdi),%xmm1
+ movdqa -32(%rdi),%xmm2
+ pand 320(%r10),%xmm0
+ movdqa -16(%rdi),%xmm3
+ pand 336(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand 352(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand 368(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%rdi),%xmm0
+ movdqa 16(%rdi),%xmm1
+ movdqa 32(%rdi),%xmm2
+ pand 384(%r10),%xmm0
+ movdqa 48(%rdi),%xmm3
+ pand 400(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand 416(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand 432(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%rdi),%xmm0
+ movdqa 80(%rdi),%xmm1
+ movdqa 96(%rdi),%xmm2
+ pand 448(%r10),%xmm0
+ movdqa 112(%rdi),%xmm3
+ pand 464(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand 480(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand 496(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%rdi),%rdi
+.byte 102,72,15,126,194
+
+ movq %rbp,(%rbx)
+ leaq 32(%rbx,%rax,1),%rbx
+ mulxq 0(%rsi),%r8,%r11
+ xorq %rbp,%rbp
+ movq %rdx,%r9
+ mulxq 8(%rsi),%r14,%r12
+ adoxq -32(%rbx),%r8
+ adcxq %r14,%r11
+ mulxq 16(%rsi),%r15,%r13
+ adoxq -24(%rbx),%r11
+ adcxq %r15,%r12
+ mulxq 24(%rsi),%rdx,%r14
+ adoxq -16(%rbx),%r12
+ adcxq %rdx,%r13
+ leaq (%rcx,%rax,1),%rcx
+ leaq 32(%rsi),%rsi
+ adoxq -8(%rbx),%r13
+ adcxq %rbp,%r14
+ adoxq %rbp,%r14
+
+ movq %r8,%r15
+ imulq 32+8(%rsp),%r8
+
+ movq %r8,%rdx
+ xorq %rbp,%rbp
+ movq %rdi,8+8(%rsp)
+
+ mulxq 0(%rcx),%rax,%r10
+ adcxq %rax,%r15
+ adoxq %r11,%r10
+ mulxq 8(%rcx),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+ mulxq 16(%rcx),%rax,%r12
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq 24+8(%rsp),%rdi
+ movq %r10,-32(%rbx)
+ adcxq %rax,%r12
+ movq %r11,-24(%rbx)
+ adoxq %rbp,%r15
+ movq %r12,-16(%rbx)
+ leaq 32(%rcx),%rcx
+ jmp L$mulx4x_inner
+
+.p2align 5
+L$mulx4x_inner:
+ mulxq 0(%rsi),%r10,%rax
+ adcxq %rbp,%r15
+ adoxq %r14,%r10
+ mulxq 8(%rsi),%r11,%r14
+ adcxq 0(%rbx),%r10
+ adoxq %rax,%r11
+ mulxq 16(%rsi),%r12,%rax
+ adcxq 8(%rbx),%r11
+ adoxq %r14,%r12
+ mulxq 24(%rsi),%r13,%r14
+ movq %r8,%rdx
+ adcxq 16(%rbx),%r12
+ adoxq %rax,%r13
+ adcxq 24(%rbx),%r13
+ adoxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ leaq 32(%rbx),%rbx
+ adcxq %rbp,%r14
+
+ adoxq %r15,%r10
+ mulxq 0(%rcx),%rax,%r15
+ adcxq %rax,%r10
+ adoxq %r15,%r11
+ mulxq 8(%rcx),%rax,%r15
+ adcxq %rax,%r11
+ adoxq %r15,%r12
+ mulxq 16(%rcx),%rax,%r15
+ movq %r10,-40(%rbx)
+ adcxq %rax,%r12
+ adoxq %r15,%r13
+ movq %r11,-32(%rbx)
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ leaq 32(%rcx),%rcx
+ movq %r12,-24(%rbx)
+ adcxq %rax,%r13
+ adoxq %rbp,%r15
+ movq %r13,-16(%rbx)
+
+ decq %rdi
+ jnz L$mulx4x_inner
+
+ movq 0+8(%rsp),%rax
+ adcq %rbp,%r15
+ subq 0(%rbx),%rdi
+ movq 8+8(%rsp),%rdi
+ movq 16+8(%rsp),%r10
+ adcq %r15,%r14
+ leaq (%rsi,%rax,1),%rsi
+ adcq %rbp,%rbp
+ movq %r14,-8(%rbx)
+
+ cmpq %r10,%rdi
+ jb L$mulx4x_outer
+
+ movq -8(%rcx),%r10
+ movq %rbp,%r8
+ movq (%rcx,%rax,1),%r12
+ leaq (%rcx,%rax,1),%rbp
+ movq %rax,%rcx
+ leaq (%rbx,%rax,1),%rdi
+ xorl %eax,%eax
+ xorq %r15,%r15
+ subq %r14,%r10
+ adcq %r15,%r15
+ orq %r15,%r8
+ sarq $3+2,%rcx
+ subq %r8,%rax
+ movq 56+8(%rsp),%rdx
+ decq %r12
+ movq 8(%rbp),%r13
+ xorq %r8,%r8
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp L$sqrx4x_sub_entry
+
+
+
+.p2align 5
+bn_powerx5:
+
+ movq %rsp,%rax
+
+L$powerx5_enter:
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+L$powerx5_prologue:
+
+ shll $3,%r9d
+ leaq (%r9,%r9,2),%r10
+ negq %r9
+ movq (%r8),%r8
+
+
+
+
+
+
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
+ subq %rdi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb L$pwrx_sp_alt
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
+ jmp L$pwrx_sp_done
+
+.p2align 5
+L$pwrx_sp_alt:
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rbp,%r9,2),%rbp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rbp
+L$pwrx_sp_done:
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja L$pwrx_page_walk
+ jmp L$pwrx_page_walk_done
+
+L$pwrx_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja L$pwrx_page_walk
+L$pwrx_page_walk_done:
+
+ movq %r9,%r10
+ negq %r9
+
+
+
+
+
+
+
+
+
+
+
+
+ pxor %xmm0,%xmm0
+.byte 102,72,15,110,207
+.byte 102,72,15,110,209
+.byte 102,73,15,110,218
+.byte 102,72,15,110,226
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+
+L$powerx5_body:
+
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+
+ movq %r10,%r9
+ movq %rsi,%rdi
+.byte 102,72,15,126,209
+.byte 102,72,15,126,226
+ movq 40(%rsp),%rax
+
+ call mulx4x_internal
+
+ movq 40(%rsp),%rsi
+
+ movq $1,%rax
+
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
+L$powerx5_epilogue:
+ ret
+
+
+
+.globl _bn_sqrx8x_internal
+.private_extern _bn_sqrx8x_internal
+.private_extern _bn_sqrx8x_internal
+
+.p2align 5
+_bn_sqrx8x_internal:
+__bn_sqrx8x_internal:
+
+_CET_ENDBR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ leaq 48+8(%rsp),%rdi
+ leaq (%rsi,%r9,1),%rbp
+ movq %r9,0+8(%rsp)
+ movq %rbp,8+8(%rsp)
+ jmp L$sqr8x_zero_start
+
+.p2align 5
+.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
+L$sqrx8x_zero:
+.byte 0x3e
+ movdqa %xmm0,0(%rdi)
+ movdqa %xmm0,16(%rdi)
+ movdqa %xmm0,32(%rdi)
+ movdqa %xmm0,48(%rdi)
+L$sqr8x_zero_start:
+ movdqa %xmm0,64(%rdi)
+ movdqa %xmm0,80(%rdi)
+ movdqa %xmm0,96(%rdi)
+ movdqa %xmm0,112(%rdi)
+ leaq 128(%rdi),%rdi
+ subq $64,%r9
+ jnz L$sqrx8x_zero
+
+ movq 0(%rsi),%rdx
+
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %r12,%r12
+ xorq %r13,%r13
+ xorq %r14,%r14
+ xorq %r15,%r15
+ leaq 48+8(%rsp),%rdi
+ xorq %rbp,%rbp
+ jmp L$sqrx8x_outer_loop
+
+.p2align 5
+L$sqrx8x_outer_loop:
+ mulxq 8(%rsi),%r8,%rax
+ adcxq %r9,%r8
+ adoxq %rax,%r10
+ mulxq 16(%rsi),%r9,%rax
+ adcxq %r10,%r9
+ adoxq %rax,%r11
+.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
+ adcxq %r11,%r10
+ adoxq %rax,%r12
+.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
+ adcxq %r12,%r11
+ adoxq %rax,%r13
+ mulxq 40(%rsi),%r12,%rax
+ adcxq %r13,%r12
+ adoxq %rax,%r14
+ mulxq 48(%rsi),%r13,%rax
+ adcxq %r14,%r13
+ adoxq %r15,%rax
+ mulxq 56(%rsi),%r14,%r15
+ movq 8(%rsi),%rdx
+ adcxq %rax,%r14
+ adoxq %rbp,%r15
+ adcq 64(%rdi),%r15
+ movq %r8,8(%rdi)
+ movq %r9,16(%rdi)
+ sbbq %rcx,%rcx
+ xorq %rbp,%rbp
+
+
+ mulxq 16(%rsi),%r8,%rbx
+ mulxq 24(%rsi),%r9,%rax
+ adcxq %r10,%r8
+ adoxq %rbx,%r9
+ mulxq 32(%rsi),%r10,%rbx
+ adcxq %r11,%r9
+ adoxq %rax,%r10
+.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
+ adcxq %r12,%r10
+ adoxq %rbx,%r11
+.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
+ adcxq %r13,%r11
+ adoxq %r14,%r12
+.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
+ movq 16(%rsi),%rdx
+ adcxq %rax,%r12
+ adoxq %rbx,%r13
+ adcxq %r15,%r13
+ adoxq %rbp,%r14
+ adcxq %rbp,%r14
+
+ movq %r8,24(%rdi)
+ movq %r9,32(%rdi)
+
+ mulxq 24(%rsi),%r8,%rbx
+ mulxq 32(%rsi),%r9,%rax
+ adcxq %r10,%r8
+ adoxq %rbx,%r9
+ mulxq 40(%rsi),%r10,%rbx
+ adcxq %r11,%r9
+ adoxq %rax,%r10
+.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
+ adcxq %r12,%r10
+ adoxq %r13,%r11
+.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
+.byte 0x3e
+ movq 24(%rsi),%rdx
+ adcxq %rbx,%r11
+ adoxq %rax,%r12
+ adcxq %r14,%r12
+ movq %r8,40(%rdi)
+ movq %r9,48(%rdi)
+ mulxq 32(%rsi),%r8,%rax
+ adoxq %rbp,%r13
+ adcxq %rbp,%r13
+
+ mulxq 40(%rsi),%r9,%rbx
+ adcxq %r10,%r8
+ adoxq %rax,%r9
+ mulxq 48(%rsi),%r10,%rax
+ adcxq %r11,%r9
+ adoxq %r12,%r10
+ mulxq 56(%rsi),%r11,%r12
+ movq 32(%rsi),%rdx
+ movq 40(%rsi),%r14
+ adcxq %rbx,%r10
+ adoxq %rax,%r11
+ movq 48(%rsi),%r15
+ adcxq %r13,%r11
+ adoxq %rbp,%r12
+ adcxq %rbp,%r12
+
+ movq %r8,56(%rdi)
+ movq %r9,64(%rdi)
+
+ mulxq %r14,%r9,%rax
+ movq 56(%rsi),%r8
+ adcxq %r10,%r9
+ mulxq %r15,%r10,%rbx
+ adoxq %rax,%r10
+ adcxq %r11,%r10
+ mulxq %r8,%r11,%rax
+ movq %r14,%rdx
+ adoxq %rbx,%r11
+ adcxq %r12,%r11
+
+ adcxq %rbp,%rax
+
+ mulxq %r15,%r14,%rbx
+ mulxq %r8,%r12,%r13
+ movq %r15,%rdx
+ leaq 64(%rsi),%rsi
+ adcxq %r14,%r11
+ adoxq %rbx,%r12
+ adcxq %rax,%r12
+ adoxq %rbp,%r13
+
+.byte 0x67,0x67
+ mulxq %r8,%r8,%r14
+ adcxq %r8,%r13
+ adcxq %rbp,%r14
+
+ cmpq 8+8(%rsp),%rsi
+ je L$sqrx8x_outer_break
+
+ negq %rcx
+ movq $-8,%rcx
+ movq %rbp,%r15
+ movq 64(%rdi),%r8
+ adcxq 72(%rdi),%r9
+ adcxq 80(%rdi),%r10
+ adcxq 88(%rdi),%r11
+ adcq 96(%rdi),%r12
+ adcq 104(%rdi),%r13
+ adcq 112(%rdi),%r14
+ adcq 120(%rdi),%r15
+ leaq (%rsi),%rbp
+ leaq 128(%rdi),%rdi
+ sbbq %rax,%rax
+
+ movq -64(%rsi),%rdx
+ movq %rax,16+8(%rsp)
+ movq %rdi,24+8(%rsp)
+
+
+ xorl %eax,%eax
+ jmp L$sqrx8x_loop
+
+.p2align 5
+L$sqrx8x_loop:
+ movq %r8,%rbx
+ mulxq 0(%rbp),%rax,%r8
+ adcxq %rax,%rbx
+ adoxq %r9,%r8
+
+ mulxq 8(%rbp),%rax,%r9
+ adcxq %rax,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rbp),%rax,%r10
+ adcxq %rax,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rbp),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+
+.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+
+ mulxq 40(%rbp),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+ mulxq 48(%rbp),%rax,%r14
+ movq %rbx,(%rdi,%rcx,8)
+ movl $0,%ebx
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
+ movq 8(%rsi,%rcx,8),%rdx
+ adcxq %rax,%r14
+ adoxq %rbx,%r15
+ adcxq %rbx,%r15
+
+.byte 0x67
+ incq %rcx
+ jnz L$sqrx8x_loop
+
+ leaq 64(%rbp),%rbp
+ movq $-8,%rcx
+ cmpq 8+8(%rsp),%rbp
+ je L$sqrx8x_break
+
+ subq 16+8(%rsp),%rbx
+.byte 0x66
+ movq -64(%rsi),%rdx
+ adcxq 0(%rdi),%r8
+ adcxq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ leaq 64(%rdi),%rdi
+.byte 0x67
+ sbbq %rax,%rax
+ xorl %ebx,%ebx
+ movq %rax,16+8(%rsp)
+ jmp L$sqrx8x_loop
+
+.p2align 5
+L$sqrx8x_break:
+ xorq %rbp,%rbp
+ subq 16+8(%rsp),%rbx
+ adcxq %rbp,%r8
+ movq 24+8(%rsp),%rcx
+ adcxq %rbp,%r9
+ movq 0(%rsi),%rdx
+ adcq $0,%r10
+ movq %r8,0(%rdi)
+ adcq $0,%r11
+ adcq $0,%r12
+ adcq $0,%r13
+ adcq $0,%r14
+ adcq $0,%r15
+ cmpq %rcx,%rdi
+ je L$sqrx8x_outer_loop
+
+ movq %r9,8(%rdi)
+ movq 8(%rcx),%r9
+ movq %r10,16(%rdi)
+ movq 16(%rcx),%r10
+ movq %r11,24(%rdi)
+ movq 24(%rcx),%r11
+ movq %r12,32(%rdi)
+ movq 32(%rcx),%r12
+ movq %r13,40(%rdi)
+ movq 40(%rcx),%r13
+ movq %r14,48(%rdi)
+ movq 48(%rcx),%r14
+ movq %r15,56(%rdi)
+ movq 56(%rcx),%r15
+ movq %rcx,%rdi
+ jmp L$sqrx8x_outer_loop
+
+.p2align 5
+L$sqrx8x_outer_break:
+ movq %r9,72(%rdi)
+.byte 102,72,15,126,217
+ movq %r10,80(%rdi)
+ movq %r11,88(%rdi)
+ movq %r12,96(%rdi)
+ movq %r13,104(%rdi)
+ movq %r14,112(%rdi)
+ leaq 48+8(%rsp),%rdi
+ movq (%rsi,%rcx,1),%rdx
+
+ movq 8(%rdi),%r11
+ xorq %r10,%r10
+ movq 0+8(%rsp),%r9
+ adoxq %r11,%r11
+ movq 16(%rdi),%r12
+ movq 24(%rdi),%r13
+
+
+.p2align 5
+L$sqrx4x_shift_n_add:
+ mulxq %rdx,%rax,%rbx
+ adoxq %r12,%r12
+ adcxq %r10,%rax
+.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
+.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
+ adoxq %r13,%r13
+ adcxq %r11,%rbx
+ movq 40(%rdi),%r11
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+
+ mulxq %rdx,%rax,%rbx
+ adoxq %r10,%r10
+ adcxq %r12,%rax
+ movq 16(%rsi,%rcx,1),%rdx
+ movq 48(%rdi),%r12
+ adoxq %r11,%r11
+ adcxq %r13,%rbx
+ movq 56(%rdi),%r13
+ movq %rax,16(%rdi)
+ movq %rbx,24(%rdi)
+
+ mulxq %rdx,%rax,%rbx
+ adoxq %r12,%r12
+ adcxq %r10,%rax
+ movq 24(%rsi,%rcx,1),%rdx
+ leaq 32(%rcx),%rcx
+ movq 64(%rdi),%r10
+ adoxq %r13,%r13
+ adcxq %r11,%rbx
+ movq 72(%rdi),%r11
+ movq %rax,32(%rdi)
+ movq %rbx,40(%rdi)
+
+ mulxq %rdx,%rax,%rbx
+ adoxq %r10,%r10
+ adcxq %r12,%rax
+ jrcxz L$sqrx4x_shift_n_add_break
+.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
+ adoxq %r11,%r11
+ adcxq %r13,%rbx
+ movq 80(%rdi),%r12
+ movq 88(%rdi),%r13
+ movq %rax,48(%rdi)
+ movq %rbx,56(%rdi)
+ leaq 64(%rdi),%rdi
+ nop
+ jmp L$sqrx4x_shift_n_add
+
+.p2align 5
+L$sqrx4x_shift_n_add_break:
+ adcxq %r13,%rbx
+ movq %rax,48(%rdi)
+ movq %rbx,56(%rdi)
+ leaq 64(%rdi),%rdi
+.byte 102,72,15,126,213
+__bn_sqrx8x_reduction:
+ xorl %eax,%eax
+ movq 32+8(%rsp),%rbx
+ movq 48+8(%rsp),%rdx
+ leaq -64(%rbp,%r9,1),%rcx
+
+ movq %rcx,0+8(%rsp)
+ movq %rdi,8+8(%rsp)
+
+ leaq 48+8(%rsp),%rdi
+ jmp L$sqrx8x_reduction_loop
+
+.p2align 5
+L$sqrx8x_reduction_loop:
+ movq 8(%rdi),%r9
+ movq 16(%rdi),%r10
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r12
+ movq %rdx,%r8
+ imulq %rbx,%rdx
+ movq 40(%rdi),%r13
+ movq 48(%rdi),%r14
+ movq 56(%rdi),%r15
+ movq %rax,24+8(%rsp)
+
+ leaq 64(%rdi),%rdi
+ xorq %rsi,%rsi
+ movq $-8,%rcx
+ jmp L$sqrx8x_reduce
+
+.p2align 5
+L$sqrx8x_reduce:
+ movq %r8,%rbx
+ mulxq 0(%rbp),%rax,%r8
+ adcxq %rbx,%rax
+ adoxq %r9,%r8
+
+ mulxq 8(%rbp),%rbx,%r9
+ adcxq %rbx,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rbp),%rbx,%r10
+ adcxq %rbx,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rbp),%rbx,%r11
+ adcxq %rbx,%r10
+ adoxq %r12,%r11
+
+.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
+ movq %rdx,%rax
+ movq %r8,%rdx
+ adcxq %rbx,%r11
+ adoxq %r13,%r12
+
+ mulxq 32+8(%rsp),%rbx,%rdx
+ movq %rax,%rdx
+ movq %rax,64+48+8(%rsp,%rcx,8)
+
+ mulxq 40(%rbp),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+ mulxq 48(%rbp),%rax,%r14
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+ mulxq 56(%rbp),%rax,%r15
+ movq %rbx,%rdx
+ adcxq %rax,%r14
+ adoxq %rsi,%r15
+ adcxq %rsi,%r15
+
+.byte 0x67,0x67,0x67
+ incq %rcx
+ jnz L$sqrx8x_reduce
+
+ movq %rsi,%rax
+ cmpq 0+8(%rsp),%rbp
+ jae L$sqrx8x_no_tail
+
+ movq 48+8(%rsp),%rdx
+ addq 0(%rdi),%r8
+ leaq 64(%rbp),%rbp
+ movq $-8,%rcx
+ adcxq 8(%rdi),%r9
+ adcxq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ leaq 64(%rdi),%rdi
+ sbbq %rax,%rax
+
+ xorq %rsi,%rsi
+ movq %rax,16+8(%rsp)
+ jmp L$sqrx8x_tail
+
+.p2align 5
+L$sqrx8x_tail:
+ movq %r8,%rbx
+ mulxq 0(%rbp),%rax,%r8
+ adcxq %rax,%rbx
+ adoxq %r9,%r8
+
+ mulxq 8(%rbp),%rax,%r9
+ adcxq %rax,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rbp),%rax,%r10
+ adcxq %rax,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rbp),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+
+.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+
+ mulxq 40(%rbp),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+ mulxq 48(%rbp),%rax,%r14
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+ mulxq 56(%rbp),%rax,%r15
+ movq 72+48+8(%rsp,%rcx,8),%rdx
+ adcxq %rax,%r14
+ adoxq %rsi,%r15
+ movq %rbx,(%rdi,%rcx,8)
+ movq %r8,%rbx
+ adcxq %rsi,%r15
+
+ incq %rcx
+ jnz L$sqrx8x_tail
+
+ cmpq 0+8(%rsp),%rbp
+ jae L$sqrx8x_tail_done
+
+ subq 16+8(%rsp),%rsi
+ movq 48+8(%rsp),%rdx
+ leaq 64(%rbp),%rbp
+ adcq 0(%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ leaq 64(%rdi),%rdi
+ sbbq %rax,%rax
+ subq $8,%rcx
+
+ xorq %rsi,%rsi
+ movq %rax,16+8(%rsp)
+ jmp L$sqrx8x_tail
+
+.p2align 5
+L$sqrx8x_tail_done:
+ xorq %rax,%rax
+ addq 24+8(%rsp),%r8
+ adcq $0,%r9
+ adcq $0,%r10
+ adcq $0,%r11
+ adcq $0,%r12
+ adcq $0,%r13
+ adcq $0,%r14
+ adcq $0,%r15
+ adcq $0,%rax
+
+ subq 16+8(%rsp),%rsi
+L$sqrx8x_no_tail:
+ adcq 0(%rdi),%r8
+.byte 102,72,15,126,217
+ adcq 8(%rdi),%r9
+ movq 56(%rbp),%rsi
+.byte 102,72,15,126,213
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ adcq $0,%rax
+
+ movq 32+8(%rsp),%rbx
+ movq 64(%rdi,%rcx,1),%rdx
+
+ movq %r8,0(%rdi)
+ leaq 64(%rdi),%r8
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r12,32(%rdi)
+ movq %r13,40(%rdi)
+ movq %r14,48(%rdi)
+ movq %r15,56(%rdi)
+
+ leaq 64(%rdi,%rcx,1),%rdi
+ cmpq 8+8(%rsp),%r8
+ jb L$sqrx8x_reduction_loop
+ ret
+
+
+.p2align 5
+
+__bn_postx4x_internal:
+
+ movq 0(%rbp),%r12
+ movq %rcx,%r10
+ movq %rcx,%r9
+ negq %rax
+ sarq $3+2,%rcx
+
+.byte 102,72,15,126,202
+.byte 102,72,15,126,206
+ decq %r12
+ movq 8(%rbp),%r13
+ xorq %r8,%r8
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp L$sqrx4x_sub_entry
+
+.p2align 4
+L$sqrx4x_sub:
+ movq 0(%rbp),%r12
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+L$sqrx4x_sub_entry:
+ andnq %rax,%r12,%r12
+ leaq 32(%rbp),%rbp
+ andnq %rax,%r13,%r13
+ andnq %rax,%r14,%r14
+ andnq %rax,%r15,%r15
+
+ negq %r8
+ adcq 0(%rdi),%r12
+ adcq 8(%rdi),%r13
+ adcq 16(%rdi),%r14
+ adcq 24(%rdi),%r15
+ movq %r12,0(%rdx)
+ leaq 32(%rdi),%rdi
+ movq %r13,8(%rdx)
+ sbbq %r8,%r8
+ movq %r14,16(%rdx)
+ movq %r15,24(%rdx)
+ leaq 32(%rdx),%rdx
+
+ incq %rcx
+ jnz L$sqrx4x_sub
+
+ negq %r9
+
+ ret
+
+
+.globl _bn_scatter5
+.private_extern _bn_scatter5
+
+.p2align 4
+_bn_scatter5:
+
+_CET_ENDBR
+ cmpl $0,%esi
+ jz L$scatter_epilogue
+
+
+
+
+
+
+
+
+
+ leaq (%rdx,%rcx,8),%rdx
+L$scatter:
+ movq (%rdi),%rax
+ leaq 8(%rdi),%rdi
+ movq %rax,(%rdx)
+ leaq 256(%rdx),%rdx
+ subl $1,%esi
+ jnz L$scatter
+L$scatter_epilogue:
+ ret
+
+
+
+.globl _bn_gather5
+.private_extern _bn_gather5
+
+.p2align 5
+_bn_gather5:
+
+L$SEH_begin_bn_gather5:
+_CET_ENDBR
+
+.byte 0x4c,0x8d,0x14,0x24
+
+.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00
+ leaq L$inc(%rip),%rax
+ andq $-16,%rsp
+
+ movd %ecx,%xmm5
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 128(%rdx),%r11
+ leaq 128(%rsp),%rax
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,-128(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,-112(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,-96(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,-80(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,-64(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,-48(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,-32(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,-16(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,16(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,32(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,48(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,64(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,80(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,96(%rax)
+ movdqa %xmm4,%xmm2
+ movdqa %xmm3,112(%rax)
+ jmp L$gather
+
+.p2align 5
+L$gather:
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r11),%xmm0
+ movdqa -112(%r11),%xmm1
+ movdqa -96(%r11),%xmm2
+ pand -128(%rax),%xmm0
+ movdqa -80(%r11),%xmm3
+ pand -112(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r11),%xmm0
+ movdqa -48(%r11),%xmm1
+ movdqa -32(%r11),%xmm2
+ pand -64(%rax),%xmm0
+ movdqa -16(%r11),%xmm3
+ pand -48(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r11),%xmm0
+ movdqa 16(%r11),%xmm1
+ movdqa 32(%r11),%xmm2
+ pand 0(%rax),%xmm0
+ movdqa 48(%r11),%xmm3
+ pand 16(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r11),%xmm0
+ movdqa 80(%r11),%xmm1
+ movdqa 96(%r11),%xmm2
+ pand 64(%rax),%xmm0
+ movdqa 112(%r11),%xmm3
+ pand 80(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ leaq 256(%r11),%r11
+
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ movq %xmm0,(%rdi)
+ leaq 8(%rdi),%rdi
+ subl $1,%esi
+ jnz L$gather
+
+ leaq (%r10),%rsp
+
+ ret
+L$SEH_end_bn_gather5:
+
+
+.section __DATA,__const
+.p2align 6
+L$inc:
+.long 0,0, 1,1
+.long 2,2, 2,2
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.text
+#endif
diff --git a/gen/bcm/x86_64-mont5-linux.S b/gen/bcm/x86_64-mont5-linux.S
new file mode 100644
index 0000000..14ab4f7
--- /dev/null
+++ b/gen/bcm/x86_64-mont5-linux.S
@@ -0,0 +1,3625 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text
+
+.extern OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+
+.globl bn_mul_mont_gather5
+.hidden bn_mul_mont_gather5
+.type bn_mul_mont_gather5,@function
+.align 64
+bn_mul_mont_gather5:
+.cfi_startproc
+_CET_ENDBR
+ movl %r9d,%r9d
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ testl $7,%r9d
+ jnz .Lmul_enter
+ leaq OPENSSL_ia32cap_P(%rip),%r11
+ movl 8(%r11),%r11d
+ jmp .Lmul4x_enter
+
+.align 16
+.Lmul_enter:
+ movd 8(%rsp),%xmm5
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+
+ negq %r9
+ movq %rsp,%r11
+ leaq -280(%rsp,%r9,8),%r10
+ negq %r9
+ andq $-1024,%r10
+
+
+
+
+
+
+
+
+
+ subq %r10,%r11
+ andq $-4096,%r11
+ leaq (%r10,%r11,1),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul_page_walk
+ jmp .Lmul_page_walk_done
+
+.Lmul_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul_page_walk
+.Lmul_page_walk_done:
+
+ leaq .Linc(%rip),%r10
+ movq %rax,8(%rsp,%r9,8)
+.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
+.Lmul_body:
+
+ leaq 128(%rdx),%r12
+ movdqa 0(%r10),%xmm0
+ movdqa 16(%r10),%xmm1
+ leaq 24-112(%rsp,%r9,8),%r10
+ andq $-16,%r10
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+.byte 0x67
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
+.byte 0x67
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+ pand 64(%r12),%xmm0
+
+ pand 80(%r12),%xmm1
+ pand 96(%r12),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%r12),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%r12),%xmm4
+ movdqa -112(%r12),%xmm5
+ movdqa -96(%r12),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%r12),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%r12),%xmm4
+ movdqa -48(%r12),%xmm5
+ movdqa -32(%r12),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%r12),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%r12),%xmm4
+ movdqa 16(%r12),%xmm5
+ movdqa 32(%r12),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%r12),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ por %xmm1,%xmm0
+
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ leaq 256(%r12),%r12
+.byte 102,72,15,126,195
+
+ movq (%r8),%r8
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.L1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .L1st
+
+
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r9,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp .Louter
+.align 16
+.Louter:
+ leaq 24+128(%rsp,%r9,8),%rdx
+ andq $-16,%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r12),%xmm0
+ movdqa -112(%r12),%xmm1
+ movdqa -96(%r12),%xmm2
+ movdqa -80(%r12),%xmm3
+ pand -128(%rdx),%xmm0
+ pand -112(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r12),%xmm0
+ movdqa -48(%r12),%xmm1
+ movdqa -32(%r12),%xmm2
+ movdqa -16(%r12),%xmm3
+ pand -64(%rdx),%xmm0
+ pand -48(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r12),%xmm0
+ movdqa 16(%r12),%xmm1
+ movdqa 32(%r12),%xmm2
+ movdqa 48(%r12),%xmm3
+ pand 0(%rdx),%xmm0
+ pand 16(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r12),%xmm0
+ movdqa 80(%r12),%xmm1
+ movdqa 96(%r12),%xmm2
+ movdqa 112(%r12),%xmm3
+ pand 64(%rdx),%xmm0
+ pand 80(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%r12),%r12
+
+ movq (%rsi),%rax
+.byte 102,72,15,126,195
+
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.Linner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .Linner
+
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r9,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r9,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jb .Louter
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ leaq (%rsp),%rsi
+ movq %r9,%r15
+ jmp .Lsub
+.align 16
+.Lsub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsi,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz .Lsub
+
+ sbbq $0,%rax
+ movq $-1,%rbx
+ xorq %rax,%rbx
+ xorq %r14,%r14
+ movq %r9,%r15
+
+.Lcopy:
+ movq (%rdi,%r14,8),%rcx
+ movq (%rsp,%r14,8),%rdx
+ andq %rbx,%rcx
+ andq %rax,%rdx
+ movq %r14,(%rsp,%r14,8)
+ orq %rcx,%rdx
+ movq %rdx,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz .Lcopy
+
+ movq 8(%rsp,%r9,8),%rsi
+.cfi_def_cfa %rsi,8
+ movq $1,%rax
+
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lmul_epilogue:
+ ret
+.cfi_endproc
+.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
+.type bn_mul4x_mont_gather5,@function
+.align 32
+bn_mul4x_mont_gather5:
+.cfi_startproc
+.byte 0x67
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+.Lmul4x_enter:
+ andl $0x80108,%r11d
+ cmpl $0x80108,%r11d
+ je .Lmulx4x_enter
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+.Lmul4x_prologue:
+
+.byte 0x67
+ shll $3,%r9d
+ leaq (%r9,%r9,2),%r10
+ negq %r9
+
+
+
+
+
+
+
+
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
+ subq %rdi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb .Lmul4xsp_alt
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
+ jmp .Lmul4xsp_done
+
+.align 32
+.Lmul4xsp_alt:
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rbp,%r9,2),%rbp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rbp
+.Lmul4xsp_done:
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmul4x_page_walk
+ jmp .Lmul4x_page_walk_done
+
+.Lmul4x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmul4x_page_walk
+.Lmul4x_page_walk_done:
+
+ negq %r9
+
+ movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lmul4x_body:
+
+ call mul4x_internal
+
+ movq 40(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq $1,%rax
+
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lmul4x_epilogue:
+ ret
+.cfi_endproc
+.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+
+.type mul4x_internal,@function
+.align 32
+mul4x_internal:
+.cfi_startproc
+ shlq $5,%r9
+ movd 8(%rax),%xmm5
+ leaq .Linc(%rip),%rax
+ leaq 128(%rdx,%r9,1),%r13
+ shrq $5,%r9
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 88-112(%rsp,%r9,1),%r10
+ leaq 128(%rdx),%r12
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+.byte 0x67,0x67
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+.byte 0x67
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
+.byte 0x67
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+ pand 64(%r12),%xmm0
+
+ pand 80(%r12),%xmm1
+ pand 96(%r12),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%r12),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%r12),%xmm4
+ movdqa -112(%r12),%xmm5
+ movdqa -96(%r12),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%r12),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%r12),%xmm4
+ movdqa -48(%r12),%xmm5
+ movdqa -32(%r12),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%r12),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%r12),%xmm4
+ movdqa 16(%r12),%xmm5
+ movdqa 32(%r12),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%r12),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ por %xmm1,%xmm0
+
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ leaq 256(%r12),%r12
+.byte 102,72,15,126,195
+
+ movq %r13,16+8(%rsp)
+ movq %rdi,56+8(%rsp)
+
+ movq (%r8),%r8
+ movq (%rsi),%rax
+ leaq (%rsi,%r9,1),%rsi
+ negq %r9
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ leaq 64+8(%rsp),%r14
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 32(%r9),%r15
+ leaq 32(%rcx),%rcx
+ adcq $0,%rdx
+ movq %rdi,(%r14)
+ movq %rdx,%r13
+ jmp .L1st4x
+
+.align 32
+.L1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx),%rax
+ leaq 32(%r14),%r14
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%r14)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 0(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 32(%rcx),%rcx
+ adcq $0,%rdx
+ movq %rdi,(%r14)
+ movq %rdx,%r13
+
+ addq $32,%r15
+ jnz .L1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx),%rax
+ leaq 32(%r14),%r14
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%r14)
+ movq %rdx,%r13
+
+ leaq (%rcx,%r9,1),%rcx
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%r14)
+
+ jmp .Louter4x
+
+.align 32
+.Louter4x:
+ leaq 16+128(%r14),%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r12),%xmm0
+ movdqa -112(%r12),%xmm1
+ movdqa -96(%r12),%xmm2
+ movdqa -80(%r12),%xmm3
+ pand -128(%rdx),%xmm0
+ pand -112(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r12),%xmm0
+ movdqa -48(%r12),%xmm1
+ movdqa -32(%r12),%xmm2
+ movdqa -16(%r12),%xmm3
+ pand -64(%rdx),%xmm0
+ pand -48(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r12),%xmm0
+ movdqa 16(%r12),%xmm1
+ movdqa 32(%r12),%xmm2
+ movdqa 48(%r12),%xmm3
+ pand 0(%rdx),%xmm0
+ pand 16(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r12),%xmm0
+ movdqa 80(%r12),%xmm1
+ movdqa 96(%r12),%xmm2
+ movdqa 112(%r12),%xmm3
+ pand 64(%rdx),%xmm0
+ pand 80(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%r12),%r12
+.byte 102,72,15,126,195
+
+ movq (%r14,%r9,1),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+ movq %rdi,(%r14)
+
+ leaq (%r14,%r9,1),%r14
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%r14),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 32(%r9),%r15
+ leaq 32(%rcx),%rcx
+ adcq $0,%rdx
+ movq %rdx,%r13
+ jmp .Linner4x
+
+.align 32
+.Linner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx),%rax
+ adcq $0,%rdx
+ addq 16(%r14),%r10
+ leaq 32(%r14),%r14
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-32(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx),%rax
+ adcq $0,%rdx
+ addq -8(%r14),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-24(%r14)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 0(%rcx),%rax
+ adcq $0,%rdx
+ addq (%r14),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-16(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%r14),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 32(%rcx),%rcx
+ adcq $0,%rdx
+ movq %r13,-8(%r14)
+ movq %rdx,%r13
+
+ addq $32,%r15
+ jnz .Linner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx),%rax
+ adcq $0,%rdx
+ addq 16(%r14),%r10
+ leaq 32(%r14),%r14
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-32(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq %rbp,%rax
+ movq -8(%rcx),%rbp
+ adcq $0,%rdx
+ addq -8(%r14),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-24(%r14)
+ movq %rdx,%r13
+
+ movq %rdi,-16(%r14)
+ leaq (%rcx,%r9,1),%rcx
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%r14),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%r14)
+
+ cmpq 16+8(%rsp),%r12
+ jb .Louter4x
+ xorq %rax,%rax
+ subq %r13,%rbp
+ adcq %r15,%r15
+ orq %r15,%rdi
+ subq %rdi,%rax
+ leaq (%r14,%r9,1),%rbx
+ movq (%rcx),%r12
+ leaq (%rcx),%rbp
+ movq %r9,%rcx
+ sarq $3+2,%rcx
+ movq 56+8(%rsp),%rdi
+ decq %r12
+ xorq %r10,%r10
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp .Lsqr4x_sub_entry
+.cfi_endproc
+.size mul4x_internal,.-mul4x_internal
+.globl bn_power5
+.hidden bn_power5
+.type bn_power5,@function
+.align 32
+bn_power5:
+.cfi_startproc
+_CET_ENDBR
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ leaq OPENSSL_ia32cap_P(%rip),%r11
+ movl 8(%r11),%r11d
+ andl $0x80108,%r11d
+ cmpl $0x80108,%r11d
+ je .Lpowerx5_enter
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+.Lpower5_prologue:
+
+ shll $3,%r9d
+ leal (%r9,%r9,2),%r10d
+ negq %r9
+ movq (%r8),%r8
+
+
+
+
+
+
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
+ subq %rdi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb .Lpwr_sp_alt
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
+ jmp .Lpwr_sp_done
+
+.align 32
+.Lpwr_sp_alt:
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rbp,%r9,2),%rbp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rbp
+.Lpwr_sp_done:
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lpwr_page_walk
+ jmp .Lpwr_page_walk_done
+
+.Lpwr_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lpwr_page_walk
+.Lpwr_page_walk_done:
+
+ movq %r9,%r10
+ negq %r9
+
+
+
+
+
+
+
+
+
+
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lpower5_body:
+.byte 102,72,15,110,207
+.byte 102,72,15,110,209
+.byte 102,73,15,110,218
+.byte 102,72,15,110,226
+
+ call __bn_sqr8x_internal
+ call __bn_post4x_internal
+ call __bn_sqr8x_internal
+ call __bn_post4x_internal
+ call __bn_sqr8x_internal
+ call __bn_post4x_internal
+ call __bn_sqr8x_internal
+ call __bn_post4x_internal
+ call __bn_sqr8x_internal
+ call __bn_post4x_internal
+
+.byte 102,72,15,126,209
+.byte 102,72,15,126,226
+ movq %rsi,%rdi
+ movq 40(%rsp),%rax
+ leaq 32(%rsp),%r8
+
+ call mul4x_internal
+
+ movq 40(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq $1,%rax
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpower5_epilogue:
+ ret
+.cfi_endproc
+.size bn_power5,.-bn_power5
+
+.globl bn_sqr8x_internal
+.hidden bn_sqr8x_internal
+.hidden bn_sqr8x_internal
+.type bn_sqr8x_internal,@function
+.align 32
+bn_sqr8x_internal:
+__bn_sqr8x_internal:
+.cfi_startproc
+_CET_ENDBR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ leaq 32(%r10),%rbp
+ leaq (%rsi,%r9,1),%rsi
+
+ movq %r9,%rcx
+
+
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 48+8(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ movq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ movq %r10,-24(%rdi,%rbp,1)
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq $0,%rdx
+ movq %r11,-16(%rdi,%rbp,1)
+ movq %rdx,%r10
+
+
+ movq -8(%rsi,%rbp,1),%rbx
+ mulq %r15
+ movq %rax,%r12
+ movq %rbx,%rax
+ movq %rdx,%r13
+
+ leaq (%rbp),%rcx
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+ jmp .Lsqr4x_1st
+
+.align 32
+.Lsqr4x_1st:
+ movq (%rsi,%rcx,1),%rbx
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ movq 8(%rsi,%rcx,1),%rbx
+ movq %rdx,%r10
+ adcq $0,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+
+
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ movq %r11,(%rdi,%rcx,1)
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq 16(%rsi,%rcx,1),%rbx
+ movq %rdx,%r11
+ adcq $0,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ movq %r10,8(%rdi,%rcx,1)
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ movq 24(%rsi,%rcx,1),%rbx
+ movq %rdx,%r10
+ adcq $0,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+
+
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ movq %r11,16(%rdi,%rcx,1)
+ movq %rdx,%r13
+ adcq $0,%r13
+ leaq 32(%rcx),%rcx
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne .Lsqr4x_1st
+
+ mulq %r15
+ addq %rax,%r13
+ leaq 16(%rbp),%rbp
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+
+ movq %r13,(%rdi)
+ movq %rdx,%r12
+ movq %rdx,8(%rdi)
+ jmp .Lsqr4x_outer
+
+.align 32
+.Lsqr4x_outer:
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 48+8(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ movq -24(%rdi,%rbp,1),%r10
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq $0,%rdx
+ movq %r10,-24(%rdi,%rbp,1)
+ movq %rdx,%r11
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq $0,%rdx
+ addq -16(%rdi,%rbp,1),%r11
+ movq %rdx,%r10
+ adcq $0,%r10
+ movq %r11,-16(%rdi,%rbp,1)
+
+ xorq %r12,%r12
+
+ movq -8(%rsi,%rbp,1),%rbx
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq $0,%rdx
+ addq -8(%rdi,%rbp,1),%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq $0,%rdx
+ addq %r12,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ movq %r10,-8(%rdi,%rbp,1)
+
+ leaq (%rbp),%rcx
+ jmp .Lsqr4x_inner
+
+.align 32
+.Lsqr4x_inner:
+ movq (%rsi,%rcx,1),%rbx
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ movq %rdx,%r12
+ adcq $0,%r12
+ addq (%rdi,%rcx,1),%r13
+ adcq $0,%r12
+
+.byte 0x67
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ movq 8(%rsi,%rcx,1),%rbx
+ movq %rdx,%r10
+ adcq $0,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+
+ mulq %r15
+ addq %rax,%r12
+ movq %r11,(%rdi,%rcx,1)
+ movq %rbx,%rax
+ movq %rdx,%r13
+ adcq $0,%r13
+ addq 8(%rdi,%rcx,1),%r12
+ leaq 16(%rcx),%rcx
+ adcq $0,%r13
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq $0,%rdx
+ addq %r12,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne .Lsqr4x_inner
+
+.byte 0x67
+ mulq %r15
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+
+ movq %r13,(%rdi)
+ movq %rdx,%r12
+ movq %rdx,8(%rdi)
+
+ addq $16,%rbp
+ jnz .Lsqr4x_outer
+
+
+ movq -32(%rsi),%r14
+ leaq 48+8(%rsp,%r9,2),%rdi
+ movq -24(%rsi),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ movq %r10,-24(%rdi)
+ movq %rdx,%r10
+ adcq $0,%r10
+ addq %r13,%r11
+ movq -8(%rsi),%rbx
+ adcq $0,%r10
+
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ movq %r11,-16(%rdi)
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ movq %r10,-8(%rdi)
+
+ mulq %r15
+ addq %rax,%r13
+ movq -16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+
+ movq %r13,(%rdi)
+ movq %rdx,%r12
+ movq %rdx,8(%rdi)
+
+ mulq %rbx
+ addq $16,%rbp
+ xorq %r14,%r14
+ subq %r9,%rbp
+ xorq %r15,%r15
+
+ addq %r12,%rax
+ adcq $0,%rdx
+ movq %rax,8(%rdi)
+ movq %rdx,16(%rdi)
+ movq %r15,24(%rdi)
+
+ movq -16(%rsi,%rbp,1),%rax
+ leaq 48+8(%rsp),%rdi
+ xorq %r10,%r10
+ movq 8(%rdi),%r11
+
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq 16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 24(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,8(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 32(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 40(%rdi),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,16(%rdi)
+ adcq %rdx,%r8
+ leaq 16(%rbp),%rbp
+ movq %r8,24(%rdi)
+ sbbq %r15,%r15
+ leaq 64(%rdi),%rdi
+ jmp .Lsqr4x_shift_n_add
+
+.align 32
+.Lsqr4x_shift_n_add:
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,-32(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 0(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 8(%rdi),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,-16(%rdi)
+ adcq %rdx,%r8
+
+ leaq (%r14,%r10,2),%r12
+ movq %r8,-8(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq 16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 24(%rdi),%r11
+ adcq %rax,%r12
+ movq 8(%rsi,%rbp,1),%rax
+ movq %r12,0(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,8(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 32(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 40(%rdi),%r11
+ adcq %rax,%rbx
+ movq 16(%rsi,%rbp,1),%rax
+ movq %rbx,16(%rdi)
+ adcq %rdx,%r8
+ movq %r8,24(%rdi)
+ sbbq %r15,%r15
+ leaq 64(%rdi),%rdi
+ addq $32,%rbp
+ jnz .Lsqr4x_shift_n_add
+
+ leaq (%r14,%r10,2),%r12
+.byte 0x67
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi),%rax
+ movq %r12,-32(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ mulq %rax
+ negq %r15
+ adcq %rax,%rbx
+ adcq %rdx,%r8
+ movq %rbx,-16(%rdi)
+ movq %r8,-8(%rdi)
+.byte 102,72,15,126,213
+__bn_sqr8x_reduction:
+ xorq %rax,%rax
+ leaq (%r9,%rbp,1),%rcx
+ leaq 48+8(%rsp,%r9,2),%rdx
+ movq %rcx,0+8(%rsp)
+ leaq 48+8(%rsp,%r9,1),%rdi
+ movq %rdx,8+8(%rsp)
+ negq %r9
+ jmp .L8x_reduction_loop
+
+.align 32
+.L8x_reduction_loop:
+ leaq (%rdi,%r9,1),%rdi
+.byte 0x66
+ movq 0(%rdi),%rbx
+ movq 8(%rdi),%r9
+ movq 16(%rdi),%r10
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r12
+ movq 40(%rdi),%r13
+ movq 48(%rdi),%r14
+ movq 56(%rdi),%r15
+ movq %rax,(%rdx)
+ leaq 64(%rdi),%rdi
+
+.byte 0x67
+ movq %rbx,%r8
+ imulq 32+8(%rsp),%rbx
+ movq 0(%rbp),%rax
+ movl $8,%ecx
+ jmp .L8x_reduce
+
+.align 32
+.L8x_reduce:
+ mulq %rbx
+ movq 8(%rbp),%rax
+ negq %r8
+ movq %rdx,%r8
+ adcq $0,%r8
+
+ mulq %rbx
+ addq %rax,%r9
+ movq 16(%rbp),%rax
+ adcq $0,%rdx
+ addq %r9,%r8
+ movq %rbx,48-8+8(%rsp,%rcx,8)
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 24(%rbp),%rax
+ adcq $0,%rdx
+ addq %r10,%r9
+ movq 32+8(%rsp),%rsi
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 32(%rbp),%rax
+ adcq $0,%rdx
+ imulq %r8,%rsi
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %rbx
+ addq %rax,%r12
+ movq 40(%rbp),%rax
+ adcq $0,%rdx
+ addq %r12,%r11
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r13
+ movq 48(%rbp),%rax
+ adcq $0,%rdx
+ addq %r13,%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ addq %rax,%r14
+ movq 56(%rbp),%rax
+ adcq $0,%rdx
+ addq %r14,%r13
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ mulq %rbx
+ movq %rsi,%rbx
+ addq %rax,%r15
+ movq 0(%rbp),%rax
+ adcq $0,%rdx
+ addq %r15,%r14
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ decl %ecx
+ jnz .L8x_reduce
+
+ leaq 64(%rbp),%rbp
+ xorq %rax,%rax
+ movq 8+8(%rsp),%rdx
+ cmpq 0+8(%rsp),%rbp
+ jae .L8x_no_tail
+
+.byte 0x66
+ addq 0(%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ sbbq %rsi,%rsi
+
+ movq 48+56+8(%rsp),%rbx
+ movl $8,%ecx
+ movq 0(%rbp),%rax
+ jmp .L8x_tail
+
+.align 32
+.L8x_tail:
+ mulq %rbx
+ addq %rax,%r8
+ movq 8(%rbp),%rax
+ movq %r8,(%rdi)
+ movq %rdx,%r8
+ adcq $0,%r8
+
+ mulq %rbx
+ addq %rax,%r9
+ movq 16(%rbp),%rax
+ adcq $0,%rdx
+ addq %r9,%r8
+ leaq 8(%rdi),%rdi
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 24(%rbp),%rax
+ adcq $0,%rdx
+ addq %r10,%r9
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 32(%rbp),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %rbx
+ addq %rax,%r12
+ movq 40(%rbp),%rax
+ adcq $0,%rdx
+ addq %r12,%r11
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r13
+ movq 48(%rbp),%rax
+ adcq $0,%rdx
+ addq %r13,%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ addq %rax,%r14
+ movq 56(%rbp),%rax
+ adcq $0,%rdx
+ addq %r14,%r13
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ mulq %rbx
+ movq 48-16+8(%rsp,%rcx,8),%rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r15,%r14
+ movq 0(%rbp),%rax
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ decl %ecx
+ jnz .L8x_tail
+
+ leaq 64(%rbp),%rbp
+ movq 8+8(%rsp),%rdx
+ cmpq 0+8(%rsp),%rbp
+ jae .L8x_tail_done
+
+ movq 48+56+8(%rsp),%rbx
+ negq %rsi
+ movq 0(%rbp),%rax
+ adcq 0(%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ sbbq %rsi,%rsi
+
+ movl $8,%ecx
+ jmp .L8x_tail
+
+.align 32
+.L8x_tail_done:
+ xorq %rax,%rax
+ addq (%rdx),%r8
+ adcq $0,%r9
+ adcq $0,%r10
+ adcq $0,%r11
+ adcq $0,%r12
+ adcq $0,%r13
+ adcq $0,%r14
+ adcq $0,%r15
+ adcq $0,%rax
+
+ negq %rsi
+.L8x_no_tail:
+ adcq 0(%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ adcq $0,%rax
+ movq -8(%rbp),%rcx
+ xorq %rsi,%rsi
+
+.byte 102,72,15,126,213
+
+ movq %r8,0(%rdi)
+ movq %r9,8(%rdi)
+.byte 102,73,15,126,217
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r12,32(%rdi)
+ movq %r13,40(%rdi)
+ movq %r14,48(%rdi)
+ movq %r15,56(%rdi)
+ leaq 64(%rdi),%rdi
+
+ cmpq %rdx,%rdi
+ jb .L8x_reduction_loop
+ ret
+.cfi_endproc
+.size bn_sqr8x_internal,.-bn_sqr8x_internal
+.type __bn_post4x_internal,@function
+.align 32
+__bn_post4x_internal:
+.cfi_startproc
+ movq 0(%rbp),%r12
+ leaq (%rdi,%r9,1),%rbx
+ movq %r9,%rcx
+.byte 102,72,15,126,207
+ negq %rax
+.byte 102,72,15,126,206
+ sarq $3+2,%rcx
+ decq %r12
+ xorq %r10,%r10
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp .Lsqr4x_sub_entry
+
+.align 16
+.Lsqr4x_sub:
+ movq 0(%rbp),%r12
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+.Lsqr4x_sub_entry:
+ leaq 32(%rbp),%rbp
+ notq %r12
+ notq %r13
+ notq %r14
+ notq %r15
+ andq %rax,%r12
+ andq %rax,%r13
+ andq %rax,%r14
+ andq %rax,%r15
+
+ negq %r10
+ adcq 0(%rbx),%r12
+ adcq 8(%rbx),%r13
+ adcq 16(%rbx),%r14
+ adcq 24(%rbx),%r15
+ movq %r12,0(%rdi)
+ leaq 32(%rbx),%rbx
+ movq %r13,8(%rdi)
+ sbbq %r10,%r10
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ leaq 32(%rdi),%rdi
+
+ incq %rcx
+ jnz .Lsqr4x_sub
+
+ movq %r9,%r10
+ negq %r9
+ ret
+.cfi_endproc
+.size __bn_post4x_internal,.-__bn_post4x_internal
+.type bn_mulx4x_mont_gather5,@function
+.align 32
+bn_mulx4x_mont_gather5:
+.cfi_startproc
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+.Lmulx4x_enter:
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+.Lmulx4x_prologue:
+
+ shll $3,%r9d
+ leaq (%r9,%r9,2),%r10
+ negq %r9
+ movq (%r8),%r8
+
+
+
+
+
+
+
+
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
+ subq %rdi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb .Lmulx4xsp_alt
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
+ jmp .Lmulx4xsp_done
+
+.Lmulx4xsp_alt:
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rbp,%r9,2),%rbp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rbp
+.Lmulx4xsp_done:
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmulx4x_page_walk
+ jmp .Lmulx4x_page_walk_done
+
+.Lmulx4x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
+
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lmulx4x_body:
+ call mulx4x_internal
+
+ movq 40(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq $1,%rax
+
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lmulx4x_epilogue:
+ ret
+.cfi_endproc
+.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
+
+.type mulx4x_internal,@function
+.align 32
+mulx4x_internal:
+.cfi_startproc
+ movq %r9,8(%rsp)
+ movq %r9,%r10
+ negq %r9
+ shlq $5,%r9
+ negq %r10
+ leaq 128(%rdx,%r9,1),%r13
+ shrq $5+5,%r9
+ movd 8(%rax),%xmm5
+ subq $1,%r9
+ leaq .Linc(%rip),%rax
+ movq %r13,16+8(%rsp)
+ movq %r9,24+8(%rsp)
+ movq %rdi,56+8(%rsp)
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 88-112(%rsp,%r10,1),%r10
+ leaq 128(%rdx),%rdi
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+.byte 0x67
+ movdqa %xmm1,%xmm2
+.byte 0x67
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+.byte 0x67
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+
+ pand 64(%rdi),%xmm0
+ pand 80(%rdi),%xmm1
+ pand 96(%rdi),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%rdi),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%rdi),%xmm4
+ movdqa -112(%rdi),%xmm5
+ movdqa -96(%rdi),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%rdi),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%rdi),%xmm4
+ movdqa -48(%rdi),%xmm5
+ movdqa -32(%rdi),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%rdi),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%rdi),%xmm4
+ movdqa 16(%rdi),%xmm5
+ movdqa 32(%rdi),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%rdi),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ pxor %xmm1,%xmm0
+
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ leaq 256(%rdi),%rdi
+.byte 102,72,15,126,194
+ leaq 64+32+8(%rsp),%rbx
+
+ movq %rdx,%r9
+ mulxq 0(%rsi),%r8,%rax
+ mulxq 8(%rsi),%r11,%r12
+ addq %rax,%r11
+ mulxq 16(%rsi),%rax,%r13
+ adcq %rax,%r12
+ adcq $0,%r13
+ mulxq 24(%rsi),%rax,%r14
+
+ movq %r8,%r15
+ imulq 32+8(%rsp),%r8
+ xorq %rbp,%rbp
+ movq %r8,%rdx
+
+ movq %rdi,8+8(%rsp)
+
+ leaq 32(%rsi),%rsi
+ adcxq %rax,%r13
+ adcxq %rbp,%r14
+
+ mulxq 0(%rcx),%rax,%r10
+ adcxq %rax,%r15
+ adoxq %r11,%r10
+ mulxq 8(%rcx),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+ mulxq 16(%rcx),%rax,%r12
+ movq 24+8(%rsp),%rdi
+ movq %r10,-32(%rbx)
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r11,-24(%rbx)
+ adcxq %rax,%r12
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r12,-16(%rbx)
+ jmp .Lmulx4x_1st
+
+.align 32
+.Lmulx4x_1st:
+ adcxq %rbp,%r15
+ mulxq 0(%rsi),%r10,%rax
+ adcxq %r14,%r10
+ mulxq 8(%rsi),%r11,%r14
+ adcxq %rax,%r11
+ mulxq 16(%rsi),%r12,%rax
+ adcxq %r14,%r12
+ mulxq 24(%rsi),%r13,%r14
+.byte 0x67,0x67
+ movq %r8,%rdx
+ adcxq %rax,%r13
+ adcxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ leaq 32(%rbx),%rbx
+
+ adoxq %r15,%r10
+ mulxq 0(%rcx),%rax,%r15
+ adcxq %rax,%r10
+ adoxq %r15,%r11
+ mulxq 8(%rcx),%rax,%r15
+ adcxq %rax,%r11
+ adoxq %r15,%r12
+ mulxq 16(%rcx),%rax,%r15
+ movq %r10,-40(%rbx)
+ adcxq %rax,%r12
+ movq %r11,-32(%rbx)
+ adoxq %r15,%r13
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r12,-24(%rbx)
+ adcxq %rax,%r13
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r13,-16(%rbx)
+
+ decq %rdi
+ jnz .Lmulx4x_1st
+
+ movq 8(%rsp),%rax
+ adcq %rbp,%r15
+ leaq (%rsi,%rax,1),%rsi
+ addq %r15,%r14
+ movq 8+8(%rsp),%rdi
+ adcq %rbp,%rbp
+ movq %r14,-8(%rbx)
+ jmp .Lmulx4x_outer
+
+.align 32
+.Lmulx4x_outer:
+ leaq 16-256(%rbx),%r10
+ pxor %xmm4,%xmm4
+.byte 0x67,0x67
+ pxor %xmm5,%xmm5
+ movdqa -128(%rdi),%xmm0
+ movdqa -112(%rdi),%xmm1
+ movdqa -96(%rdi),%xmm2
+ pand 256(%r10),%xmm0
+ movdqa -80(%rdi),%xmm3
+ pand 272(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand 288(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand 304(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%rdi),%xmm0
+ movdqa -48(%rdi),%xmm1
+ movdqa -32(%rdi),%xmm2
+ pand 320(%r10),%xmm0
+ movdqa -16(%rdi),%xmm3
+ pand 336(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand 352(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand 368(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%rdi),%xmm0
+ movdqa 16(%rdi),%xmm1
+ movdqa 32(%rdi),%xmm2
+ pand 384(%r10),%xmm0
+ movdqa 48(%rdi),%xmm3
+ pand 400(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand 416(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand 432(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%rdi),%xmm0
+ movdqa 80(%rdi),%xmm1
+ movdqa 96(%rdi),%xmm2
+ pand 448(%r10),%xmm0
+ movdqa 112(%rdi),%xmm3
+ pand 464(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand 480(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand 496(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%rdi),%rdi
+.byte 102,72,15,126,194
+
+ movq %rbp,(%rbx)
+ leaq 32(%rbx,%rax,1),%rbx
+ mulxq 0(%rsi),%r8,%r11
+ xorq %rbp,%rbp
+ movq %rdx,%r9
+ mulxq 8(%rsi),%r14,%r12
+ adoxq -32(%rbx),%r8
+ adcxq %r14,%r11
+ mulxq 16(%rsi),%r15,%r13
+ adoxq -24(%rbx),%r11
+ adcxq %r15,%r12
+ mulxq 24(%rsi),%rdx,%r14
+ adoxq -16(%rbx),%r12
+ adcxq %rdx,%r13
+ leaq (%rcx,%rax,1),%rcx
+ leaq 32(%rsi),%rsi
+ adoxq -8(%rbx),%r13
+ adcxq %rbp,%r14
+ adoxq %rbp,%r14
+
+ movq %r8,%r15
+ imulq 32+8(%rsp),%r8
+
+ movq %r8,%rdx
+ xorq %rbp,%rbp
+ movq %rdi,8+8(%rsp)
+
+ mulxq 0(%rcx),%rax,%r10
+ adcxq %rax,%r15
+ adoxq %r11,%r10
+ mulxq 8(%rcx),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+ mulxq 16(%rcx),%rax,%r12
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq 24+8(%rsp),%rdi
+ movq %r10,-32(%rbx)
+ adcxq %rax,%r12
+ movq %r11,-24(%rbx)
+ adoxq %rbp,%r15
+ movq %r12,-16(%rbx)
+ leaq 32(%rcx),%rcx
+ jmp .Lmulx4x_inner
+
+.align 32
+.Lmulx4x_inner:
+ mulxq 0(%rsi),%r10,%rax
+ adcxq %rbp,%r15
+ adoxq %r14,%r10
+ mulxq 8(%rsi),%r11,%r14
+ adcxq 0(%rbx),%r10
+ adoxq %rax,%r11
+ mulxq 16(%rsi),%r12,%rax
+ adcxq 8(%rbx),%r11
+ adoxq %r14,%r12
+ mulxq 24(%rsi),%r13,%r14
+ movq %r8,%rdx
+ adcxq 16(%rbx),%r12
+ adoxq %rax,%r13
+ adcxq 24(%rbx),%r13
+ adoxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ leaq 32(%rbx),%rbx
+ adcxq %rbp,%r14
+
+ adoxq %r15,%r10
+ mulxq 0(%rcx),%rax,%r15
+ adcxq %rax,%r10
+ adoxq %r15,%r11
+ mulxq 8(%rcx),%rax,%r15
+ adcxq %rax,%r11
+ adoxq %r15,%r12
+ mulxq 16(%rcx),%rax,%r15
+ movq %r10,-40(%rbx)
+ adcxq %rax,%r12
+ adoxq %r15,%r13
+ movq %r11,-32(%rbx)
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ leaq 32(%rcx),%rcx
+ movq %r12,-24(%rbx)
+ adcxq %rax,%r13
+ adoxq %rbp,%r15
+ movq %r13,-16(%rbx)
+
+ decq %rdi
+ jnz .Lmulx4x_inner
+
+ movq 0+8(%rsp),%rax
+ adcq %rbp,%r15
+ subq 0(%rbx),%rdi
+ movq 8+8(%rsp),%rdi
+ movq 16+8(%rsp),%r10
+ adcq %r15,%r14
+ leaq (%rsi,%rax,1),%rsi
+ adcq %rbp,%rbp
+ movq %r14,-8(%rbx)
+
+ cmpq %r10,%rdi
+ jb .Lmulx4x_outer
+
+ movq -8(%rcx),%r10
+ movq %rbp,%r8
+ movq (%rcx,%rax,1),%r12
+ leaq (%rcx,%rax,1),%rbp
+ movq %rax,%rcx
+ leaq (%rbx,%rax,1),%rdi
+ xorl %eax,%eax
+ xorq %r15,%r15
+ subq %r14,%r10
+ adcq %r15,%r15
+ orq %r15,%r8
+ sarq $3+2,%rcx
+ subq %r8,%rax
+ movq 56+8(%rsp),%rdx
+ decq %r12
+ movq 8(%rbp),%r13
+ xorq %r8,%r8
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp .Lsqrx4x_sub_entry
+.cfi_endproc
+.size mulx4x_internal,.-mulx4x_internal
+.type bn_powerx5,@function
+.align 32
+bn_powerx5:
+.cfi_startproc
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+.Lpowerx5_enter:
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+.Lpowerx5_prologue:
+
+ shll $3,%r9d
+ leaq (%r9,%r9,2),%r10
+ negq %r9
+ movq (%r8),%r8
+
+
+
+
+
+
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
+ subq %rdi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb .Lpwrx_sp_alt
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
+ jmp .Lpwrx_sp_done
+
+.align 32
+.Lpwrx_sp_alt:
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rbp,%r9,2),%rbp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rbp
+.Lpwrx_sp_done:
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lpwrx_page_walk
+ jmp .Lpwrx_page_walk_done
+
+.Lpwrx_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lpwrx_page_walk
+.Lpwrx_page_walk_done:
+
+ movq %r9,%r10
+ negq %r9
+
+
+
+
+
+
+
+
+
+
+
+
+ pxor %xmm0,%xmm0
+.byte 102,72,15,110,207
+.byte 102,72,15,110,209
+.byte 102,73,15,110,218
+.byte 102,72,15,110,226
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lpowerx5_body:
+
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+
+ movq %r10,%r9
+ movq %rsi,%rdi
+.byte 102,72,15,126,209
+.byte 102,72,15,126,226
+ movq 40(%rsp),%rax
+
+ call mulx4x_internal
+
+ movq 40(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq $1,%rax
+
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpowerx5_epilogue:
+ ret
+.cfi_endproc
+.size bn_powerx5,.-bn_powerx5
+
+.globl bn_sqrx8x_internal
+.hidden bn_sqrx8x_internal
+.hidden bn_sqrx8x_internal
+.type bn_sqrx8x_internal,@function
+.align 32
+bn_sqrx8x_internal:
+__bn_sqrx8x_internal:
+.cfi_startproc
+_CET_ENDBR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ leaq 48+8(%rsp),%rdi
+ leaq (%rsi,%r9,1),%rbp
+ movq %r9,0+8(%rsp)
+ movq %rbp,8+8(%rsp)
+ jmp .Lsqr8x_zero_start
+
+.align 32
+.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
+.Lsqrx8x_zero:
+.byte 0x3e
+ movdqa %xmm0,0(%rdi)
+ movdqa %xmm0,16(%rdi)
+ movdqa %xmm0,32(%rdi)
+ movdqa %xmm0,48(%rdi)
+.Lsqr8x_zero_start:
+ movdqa %xmm0,64(%rdi)
+ movdqa %xmm0,80(%rdi)
+ movdqa %xmm0,96(%rdi)
+ movdqa %xmm0,112(%rdi)
+ leaq 128(%rdi),%rdi
+ subq $64,%r9
+ jnz .Lsqrx8x_zero
+
+ movq 0(%rsi),%rdx
+
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %r12,%r12
+ xorq %r13,%r13
+ xorq %r14,%r14
+ xorq %r15,%r15
+ leaq 48+8(%rsp),%rdi
+ xorq %rbp,%rbp
+ jmp .Lsqrx8x_outer_loop
+
+.align 32
+.Lsqrx8x_outer_loop:
+ mulxq 8(%rsi),%r8,%rax
+ adcxq %r9,%r8
+ adoxq %rax,%r10
+ mulxq 16(%rsi),%r9,%rax
+ adcxq %r10,%r9
+ adoxq %rax,%r11
+.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
+ adcxq %r11,%r10
+ adoxq %rax,%r12
+.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
+ adcxq %r12,%r11
+ adoxq %rax,%r13
+ mulxq 40(%rsi),%r12,%rax
+ adcxq %r13,%r12
+ adoxq %rax,%r14
+ mulxq 48(%rsi),%r13,%rax
+ adcxq %r14,%r13
+ adoxq %r15,%rax
+ mulxq 56(%rsi),%r14,%r15
+ movq 8(%rsi),%rdx
+ adcxq %rax,%r14
+ adoxq %rbp,%r15
+ adcq 64(%rdi),%r15
+ movq %r8,8(%rdi)
+ movq %r9,16(%rdi)
+ sbbq %rcx,%rcx
+ xorq %rbp,%rbp
+
+
+ mulxq 16(%rsi),%r8,%rbx
+ mulxq 24(%rsi),%r9,%rax
+ adcxq %r10,%r8
+ adoxq %rbx,%r9
+ mulxq 32(%rsi),%r10,%rbx
+ adcxq %r11,%r9
+ adoxq %rax,%r10
+.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
+ adcxq %r12,%r10
+ adoxq %rbx,%r11
+.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
+ adcxq %r13,%r11
+ adoxq %r14,%r12
+.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
+ movq 16(%rsi),%rdx
+ adcxq %rax,%r12
+ adoxq %rbx,%r13
+ adcxq %r15,%r13
+ adoxq %rbp,%r14
+ adcxq %rbp,%r14
+
+ movq %r8,24(%rdi)
+ movq %r9,32(%rdi)
+
+ mulxq 24(%rsi),%r8,%rbx
+ mulxq 32(%rsi),%r9,%rax
+ adcxq %r10,%r8
+ adoxq %rbx,%r9
+ mulxq 40(%rsi),%r10,%rbx
+ adcxq %r11,%r9
+ adoxq %rax,%r10
+.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
+ adcxq %r12,%r10
+ adoxq %r13,%r11
+.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
+.byte 0x3e
+ movq 24(%rsi),%rdx
+ adcxq %rbx,%r11
+ adoxq %rax,%r12
+ adcxq %r14,%r12
+ movq %r8,40(%rdi)
+ movq %r9,48(%rdi)
+ mulxq 32(%rsi),%r8,%rax
+ adoxq %rbp,%r13
+ adcxq %rbp,%r13
+
+ mulxq 40(%rsi),%r9,%rbx
+ adcxq %r10,%r8
+ adoxq %rax,%r9
+ mulxq 48(%rsi),%r10,%rax
+ adcxq %r11,%r9
+ adoxq %r12,%r10
+ mulxq 56(%rsi),%r11,%r12
+ movq 32(%rsi),%rdx
+ movq 40(%rsi),%r14
+ adcxq %rbx,%r10
+ adoxq %rax,%r11
+ movq 48(%rsi),%r15
+ adcxq %r13,%r11
+ adoxq %rbp,%r12
+ adcxq %rbp,%r12
+
+ movq %r8,56(%rdi)
+ movq %r9,64(%rdi)
+
+ mulxq %r14,%r9,%rax
+ movq 56(%rsi),%r8
+ adcxq %r10,%r9
+ mulxq %r15,%r10,%rbx
+ adoxq %rax,%r10
+ adcxq %r11,%r10
+ mulxq %r8,%r11,%rax
+ movq %r14,%rdx
+ adoxq %rbx,%r11
+ adcxq %r12,%r11
+
+ adcxq %rbp,%rax
+
+ mulxq %r15,%r14,%rbx
+ mulxq %r8,%r12,%r13
+ movq %r15,%rdx
+ leaq 64(%rsi),%rsi
+ adcxq %r14,%r11
+ adoxq %rbx,%r12
+ adcxq %rax,%r12
+ adoxq %rbp,%r13
+
+.byte 0x67,0x67
+ mulxq %r8,%r8,%r14
+ adcxq %r8,%r13
+ adcxq %rbp,%r14
+
+ cmpq 8+8(%rsp),%rsi
+ je .Lsqrx8x_outer_break
+
+ negq %rcx
+ movq $-8,%rcx
+ movq %rbp,%r15
+ movq 64(%rdi),%r8
+ adcxq 72(%rdi),%r9
+ adcxq 80(%rdi),%r10
+ adcxq 88(%rdi),%r11
+ adcq 96(%rdi),%r12
+ adcq 104(%rdi),%r13
+ adcq 112(%rdi),%r14
+ adcq 120(%rdi),%r15
+ leaq (%rsi),%rbp
+ leaq 128(%rdi),%rdi
+ sbbq %rax,%rax
+
+ movq -64(%rsi),%rdx
+ movq %rax,16+8(%rsp)
+ movq %rdi,24+8(%rsp)
+
+
+ xorl %eax,%eax
+ jmp .Lsqrx8x_loop
+
+.align 32
+.Lsqrx8x_loop:
+ movq %r8,%rbx
+ mulxq 0(%rbp),%rax,%r8
+ adcxq %rax,%rbx
+ adoxq %r9,%r8
+
+ mulxq 8(%rbp),%rax,%r9
+ adcxq %rax,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rbp),%rax,%r10
+ adcxq %rax,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rbp),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+
+.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+
+ mulxq 40(%rbp),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+ mulxq 48(%rbp),%rax,%r14
+ movq %rbx,(%rdi,%rcx,8)
+ movl $0,%ebx
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
+ movq 8(%rsi,%rcx,8),%rdx
+ adcxq %rax,%r14
+ adoxq %rbx,%r15
+ adcxq %rbx,%r15
+
+.byte 0x67
+ incq %rcx
+ jnz .Lsqrx8x_loop
+
+ leaq 64(%rbp),%rbp
+ movq $-8,%rcx
+ cmpq 8+8(%rsp),%rbp
+ je .Lsqrx8x_break
+
+ subq 16+8(%rsp),%rbx
+.byte 0x66
+ movq -64(%rsi),%rdx
+ adcxq 0(%rdi),%r8
+ adcxq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ leaq 64(%rdi),%rdi
+.byte 0x67
+ sbbq %rax,%rax
+ xorl %ebx,%ebx
+ movq %rax,16+8(%rsp)
+ jmp .Lsqrx8x_loop
+
+.align 32
+.Lsqrx8x_break:
+ xorq %rbp,%rbp
+ subq 16+8(%rsp),%rbx
+ adcxq %rbp,%r8
+ movq 24+8(%rsp),%rcx
+ adcxq %rbp,%r9
+ movq 0(%rsi),%rdx
+ adcq $0,%r10
+ movq %r8,0(%rdi)
+ adcq $0,%r11
+ adcq $0,%r12
+ adcq $0,%r13
+ adcq $0,%r14
+ adcq $0,%r15
+ cmpq %rcx,%rdi
+ je .Lsqrx8x_outer_loop
+
+ movq %r9,8(%rdi)
+ movq 8(%rcx),%r9
+ movq %r10,16(%rdi)
+ movq 16(%rcx),%r10
+ movq %r11,24(%rdi)
+ movq 24(%rcx),%r11
+ movq %r12,32(%rdi)
+ movq 32(%rcx),%r12
+ movq %r13,40(%rdi)
+ movq 40(%rcx),%r13
+ movq %r14,48(%rdi)
+ movq 48(%rcx),%r14
+ movq %r15,56(%rdi)
+ movq 56(%rcx),%r15
+ movq %rcx,%rdi
+ jmp .Lsqrx8x_outer_loop
+
+.align 32
+.Lsqrx8x_outer_break:
+ movq %r9,72(%rdi)
+.byte 102,72,15,126,217
+ movq %r10,80(%rdi)
+ movq %r11,88(%rdi)
+ movq %r12,96(%rdi)
+ movq %r13,104(%rdi)
+ movq %r14,112(%rdi)
+ leaq 48+8(%rsp),%rdi
+ movq (%rsi,%rcx,1),%rdx
+
+ movq 8(%rdi),%r11
+ xorq %r10,%r10
+ movq 0+8(%rsp),%r9
+ adoxq %r11,%r11
+ movq 16(%rdi),%r12
+ movq 24(%rdi),%r13
+
+
+.align 32
+.Lsqrx4x_shift_n_add:
+ mulxq %rdx,%rax,%rbx
+ adoxq %r12,%r12
+ adcxq %r10,%rax
+.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
+.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
+ adoxq %r13,%r13
+ adcxq %r11,%rbx
+ movq 40(%rdi),%r11
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+
+ mulxq %rdx,%rax,%rbx
+ adoxq %r10,%r10
+ adcxq %r12,%rax
+ movq 16(%rsi,%rcx,1),%rdx
+ movq 48(%rdi),%r12
+ adoxq %r11,%r11
+ adcxq %r13,%rbx
+ movq 56(%rdi),%r13
+ movq %rax,16(%rdi)
+ movq %rbx,24(%rdi)
+
+ mulxq %rdx,%rax,%rbx
+ adoxq %r12,%r12
+ adcxq %r10,%rax
+ movq 24(%rsi,%rcx,1),%rdx
+ leaq 32(%rcx),%rcx
+ movq 64(%rdi),%r10
+ adoxq %r13,%r13
+ adcxq %r11,%rbx
+ movq 72(%rdi),%r11
+ movq %rax,32(%rdi)
+ movq %rbx,40(%rdi)
+
+ mulxq %rdx,%rax,%rbx
+ adoxq %r10,%r10
+ adcxq %r12,%rax
+ jrcxz .Lsqrx4x_shift_n_add_break
+.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
+ adoxq %r11,%r11
+ adcxq %r13,%rbx
+ movq 80(%rdi),%r12
+ movq 88(%rdi),%r13
+ movq %rax,48(%rdi)
+ movq %rbx,56(%rdi)
+ leaq 64(%rdi),%rdi
+ nop
+ jmp .Lsqrx4x_shift_n_add
+
+.align 32
+.Lsqrx4x_shift_n_add_break:
+ adcxq %r13,%rbx
+ movq %rax,48(%rdi)
+ movq %rbx,56(%rdi)
+ leaq 64(%rdi),%rdi
+.byte 102,72,15,126,213
+__bn_sqrx8x_reduction:
+ xorl %eax,%eax
+ movq 32+8(%rsp),%rbx
+ movq 48+8(%rsp),%rdx
+ leaq -64(%rbp,%r9,1),%rcx
+
+ movq %rcx,0+8(%rsp)
+ movq %rdi,8+8(%rsp)
+
+ leaq 48+8(%rsp),%rdi
+ jmp .Lsqrx8x_reduction_loop
+
+.align 32
+.Lsqrx8x_reduction_loop:
+ movq 8(%rdi),%r9
+ movq 16(%rdi),%r10
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r12
+ movq %rdx,%r8
+ imulq %rbx,%rdx
+ movq 40(%rdi),%r13
+ movq 48(%rdi),%r14
+ movq 56(%rdi),%r15
+ movq %rax,24+8(%rsp)
+
+ leaq 64(%rdi),%rdi
+ xorq %rsi,%rsi
+ movq $-8,%rcx
+ jmp .Lsqrx8x_reduce
+
+.align 32
+.Lsqrx8x_reduce:
+ movq %r8,%rbx
+ mulxq 0(%rbp),%rax,%r8
+ adcxq %rbx,%rax
+ adoxq %r9,%r8
+
+ mulxq 8(%rbp),%rbx,%r9
+ adcxq %rbx,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rbp),%rbx,%r10
+ adcxq %rbx,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rbp),%rbx,%r11
+ adcxq %rbx,%r10
+ adoxq %r12,%r11
+
+.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
+ movq %rdx,%rax
+ movq %r8,%rdx
+ adcxq %rbx,%r11
+ adoxq %r13,%r12
+
+ mulxq 32+8(%rsp),%rbx,%rdx
+ movq %rax,%rdx
+ movq %rax,64+48+8(%rsp,%rcx,8)
+
+ mulxq 40(%rbp),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+ mulxq 48(%rbp),%rax,%r14
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+ mulxq 56(%rbp),%rax,%r15
+ movq %rbx,%rdx
+ adcxq %rax,%r14
+ adoxq %rsi,%r15
+ adcxq %rsi,%r15
+
+.byte 0x67,0x67,0x67
+ incq %rcx
+ jnz .Lsqrx8x_reduce
+
+ movq %rsi,%rax
+ cmpq 0+8(%rsp),%rbp
+ jae .Lsqrx8x_no_tail
+
+ movq 48+8(%rsp),%rdx
+ addq 0(%rdi),%r8
+ leaq 64(%rbp),%rbp
+ movq $-8,%rcx
+ adcxq 8(%rdi),%r9
+ adcxq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ leaq 64(%rdi),%rdi
+ sbbq %rax,%rax
+
+ xorq %rsi,%rsi
+ movq %rax,16+8(%rsp)
+ jmp .Lsqrx8x_tail
+
+.align 32
+.Lsqrx8x_tail:
+ movq %r8,%rbx
+ mulxq 0(%rbp),%rax,%r8
+ adcxq %rax,%rbx
+ adoxq %r9,%r8
+
+ mulxq 8(%rbp),%rax,%r9
+ adcxq %rax,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rbp),%rax,%r10
+ adcxq %rax,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rbp),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+
+.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+
+ mulxq 40(%rbp),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+ mulxq 48(%rbp),%rax,%r14
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+ mulxq 56(%rbp),%rax,%r15
+ movq 72+48+8(%rsp,%rcx,8),%rdx
+ adcxq %rax,%r14
+ adoxq %rsi,%r15
+ movq %rbx,(%rdi,%rcx,8)
+ movq %r8,%rbx
+ adcxq %rsi,%r15
+
+ incq %rcx
+ jnz .Lsqrx8x_tail
+
+ cmpq 0+8(%rsp),%rbp
+ jae .Lsqrx8x_tail_done
+
+ subq 16+8(%rsp),%rsi
+ movq 48+8(%rsp),%rdx
+ leaq 64(%rbp),%rbp
+ adcq 0(%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ leaq 64(%rdi),%rdi
+ sbbq %rax,%rax
+ subq $8,%rcx
+
+ xorq %rsi,%rsi
+ movq %rax,16+8(%rsp)
+ jmp .Lsqrx8x_tail
+
+.align 32
+.Lsqrx8x_tail_done:
+ xorq %rax,%rax
+ addq 24+8(%rsp),%r8
+ adcq $0,%r9
+ adcq $0,%r10
+ adcq $0,%r11
+ adcq $0,%r12
+ adcq $0,%r13
+ adcq $0,%r14
+ adcq $0,%r15
+ adcq $0,%rax
+
+ subq 16+8(%rsp),%rsi
+.Lsqrx8x_no_tail:
+ adcq 0(%rdi),%r8
+.byte 102,72,15,126,217
+ adcq 8(%rdi),%r9
+ movq 56(%rbp),%rsi
+.byte 102,72,15,126,213
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ adcq $0,%rax
+
+ movq 32+8(%rsp),%rbx
+ movq 64(%rdi,%rcx,1),%rdx
+
+ movq %r8,0(%rdi)
+ leaq 64(%rdi),%r8
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r12,32(%rdi)
+ movq %r13,40(%rdi)
+ movq %r14,48(%rdi)
+ movq %r15,56(%rdi)
+
+ leaq 64(%rdi,%rcx,1),%rdi
+ cmpq 8+8(%rsp),%r8
+ jb .Lsqrx8x_reduction_loop
+ ret
+.cfi_endproc
+.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
+.align 32
+.type __bn_postx4x_internal,@function
+__bn_postx4x_internal:
+.cfi_startproc
+ movq 0(%rbp),%r12
+ movq %rcx,%r10
+ movq %rcx,%r9
+ negq %rax
+ sarq $3+2,%rcx
+
+.byte 102,72,15,126,202
+.byte 102,72,15,126,206
+ decq %r12
+ movq 8(%rbp),%r13
+ xorq %r8,%r8
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp .Lsqrx4x_sub_entry
+
+.align 16
+.Lsqrx4x_sub:
+ movq 0(%rbp),%r12
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+.Lsqrx4x_sub_entry:
+ andnq %rax,%r12,%r12
+ leaq 32(%rbp),%rbp
+ andnq %rax,%r13,%r13
+ andnq %rax,%r14,%r14
+ andnq %rax,%r15,%r15
+
+ negq %r8
+ adcq 0(%rdi),%r12
+ adcq 8(%rdi),%r13
+ adcq 16(%rdi),%r14
+ adcq 24(%rdi),%r15
+ movq %r12,0(%rdx)
+ leaq 32(%rdi),%rdi
+ movq %r13,8(%rdx)
+ sbbq %r8,%r8
+ movq %r14,16(%rdx)
+ movq %r15,24(%rdx)
+ leaq 32(%rdx),%rdx
+
+ incq %rcx
+ jnz .Lsqrx4x_sub
+
+ negq %r9
+
+ ret
+.cfi_endproc
+.size __bn_postx4x_internal,.-__bn_postx4x_internal
+.globl bn_scatter5
+.hidden bn_scatter5
+.type bn_scatter5,@function
+.align 16
+bn_scatter5:
+.cfi_startproc
+_CET_ENDBR
+ cmpl $0,%esi
+ jz .Lscatter_epilogue
+
+
+
+
+
+
+
+
+
+ leaq (%rdx,%rcx,8),%rdx
+.Lscatter:
+ movq (%rdi),%rax
+ leaq 8(%rdi),%rdi
+ movq %rax,(%rdx)
+ leaq 256(%rdx),%rdx
+ subl $1,%esi
+ jnz .Lscatter
+.Lscatter_epilogue:
+ ret
+.cfi_endproc
+.size bn_scatter5,.-bn_scatter5
+
+.globl bn_gather5
+.hidden bn_gather5
+.type bn_gather5,@function
+.align 32
+bn_gather5:
+.cfi_startproc
+.LSEH_begin_bn_gather5:
+_CET_ENDBR
+
+.byte 0x4c,0x8d,0x14,0x24
+.cfi_def_cfa_register %r10
+.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00
+ leaq .Linc(%rip),%rax
+ andq $-16,%rsp
+
+ movd %ecx,%xmm5
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 128(%rdx),%r11
+ leaq 128(%rsp),%rax
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,-128(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,-112(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,-96(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,-80(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,-64(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,-48(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,-32(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,-16(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,16(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,32(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,48(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,64(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,80(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,96(%rax)
+ movdqa %xmm4,%xmm2
+ movdqa %xmm3,112(%rax)
+ jmp .Lgather
+
+.align 32
+.Lgather:
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r11),%xmm0
+ movdqa -112(%r11),%xmm1
+ movdqa -96(%r11),%xmm2
+ pand -128(%rax),%xmm0
+ movdqa -80(%r11),%xmm3
+ pand -112(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r11),%xmm0
+ movdqa -48(%r11),%xmm1
+ movdqa -32(%r11),%xmm2
+ pand -64(%rax),%xmm0
+ movdqa -16(%r11),%xmm3
+ pand -48(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r11),%xmm0
+ movdqa 16(%r11),%xmm1
+ movdqa 32(%r11),%xmm2
+ pand 0(%rax),%xmm0
+ movdqa 48(%r11),%xmm3
+ pand 16(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r11),%xmm0
+ movdqa 80(%r11),%xmm1
+ movdqa 96(%r11),%xmm2
+ pand 64(%rax),%xmm0
+ movdqa 112(%r11),%xmm3
+ pand 80(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ leaq 256(%r11),%r11
+
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ movq %xmm0,(%rdi)
+ leaq 8(%rdi),%rdi
+ subl $1,%esi
+ jnz .Lgather
+
+ leaq (%r10),%rsp
+.cfi_def_cfa_register %rsp
+ ret
+.LSEH_end_bn_gather5:
+.cfi_endproc
+.size bn_gather5,.-bn_gather5
+.section .rodata
+.align 64
+.Linc:
+.long 0,0, 1,1
+.long 2,2, 2,2
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.text
+#endif
diff --git a/gen/bcm/x86_64-mont5-win.asm b/gen/bcm/x86_64-mont5-win.asm
new file mode 100644
index 0000000..46aae51
--- /dev/null
+++ b/gen/bcm/x86_64-mont5-win.asm
@@ -0,0 +1,3864 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .text code align=64
+
+
+EXTERN OPENSSL_ia32cap_P
+
+global bn_mul_mont_gather5
+
+ALIGN 64
+bn_mul_mont_gather5:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_bn_mul_mont_gather5:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+ mov r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+ mov r9d,r9d
+ mov rax,rsp
+
+ test r9d,7
+ jnz NEAR $L$mul_enter
+ lea r11,[OPENSSL_ia32cap_P]
+ mov r11d,DWORD[8+r11]
+ jmp NEAR $L$mul4x_enter
+
+ALIGN 16
+$L$mul_enter:
+ movd xmm5,DWORD[56+rsp]
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+
+ neg r9
+ mov r11,rsp
+ lea r10,[((-280))+r9*8+rsp]
+ neg r9
+ and r10,-1024
+
+
+
+
+
+
+
+
+
+ sub r11,r10
+ and r11,-4096
+ lea rsp,[r11*1+r10]
+ mov r11,QWORD[rsp]
+ cmp rsp,r10
+ ja NEAR $L$mul_page_walk
+ jmp NEAR $L$mul_page_walk_done
+
+$L$mul_page_walk:
+ lea rsp,[((-4096))+rsp]
+ mov r11,QWORD[rsp]
+ cmp rsp,r10
+ ja NEAR $L$mul_page_walk
+$L$mul_page_walk_done:
+
+ lea r10,[$L$inc]
+ mov QWORD[8+r9*8+rsp],rax
+
+$L$mul_body:
+
+ lea r12,[128+rdx]
+ movdqa xmm0,XMMWORD[r10]
+ movdqa xmm1,XMMWORD[16+r10]
+ lea r10,[((24-112))+r9*8+rsp]
+ and r10,-16
+
+ pshufd xmm5,xmm5,0
+ movdqa xmm4,xmm1
+ movdqa xmm2,xmm1
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ DB 0x67
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD[112+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD[128+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD[144+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD[160+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD[176+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD[192+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD[208+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD[224+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD[240+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD[256+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD[272+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD[288+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD[304+r10],xmm0
+
+ paddd xmm3,xmm2
+ DB 0x67
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD[320+r10],xmm1
+
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD[336+r10],xmm2
+ pand xmm0,XMMWORD[64+r12]
+
+ pand xmm1,XMMWORD[80+r12]
+ pand xmm2,XMMWORD[96+r12]
+ movdqa XMMWORD[352+r10],xmm3
+ pand xmm3,XMMWORD[112+r12]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD[((-128))+r12]
+ movdqa xmm5,XMMWORD[((-112))+r12]
+ movdqa xmm2,XMMWORD[((-96))+r12]
+ pand xmm4,XMMWORD[112+r10]
+ movdqa xmm3,XMMWORD[((-80))+r12]
+ pand xmm5,XMMWORD[128+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD[144+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD[160+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD[((-64))+r12]
+ movdqa xmm5,XMMWORD[((-48))+r12]
+ movdqa xmm2,XMMWORD[((-32))+r12]
+ pand xmm4,XMMWORD[176+r10]
+ movdqa xmm3,XMMWORD[((-16))+r12]
+ pand xmm5,XMMWORD[192+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD[208+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD[224+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD[r12]
+ movdqa xmm5,XMMWORD[16+r12]
+ movdqa xmm2,XMMWORD[32+r12]
+ pand xmm4,XMMWORD[240+r10]
+ movdqa xmm3,XMMWORD[48+r12]
+ pand xmm5,XMMWORD[256+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD[272+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD[288+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ por xmm0,xmm1
+
+ pshufd xmm1,xmm0,0x4e
+ por xmm0,xmm1
+ lea r12,[256+r12]
+DB 102,72,15,126,195
+
+ mov r8,QWORD[r8]
+ mov rax,QWORD[rsi]
+
+ xor r14,r14
+ xor r15,r15
+
+ mov rbp,r8
+ mul rbx
+ mov r10,rax
+ mov rax,QWORD[rcx]
+
+ imul rbp,r10
+ mov r11,rdx
+
+ mul rbp
+ add r10,rax
+ mov rax,QWORD[8+rsi]
+ adc rdx,0
+ mov r13,rdx
+
+ lea r15,[1+r15]
+ jmp NEAR $L$1st_enter
+
+ALIGN 16
+$L$1st:
+ add r13,rax
+ mov rax,QWORD[r15*8+rsi]
+ adc rdx,0
+ add r13,r11
+ mov r11,r10
+ adc rdx,0
+ mov QWORD[((-16))+r15*8+rsp],r13
+ mov r13,rdx
+
+$L$1st_enter:
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[r15*8+rcx]
+ adc rdx,0
+ lea r15,[1+r15]
+ mov r10,rdx
+
+ mul rbp
+ cmp r15,r9
+ jne NEAR $L$1st
+
+
+ add r13,rax
+ adc rdx,0
+ add r13,r11
+ adc rdx,0
+ mov QWORD[((-16))+r9*8+rsp],r13
+ mov r13,rdx
+ mov r11,r10
+
+ xor rdx,rdx
+ add r13,r11
+ adc rdx,0
+ mov QWORD[((-8))+r9*8+rsp],r13
+ mov QWORD[r9*8+rsp],rdx
+
+ lea r14,[1+r14]
+ jmp NEAR $L$outer
+ALIGN 16
+$L$outer:
+ lea rdx,[((24+128))+r9*8+rsp]
+ and rdx,-16
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ movdqa xmm0,XMMWORD[((-128))+r12]
+ movdqa xmm1,XMMWORD[((-112))+r12]
+ movdqa xmm2,XMMWORD[((-96))+r12]
+ movdqa xmm3,XMMWORD[((-80))+r12]
+ pand xmm0,XMMWORD[((-128))+rdx]
+ pand xmm1,XMMWORD[((-112))+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD[((-96))+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD[((-80))+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD[((-64))+r12]
+ movdqa xmm1,XMMWORD[((-48))+r12]
+ movdqa xmm2,XMMWORD[((-32))+r12]
+ movdqa xmm3,XMMWORD[((-16))+r12]
+ pand xmm0,XMMWORD[((-64))+rdx]
+ pand xmm1,XMMWORD[((-48))+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD[((-32))+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD[((-16))+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD[r12]
+ movdqa xmm1,XMMWORD[16+r12]
+ movdqa xmm2,XMMWORD[32+r12]
+ movdqa xmm3,XMMWORD[48+r12]
+ pand xmm0,XMMWORD[rdx]
+ pand xmm1,XMMWORD[16+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD[32+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD[48+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD[64+r12]
+ movdqa xmm1,XMMWORD[80+r12]
+ movdqa xmm2,XMMWORD[96+r12]
+ movdqa xmm3,XMMWORD[112+r12]
+ pand xmm0,XMMWORD[64+rdx]
+ pand xmm1,XMMWORD[80+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD[96+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD[112+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ por xmm4,xmm5
+
+ pshufd xmm0,xmm4,0x4e
+ por xmm0,xmm4
+ lea r12,[256+r12]
+
+ mov rax,QWORD[rsi]
+DB 102,72,15,126,195
+
+ xor r15,r15
+ mov rbp,r8
+ mov r10,QWORD[rsp]
+
+ mul rbx
+ add r10,rax
+ mov rax,QWORD[rcx]
+ adc rdx,0
+
+ imul rbp,r10
+ mov r11,rdx
+
+ mul rbp
+ add r10,rax
+ mov rax,QWORD[8+rsi]
+ adc rdx,0
+ mov r10,QWORD[8+rsp]
+ mov r13,rdx
+
+ lea r15,[1+r15]
+ jmp NEAR $L$inner_enter
+
+ALIGN 16
+$L$inner:
+ add r13,rax
+ mov rax,QWORD[r15*8+rsi]
+ adc rdx,0
+ add r13,r10
+ mov r10,QWORD[r15*8+rsp]
+ adc rdx,0
+ mov QWORD[((-16))+r15*8+rsp],r13
+ mov r13,rdx
+
+$L$inner_enter:
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[r15*8+rcx]
+ adc rdx,0
+ add r10,r11
+ mov r11,rdx
+ adc r11,0
+ lea r15,[1+r15]
+
+ mul rbp
+ cmp r15,r9
+ jne NEAR $L$inner
+
+ add r13,rax
+ adc rdx,0
+ add r13,r10
+ mov r10,QWORD[r9*8+rsp]
+ adc rdx,0
+ mov QWORD[((-16))+r9*8+rsp],r13
+ mov r13,rdx
+
+ xor rdx,rdx
+ add r13,r11
+ adc rdx,0
+ add r13,r10
+ adc rdx,0
+ mov QWORD[((-8))+r9*8+rsp],r13
+ mov QWORD[r9*8+rsp],rdx
+
+ lea r14,[1+r14]
+ cmp r14,r9
+ jb NEAR $L$outer
+
+ xor r14,r14
+ mov rax,QWORD[rsp]
+ lea rsi,[rsp]
+ mov r15,r9
+ jmp NEAR $L$sub
+ALIGN 16
+$L$sub: sbb rax,QWORD[r14*8+rcx]
+ mov QWORD[r14*8+rdi],rax
+ mov rax,QWORD[8+r14*8+rsi]
+ lea r14,[1+r14]
+ dec r15
+ jnz NEAR $L$sub
+
+ sbb rax,0
+ mov rbx,-1
+ xor rbx,rax
+ xor r14,r14
+ mov r15,r9
+
+$L$copy:
+ mov rcx,QWORD[r14*8+rdi]
+ mov rdx,QWORD[r14*8+rsp]
+ and rcx,rbx
+ and rdx,rax
+ mov QWORD[r14*8+rsp],r14
+ or rdx,rcx
+ mov QWORD[r14*8+rdi],rdx
+ lea r14,[1+r14]
+ sub r15,1
+ jnz NEAR $L$copy
+
+ mov rsi,QWORD[8+r9*8+rsp]
+
+ mov rax,1
+
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$mul_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_bn_mul_mont_gather5:
+
+ALIGN 32
+bn_mul4x_mont_gather5:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_bn_mul4x_mont_gather5:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+ mov r9,QWORD[48+rsp]
+
+
+
+ DB 0x67
+ mov rax,rsp
+
+$L$mul4x_enter:
+ and r11d,0x80108
+ cmp r11d,0x80108
+ je NEAR $L$mulx4x_enter
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+$L$mul4x_prologue:
+
+ DB 0x67
+ shl r9d,3
+ lea r10,[r9*2+r9]
+ neg r9
+
+
+
+
+
+
+
+
+
+
+ lea r11,[((-320))+r9*2+rsp]
+ mov rbp,rsp
+ sub r11,rdi
+ and r11,4095
+ cmp r10,r11
+ jb NEAR $L$mul4xsp_alt
+ sub rbp,r11
+ lea rbp,[((-320))+r9*2+rbp]
+ jmp NEAR $L$mul4xsp_done
+
+ALIGN 32
+$L$mul4xsp_alt:
+ lea r10,[((4096-320))+r9*2]
+ lea rbp,[((-320))+r9*2+rbp]
+ sub r11,r10
+ mov r10,0
+ cmovc r11,r10
+ sub rbp,r11
+$L$mul4xsp_done:
+ and rbp,-64
+ mov r11,rsp
+ sub r11,rbp
+ and r11,-4096
+ lea rsp,[rbp*1+r11]
+ mov r10,QWORD[rsp]
+ cmp rsp,rbp
+ ja NEAR $L$mul4x_page_walk
+ jmp NEAR $L$mul4x_page_walk_done
+
+$L$mul4x_page_walk:
+ lea rsp,[((-4096))+rsp]
+ mov r10,QWORD[rsp]
+ cmp rsp,rbp
+ ja NEAR $L$mul4x_page_walk
+$L$mul4x_page_walk_done:
+
+ neg r9
+
+ mov QWORD[40+rsp],rax
+
+$L$mul4x_body:
+
+ call mul4x_internal
+
+ mov rsi,QWORD[40+rsp]
+
+ mov rax,1
+
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$mul4x_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_bn_mul4x_mont_gather5:
+
+
+ALIGN 32
+mul4x_internal:
+
+ shl r9,5
+ movd xmm5,DWORD[56+rax]
+ lea rax,[$L$inc]
+ lea r13,[128+r9*1+rdx]
+ shr r9,5
+ movdqa xmm0,XMMWORD[rax]
+ movdqa xmm1,XMMWORD[16+rax]
+ lea r10,[((88-112))+r9*1+rsp]
+ lea r12,[128+rdx]
+
+ pshufd xmm5,xmm5,0
+ movdqa xmm4,xmm1
+ DB 0x67,0x67
+ movdqa xmm2,xmm1
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ DB 0x67
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD[112+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD[128+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD[144+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD[160+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD[176+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD[192+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD[208+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD[224+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD[240+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD[256+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD[272+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD[288+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD[304+r10],xmm0
+
+ paddd xmm3,xmm2
+ DB 0x67
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD[320+r10],xmm1
+
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD[336+r10],xmm2
+ pand xmm0,XMMWORD[64+r12]
+
+ pand xmm1,XMMWORD[80+r12]
+ pand xmm2,XMMWORD[96+r12]
+ movdqa XMMWORD[352+r10],xmm3
+ pand xmm3,XMMWORD[112+r12]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD[((-128))+r12]
+ movdqa xmm5,XMMWORD[((-112))+r12]
+ movdqa xmm2,XMMWORD[((-96))+r12]
+ pand xmm4,XMMWORD[112+r10]
+ movdqa xmm3,XMMWORD[((-80))+r12]
+ pand xmm5,XMMWORD[128+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD[144+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD[160+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD[((-64))+r12]
+ movdqa xmm5,XMMWORD[((-48))+r12]
+ movdqa xmm2,XMMWORD[((-32))+r12]
+ pand xmm4,XMMWORD[176+r10]
+ movdqa xmm3,XMMWORD[((-16))+r12]
+ pand xmm5,XMMWORD[192+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD[208+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD[224+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD[r12]
+ movdqa xmm5,XMMWORD[16+r12]
+ movdqa xmm2,XMMWORD[32+r12]
+ pand xmm4,XMMWORD[240+r10]
+ movdqa xmm3,XMMWORD[48+r12]
+ pand xmm5,XMMWORD[256+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD[272+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD[288+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ por xmm0,xmm1
+
+ pshufd xmm1,xmm0,0x4e
+ por xmm0,xmm1
+ lea r12,[256+r12]
+DB 102,72,15,126,195
+
+ mov QWORD[((16+8))+rsp],r13
+ mov QWORD[((56+8))+rsp],rdi
+
+ mov r8,QWORD[r8]
+ mov rax,QWORD[rsi]
+ lea rsi,[r9*1+rsi]
+ neg r9
+
+ mov rbp,r8
+ mul rbx
+ mov r10,rax
+ mov rax,QWORD[rcx]
+
+ imul rbp,r10
+ lea r14,[((64+8))+rsp]
+ mov r11,rdx
+
+ mul rbp
+ add r10,rax
+ mov rax,QWORD[8+r9*1+rsi]
+ adc rdx,0
+ mov rdi,rdx
+
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[8+rcx]
+ adc rdx,0
+ mov r10,rdx
+
+ mul rbp
+ add rdi,rax
+ mov rax,QWORD[16+r9*1+rsi]
+ adc rdx,0
+ add rdi,r11
+ lea r15,[32+r9]
+ lea rcx,[32+rcx]
+ adc rdx,0
+ mov QWORD[r14],rdi
+ mov r13,rdx
+ jmp NEAR $L$1st4x
+
+ALIGN 32
+$L$1st4x:
+ mul rbx
+ add r10,rax
+ mov rax,QWORD[((-16))+rcx]
+ lea r14,[32+r14]
+ adc rdx,0
+ mov r11,rdx
+
+ mul rbp
+ add r13,rax
+ mov rax,QWORD[((-8))+r15*1+rsi]
+ adc rdx,0
+ add r13,r10
+ adc rdx,0
+ mov QWORD[((-24))+r14],r13
+ mov rdi,rdx
+
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[((-8))+rcx]
+ adc rdx,0
+ mov r10,rdx
+
+ mul rbp
+ add rdi,rax
+ mov rax,QWORD[r15*1+rsi]
+ adc rdx,0
+ add rdi,r11
+ adc rdx,0
+ mov QWORD[((-16))+r14],rdi
+ mov r13,rdx
+
+ mul rbx
+ add r10,rax
+ mov rax,QWORD[rcx]
+ adc rdx,0
+ mov r11,rdx
+
+ mul rbp
+ add r13,rax
+ mov rax,QWORD[8+r15*1+rsi]
+ adc rdx,0
+ add r13,r10
+ adc rdx,0
+ mov QWORD[((-8))+r14],r13
+ mov rdi,rdx
+
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[8+rcx]
+ adc rdx,0
+ mov r10,rdx
+
+ mul rbp
+ add rdi,rax
+ mov rax,QWORD[16+r15*1+rsi]
+ adc rdx,0
+ add rdi,r11
+ lea rcx,[32+rcx]
+ adc rdx,0
+ mov QWORD[r14],rdi
+ mov r13,rdx
+
+ add r15,32
+ jnz NEAR $L$1st4x
+
+ mul rbx
+ add r10,rax
+ mov rax,QWORD[((-16))+rcx]
+ lea r14,[32+r14]
+ adc rdx,0
+ mov r11,rdx
+
+ mul rbp
+ add r13,rax
+ mov rax,QWORD[((-8))+rsi]
+ adc rdx,0
+ add r13,r10
+ adc rdx,0
+ mov QWORD[((-24))+r14],r13
+ mov rdi,rdx
+
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[((-8))+rcx]
+ adc rdx,0
+ mov r10,rdx
+
+ mul rbp
+ add rdi,rax
+ mov rax,QWORD[r9*1+rsi]
+ adc rdx,0
+ add rdi,r11
+ adc rdx,0
+ mov QWORD[((-16))+r14],rdi
+ mov r13,rdx
+
+ lea rcx,[r9*1+rcx]
+
+ xor rdi,rdi
+ add r13,r10
+ adc rdi,0
+ mov QWORD[((-8))+r14],r13
+
+ jmp NEAR $L$outer4x
+
+ALIGN 32
+$L$outer4x:
+ lea rdx,[((16+128))+r14]
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ movdqa xmm0,XMMWORD[((-128))+r12]
+ movdqa xmm1,XMMWORD[((-112))+r12]
+ movdqa xmm2,XMMWORD[((-96))+r12]
+ movdqa xmm3,XMMWORD[((-80))+r12]
+ pand xmm0,XMMWORD[((-128))+rdx]
+ pand xmm1,XMMWORD[((-112))+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD[((-96))+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD[((-80))+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD[((-64))+r12]
+ movdqa xmm1,XMMWORD[((-48))+r12]
+ movdqa xmm2,XMMWORD[((-32))+r12]
+ movdqa xmm3,XMMWORD[((-16))+r12]
+ pand xmm0,XMMWORD[((-64))+rdx]
+ pand xmm1,XMMWORD[((-48))+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD[((-32))+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD[((-16))+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD[r12]
+ movdqa xmm1,XMMWORD[16+r12]
+ movdqa xmm2,XMMWORD[32+r12]
+ movdqa xmm3,XMMWORD[48+r12]
+ pand xmm0,XMMWORD[rdx]
+ pand xmm1,XMMWORD[16+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD[32+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD[48+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD[64+r12]
+ movdqa xmm1,XMMWORD[80+r12]
+ movdqa xmm2,XMMWORD[96+r12]
+ movdqa xmm3,XMMWORD[112+r12]
+ pand xmm0,XMMWORD[64+rdx]
+ pand xmm1,XMMWORD[80+rdx]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD[96+rdx]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD[112+rdx]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ por xmm4,xmm5
+
+ pshufd xmm0,xmm4,0x4e
+ por xmm0,xmm4
+ lea r12,[256+r12]
+DB 102,72,15,126,195
+
+ mov r10,QWORD[r9*1+r14]
+ mov rbp,r8
+ mul rbx
+ add r10,rax
+ mov rax,QWORD[rcx]
+ adc rdx,0
+
+ imul rbp,r10
+ mov r11,rdx
+ mov QWORD[r14],rdi
+
+ lea r14,[r9*1+r14]
+
+ mul rbp
+ add r10,rax
+ mov rax,QWORD[8+r9*1+rsi]
+ adc rdx,0
+ mov rdi,rdx
+
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[8+rcx]
+ adc rdx,0
+ add r11,QWORD[8+r14]
+ adc rdx,0
+ mov r10,rdx
+
+ mul rbp
+ add rdi,rax
+ mov rax,QWORD[16+r9*1+rsi]
+ adc rdx,0
+ add rdi,r11
+ lea r15,[32+r9]
+ lea rcx,[32+rcx]
+ adc rdx,0
+ mov r13,rdx
+ jmp NEAR $L$inner4x
+
+ALIGN 32
+$L$inner4x:
+ mul rbx
+ add r10,rax
+ mov rax,QWORD[((-16))+rcx]
+ adc rdx,0
+ add r10,QWORD[16+r14]
+ lea r14,[32+r14]
+ adc rdx,0
+ mov r11,rdx
+
+ mul rbp
+ add r13,rax
+ mov rax,QWORD[((-8))+r15*1+rsi]
+ adc rdx,0
+ add r13,r10
+ adc rdx,0
+ mov QWORD[((-32))+r14],rdi
+ mov rdi,rdx
+
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[((-8))+rcx]
+ adc rdx,0
+ add r11,QWORD[((-8))+r14]
+ adc rdx,0
+ mov r10,rdx
+
+ mul rbp
+ add rdi,rax
+ mov rax,QWORD[r15*1+rsi]
+ adc rdx,0
+ add rdi,r11
+ adc rdx,0
+ mov QWORD[((-24))+r14],r13
+ mov r13,rdx
+
+ mul rbx
+ add r10,rax
+ mov rax,QWORD[rcx]
+ adc rdx,0
+ add r10,QWORD[r14]
+ adc rdx,0
+ mov r11,rdx
+
+ mul rbp
+ add r13,rax
+ mov rax,QWORD[8+r15*1+rsi]
+ adc rdx,0
+ add r13,r10
+ adc rdx,0
+ mov QWORD[((-16))+r14],rdi
+ mov rdi,rdx
+
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[8+rcx]
+ adc rdx,0
+ add r11,QWORD[8+r14]
+ adc rdx,0
+ mov r10,rdx
+
+ mul rbp
+ add rdi,rax
+ mov rax,QWORD[16+r15*1+rsi]
+ adc rdx,0
+ add rdi,r11
+ lea rcx,[32+rcx]
+ adc rdx,0
+ mov QWORD[((-8))+r14],r13
+ mov r13,rdx
+
+ add r15,32
+ jnz NEAR $L$inner4x
+
+ mul rbx
+ add r10,rax
+ mov rax,QWORD[((-16))+rcx]
+ adc rdx,0
+ add r10,QWORD[16+r14]
+ lea r14,[32+r14]
+ adc rdx,0
+ mov r11,rdx
+
+ mul rbp
+ add r13,rax
+ mov rax,QWORD[((-8))+rsi]
+ adc rdx,0
+ add r13,r10
+ adc rdx,0
+ mov QWORD[((-32))+r14],rdi
+ mov rdi,rdx
+
+ mul rbx
+ add r11,rax
+ mov rax,rbp
+ mov rbp,QWORD[((-8))+rcx]
+ adc rdx,0
+ add r11,QWORD[((-8))+r14]
+ adc rdx,0
+ mov r10,rdx
+
+ mul rbp
+ add rdi,rax
+ mov rax,QWORD[r9*1+rsi]
+ adc rdx,0
+ add rdi,r11
+ adc rdx,0
+ mov QWORD[((-24))+r14],r13
+ mov r13,rdx
+
+ mov QWORD[((-16))+r14],rdi
+ lea rcx,[r9*1+rcx]
+
+ xor rdi,rdi
+ add r13,r10
+ adc rdi,0
+ add r13,QWORD[r14]
+ adc rdi,0
+ mov QWORD[((-8))+r14],r13
+
+ cmp r12,QWORD[((16+8))+rsp]
+ jb NEAR $L$outer4x
+ xor rax,rax
+ sub rbp,r13
+ adc r15,r15
+ or rdi,r15
+ sub rax,rdi
+ lea rbx,[r9*1+r14]
+ mov r12,QWORD[rcx]
+ lea rbp,[rcx]
+ mov rcx,r9
+ sar rcx,3+2
+ mov rdi,QWORD[((56+8))+rsp]
+ dec r12
+ xor r10,r10
+ mov r13,QWORD[8+rbp]
+ mov r14,QWORD[16+rbp]
+ mov r15,QWORD[24+rbp]
+ jmp NEAR $L$sqr4x_sub_entry
+
+
+global bn_power5
+
+ALIGN 32
+bn_power5:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_bn_power5:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+ mov r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+ mov rax,rsp
+
+ lea r11,[OPENSSL_ia32cap_P]
+ mov r11d,DWORD[8+r11]
+ and r11d,0x80108
+ cmp r11d,0x80108
+ je NEAR $L$powerx5_enter
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+$L$power5_prologue:
+
+ shl r9d,3
+ lea r10d,[r9*2+r9]
+ neg r9
+ mov r8,QWORD[r8]
+
+
+
+
+
+
+
+
+ lea r11,[((-320))+r9*2+rsp]
+ mov rbp,rsp
+ sub r11,rdi
+ and r11,4095
+ cmp r10,r11
+ jb NEAR $L$pwr_sp_alt
+ sub rbp,r11
+ lea rbp,[((-320))+r9*2+rbp]
+ jmp NEAR $L$pwr_sp_done
+
+ALIGN 32
+$L$pwr_sp_alt:
+ lea r10,[((4096-320))+r9*2]
+ lea rbp,[((-320))+r9*2+rbp]
+ sub r11,r10
+ mov r10,0
+ cmovc r11,r10
+ sub rbp,r11
+$L$pwr_sp_done:
+ and rbp,-64
+ mov r11,rsp
+ sub r11,rbp
+ and r11,-4096
+ lea rsp,[rbp*1+r11]
+ mov r10,QWORD[rsp]
+ cmp rsp,rbp
+ ja NEAR $L$pwr_page_walk
+ jmp NEAR $L$pwr_page_walk_done
+
+$L$pwr_page_walk:
+ lea rsp,[((-4096))+rsp]
+ mov r10,QWORD[rsp]
+ cmp rsp,rbp
+ ja NEAR $L$pwr_page_walk
+$L$pwr_page_walk_done:
+
+ mov r10,r9
+ neg r9
+
+
+
+
+
+
+
+
+
+
+ mov QWORD[32+rsp],r8
+ mov QWORD[40+rsp],rax
+
+$L$power5_body:
+DB 102,72,15,110,207
+DB 102,72,15,110,209
+DB 102,73,15,110,218
+DB 102,72,15,110,226
+
+ call __bn_sqr8x_internal
+ call __bn_post4x_internal
+ call __bn_sqr8x_internal
+ call __bn_post4x_internal
+ call __bn_sqr8x_internal
+ call __bn_post4x_internal
+ call __bn_sqr8x_internal
+ call __bn_post4x_internal
+ call __bn_sqr8x_internal
+ call __bn_post4x_internal
+
+DB 102,72,15,126,209
+DB 102,72,15,126,226
+ mov rdi,rsi
+ mov rax,QWORD[40+rsp]
+ lea r8,[32+rsp]
+
+ call mul4x_internal
+
+ mov rsi,QWORD[40+rsp]
+
+ mov rax,1
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$power5_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_bn_power5:
+
+global bn_sqr8x_internal
+
+
+ALIGN 32
+bn_sqr8x_internal:
+__bn_sqr8x_internal:
+
+_CET_ENDBR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ lea rbp,[32+r10]
+ lea rsi,[r9*1+rsi]
+
+ mov rcx,r9
+
+
+ mov r14,QWORD[((-32))+rbp*1+rsi]
+ lea rdi,[((48+8))+r9*2+rsp]
+ mov rax,QWORD[((-24))+rbp*1+rsi]
+ lea rdi,[((-32))+rbp*1+rdi]
+ mov rbx,QWORD[((-16))+rbp*1+rsi]
+ mov r15,rax
+
+ mul r14
+ mov r10,rax
+ mov rax,rbx
+ mov r11,rdx
+ mov QWORD[((-24))+rbp*1+rdi],r10
+
+ mul r14
+ add r11,rax
+ mov rax,rbx
+ adc rdx,0
+ mov QWORD[((-16))+rbp*1+rdi],r11
+ mov r10,rdx
+
+
+ mov rbx,QWORD[((-8))+rbp*1+rsi]
+ mul r15
+ mov r12,rax
+ mov rax,rbx
+ mov r13,rdx
+
+ lea rcx,[rbp]
+ mul r14
+ add r10,rax
+ mov rax,rbx
+ mov r11,rdx
+ adc r11,0
+ add r10,r12
+ adc r11,0
+ mov QWORD[((-8))+rcx*1+rdi],r10
+ jmp NEAR $L$sqr4x_1st
+
+ALIGN 32
+$L$sqr4x_1st:
+ mov rbx,QWORD[rcx*1+rsi]
+ mul r15
+ add r13,rax
+ mov rax,rbx
+ mov r12,rdx
+ adc r12,0
+
+ mul r14
+ add r11,rax
+ mov rax,rbx
+ mov rbx,QWORD[8+rcx*1+rsi]
+ mov r10,rdx
+ adc r10,0
+ add r11,r13
+ adc r10,0
+
+
+ mul r15
+ add r12,rax
+ mov rax,rbx
+ mov QWORD[rcx*1+rdi],r11
+ mov r13,rdx
+ adc r13,0
+
+ mul r14
+ add r10,rax
+ mov rax,rbx
+ mov rbx,QWORD[16+rcx*1+rsi]
+ mov r11,rdx
+ adc r11,0
+ add r10,r12
+ adc r11,0
+
+ mul r15
+ add r13,rax
+ mov rax,rbx
+ mov QWORD[8+rcx*1+rdi],r10
+ mov r12,rdx
+ adc r12,0
+
+ mul r14
+ add r11,rax
+ mov rax,rbx
+ mov rbx,QWORD[24+rcx*1+rsi]
+ mov r10,rdx
+ adc r10,0
+ add r11,r13
+ adc r10,0
+
+
+ mul r15
+ add r12,rax
+ mov rax,rbx
+ mov QWORD[16+rcx*1+rdi],r11
+ mov r13,rdx
+ adc r13,0
+ lea rcx,[32+rcx]
+
+ mul r14
+ add r10,rax
+ mov rax,rbx
+ mov r11,rdx
+ adc r11,0
+ add r10,r12
+ adc r11,0
+ mov QWORD[((-8))+rcx*1+rdi],r10
+
+ cmp rcx,0
+ jne NEAR $L$sqr4x_1st
+
+ mul r15
+ add r13,rax
+ lea rbp,[16+rbp]
+ adc rdx,0
+ add r13,r11
+ adc rdx,0
+
+ mov QWORD[rdi],r13
+ mov r12,rdx
+ mov QWORD[8+rdi],rdx
+ jmp NEAR $L$sqr4x_outer
+
+ALIGN 32
+$L$sqr4x_outer:
+ mov r14,QWORD[((-32))+rbp*1+rsi]
+ lea rdi,[((48+8))+r9*2+rsp]
+ mov rax,QWORD[((-24))+rbp*1+rsi]
+ lea rdi,[((-32))+rbp*1+rdi]
+ mov rbx,QWORD[((-16))+rbp*1+rsi]
+ mov r15,rax
+
+ mul r14
+ mov r10,QWORD[((-24))+rbp*1+rdi]
+ add r10,rax
+ mov rax,rbx
+ adc rdx,0
+ mov QWORD[((-24))+rbp*1+rdi],r10
+ mov r11,rdx
+
+ mul r14
+ add r11,rax
+ mov rax,rbx
+ adc rdx,0
+ add r11,QWORD[((-16))+rbp*1+rdi]
+ mov r10,rdx
+ adc r10,0
+ mov QWORD[((-16))+rbp*1+rdi],r11
+
+ xor r12,r12
+
+ mov rbx,QWORD[((-8))+rbp*1+rsi]
+ mul r15
+ add r12,rax
+ mov rax,rbx
+ adc rdx,0
+ add r12,QWORD[((-8))+rbp*1+rdi]
+ mov r13,rdx
+ adc r13,0
+
+ mul r14
+ add r10,rax
+ mov rax,rbx
+ adc rdx,0
+ add r10,r12
+ mov r11,rdx
+ adc r11,0
+ mov QWORD[((-8))+rbp*1+rdi],r10
+
+ lea rcx,[rbp]
+ jmp NEAR $L$sqr4x_inner
+
+ALIGN 32
+$L$sqr4x_inner:
+ mov rbx,QWORD[rcx*1+rsi]
+ mul r15
+ add r13,rax
+ mov rax,rbx
+ mov r12,rdx
+ adc r12,0
+ add r13,QWORD[rcx*1+rdi]
+ adc r12,0
+
+ DB 0x67
+ mul r14
+ add r11,rax
+ mov rax,rbx
+ mov rbx,QWORD[8+rcx*1+rsi]
+ mov r10,rdx
+ adc r10,0
+ add r11,r13
+ adc r10,0
+
+ mul r15
+ add r12,rax
+ mov QWORD[rcx*1+rdi],r11
+ mov rax,rbx
+ mov r13,rdx
+ adc r13,0
+ add r12,QWORD[8+rcx*1+rdi]
+ lea rcx,[16+rcx]
+ adc r13,0
+
+ mul r14
+ add r10,rax
+ mov rax,rbx
+ adc rdx,0
+ add r10,r12
+ mov r11,rdx
+ adc r11,0
+ mov QWORD[((-8))+rcx*1+rdi],r10
+
+ cmp rcx,0
+ jne NEAR $L$sqr4x_inner
+
+ DB 0x67
+ mul r15
+ add r13,rax
+ adc rdx,0
+ add r13,r11
+ adc rdx,0
+
+ mov QWORD[rdi],r13
+ mov r12,rdx
+ mov QWORD[8+rdi],rdx
+
+ add rbp,16
+ jnz NEAR $L$sqr4x_outer
+
+
+ mov r14,QWORD[((-32))+rsi]
+ lea rdi,[((48+8))+r9*2+rsp]
+ mov rax,QWORD[((-24))+rsi]
+ lea rdi,[((-32))+rbp*1+rdi]
+ mov rbx,QWORD[((-16))+rsi]
+ mov r15,rax
+
+ mul r14
+ add r10,rax
+ mov rax,rbx
+ mov r11,rdx
+ adc r11,0
+
+ mul r14
+ add r11,rax
+ mov rax,rbx
+ mov QWORD[((-24))+rdi],r10
+ mov r10,rdx
+ adc r10,0
+ add r11,r13
+ mov rbx,QWORD[((-8))+rsi]
+ adc r10,0
+
+ mul r15
+ add r12,rax
+ mov rax,rbx
+ mov QWORD[((-16))+rdi],r11
+ mov r13,rdx
+ adc r13,0
+
+ mul r14
+ add r10,rax
+ mov rax,rbx
+ mov r11,rdx
+ adc r11,0
+ add r10,r12
+ adc r11,0
+ mov QWORD[((-8))+rdi],r10
+
+ mul r15
+ add r13,rax
+ mov rax,QWORD[((-16))+rsi]
+ adc rdx,0
+ add r13,r11
+ adc rdx,0
+
+ mov QWORD[rdi],r13
+ mov r12,rdx
+ mov QWORD[8+rdi],rdx
+
+ mul rbx
+ add rbp,16
+ xor r14,r14
+ sub rbp,r9
+ xor r15,r15
+
+ add rax,r12
+ adc rdx,0
+ mov QWORD[8+rdi],rax
+ mov QWORD[16+rdi],rdx
+ mov QWORD[24+rdi],r15
+
+ mov rax,QWORD[((-16))+rbp*1+rsi]
+ lea rdi,[((48+8))+rsp]
+ xor r10,r10
+ mov r11,QWORD[8+rdi]
+
+ lea r12,[r10*2+r14]
+ shr r10,63
+ lea r13,[r11*2+rcx]
+ shr r11,63
+ or r13,r10
+ mov r10,QWORD[16+rdi]
+ mov r14,r11
+ mul rax
+ neg r15
+ mov r11,QWORD[24+rdi]
+ adc r12,rax
+ mov rax,QWORD[((-8))+rbp*1+rsi]
+ mov QWORD[rdi],r12
+ adc r13,rdx
+
+ lea rbx,[r10*2+r14]
+ mov QWORD[8+rdi],r13
+ sbb r15,r15
+ shr r10,63
+ lea r8,[r11*2+rcx]
+ shr r11,63
+ or r8,r10
+ mov r10,QWORD[32+rdi]
+ mov r14,r11
+ mul rax
+ neg r15
+ mov r11,QWORD[40+rdi]
+ adc rbx,rax
+ mov rax,QWORD[rbp*1+rsi]
+ mov QWORD[16+rdi],rbx
+ adc r8,rdx
+ lea rbp,[16+rbp]
+ mov QWORD[24+rdi],r8
+ sbb r15,r15
+ lea rdi,[64+rdi]
+ jmp NEAR $L$sqr4x_shift_n_add
+
+ALIGN 32
+$L$sqr4x_shift_n_add:
+ lea r12,[r10*2+r14]
+ shr r10,63
+ lea r13,[r11*2+rcx]
+ shr r11,63
+ or r13,r10
+ mov r10,QWORD[((-16))+rdi]
+ mov r14,r11
+ mul rax
+ neg r15
+ mov r11,QWORD[((-8))+rdi]
+ adc r12,rax
+ mov rax,QWORD[((-8))+rbp*1+rsi]
+ mov QWORD[((-32))+rdi],r12
+ adc r13,rdx
+
+ lea rbx,[r10*2+r14]
+ mov QWORD[((-24))+rdi],r13
+ sbb r15,r15
+ shr r10,63
+ lea r8,[r11*2+rcx]
+ shr r11,63
+ or r8,r10
+ mov r10,QWORD[rdi]
+ mov r14,r11
+ mul rax
+ neg r15
+ mov r11,QWORD[8+rdi]
+ adc rbx,rax
+ mov rax,QWORD[rbp*1+rsi]
+ mov QWORD[((-16))+rdi],rbx
+ adc r8,rdx
+
+ lea r12,[r10*2+r14]
+ mov QWORD[((-8))+rdi],r8
+ sbb r15,r15
+ shr r10,63
+ lea r13,[r11*2+rcx]
+ shr r11,63
+ or r13,r10
+ mov r10,QWORD[16+rdi]
+ mov r14,r11
+ mul rax
+ neg r15
+ mov r11,QWORD[24+rdi]
+ adc r12,rax
+ mov rax,QWORD[8+rbp*1+rsi]
+ mov QWORD[rdi],r12
+ adc r13,rdx
+
+ lea rbx,[r10*2+r14]
+ mov QWORD[8+rdi],r13
+ sbb r15,r15
+ shr r10,63
+ lea r8,[r11*2+rcx]
+ shr r11,63
+ or r8,r10
+ mov r10,QWORD[32+rdi]
+ mov r14,r11
+ mul rax
+ neg r15
+ mov r11,QWORD[40+rdi]
+ adc rbx,rax
+ mov rax,QWORD[16+rbp*1+rsi]
+ mov QWORD[16+rdi],rbx
+ adc r8,rdx
+ mov QWORD[24+rdi],r8
+ sbb r15,r15
+ lea rdi,[64+rdi]
+ add rbp,32
+ jnz NEAR $L$sqr4x_shift_n_add
+
+ lea r12,[r10*2+r14]
+ DB 0x67
+ shr r10,63
+ lea r13,[r11*2+rcx]
+ shr r11,63
+ or r13,r10
+ mov r10,QWORD[((-16))+rdi]
+ mov r14,r11
+ mul rax
+ neg r15
+ mov r11,QWORD[((-8))+rdi]
+ adc r12,rax
+ mov rax,QWORD[((-8))+rsi]
+ mov QWORD[((-32))+rdi],r12
+ adc r13,rdx
+
+ lea rbx,[r10*2+r14]
+ mov QWORD[((-24))+rdi],r13
+ sbb r15,r15
+ shr r10,63
+ lea r8,[r11*2+rcx]
+ shr r11,63
+ or r8,r10
+ mul rax
+ neg r15
+ adc rbx,rax
+ adc r8,rdx
+ mov QWORD[((-16))+rdi],rbx
+ mov QWORD[((-8))+rdi],r8
+DB 102,72,15,126,213
+__bn_sqr8x_reduction:
+ xor rax,rax
+ lea rcx,[rbp*1+r9]
+ lea rdx,[((48+8))+r9*2+rsp]
+ mov QWORD[((0+8))+rsp],rcx
+ lea rdi,[((48+8))+r9*1+rsp]
+ mov QWORD[((8+8))+rsp],rdx
+ neg r9
+ jmp NEAR $L$8x_reduction_loop
+
+ALIGN 32
+$L$8x_reduction_loop:
+ lea rdi,[r9*1+rdi]
+ DB 0x66
+ mov rbx,QWORD[rdi]
+ mov r9,QWORD[8+rdi]
+ mov r10,QWORD[16+rdi]
+ mov r11,QWORD[24+rdi]
+ mov r12,QWORD[32+rdi]
+ mov r13,QWORD[40+rdi]
+ mov r14,QWORD[48+rdi]
+ mov r15,QWORD[56+rdi]
+ mov QWORD[rdx],rax
+ lea rdi,[64+rdi]
+
+ DB 0x67
+ mov r8,rbx
+ imul rbx,QWORD[((32+8))+rsp]
+ mov rax,QWORD[rbp]
+ mov ecx,8
+ jmp NEAR $L$8x_reduce
+
+ALIGN 32
+$L$8x_reduce:
+ mul rbx
+ mov rax,QWORD[8+rbp]
+ neg r8
+ mov r8,rdx
+ adc r8,0
+
+ mul rbx
+ add r9,rax
+ mov rax,QWORD[16+rbp]
+ adc rdx,0
+ add r8,r9
+ mov QWORD[((48-8+8))+rcx*8+rsp],rbx
+ mov r9,rdx
+ adc r9,0
+
+ mul rbx
+ add r10,rax
+ mov rax,QWORD[24+rbp]
+ adc rdx,0
+ add r9,r10
+ mov rsi,QWORD[((32+8))+rsp]
+ mov r10,rdx
+ adc r10,0
+
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[32+rbp]
+ adc rdx,0
+ imul rsi,r8
+ add r10,r11
+ mov r11,rdx
+ adc r11,0
+
+ mul rbx
+ add r12,rax
+ mov rax,QWORD[40+rbp]
+ adc rdx,0
+ add r11,r12
+ mov r12,rdx
+ adc r12,0
+
+ mul rbx
+ add r13,rax
+ mov rax,QWORD[48+rbp]
+ adc rdx,0
+ add r12,r13
+ mov r13,rdx
+ adc r13,0
+
+ mul rbx
+ add r14,rax
+ mov rax,QWORD[56+rbp]
+ adc rdx,0
+ add r13,r14
+ mov r14,rdx
+ adc r14,0
+
+ mul rbx
+ mov rbx,rsi
+ add r15,rax
+ mov rax,QWORD[rbp]
+ adc rdx,0
+ add r14,r15
+ mov r15,rdx
+ adc r15,0
+
+ dec ecx
+ jnz NEAR $L$8x_reduce
+
+ lea rbp,[64+rbp]
+ xor rax,rax
+ mov rdx,QWORD[((8+8))+rsp]
+ cmp rbp,QWORD[((0+8))+rsp]
+ jae NEAR $L$8x_no_tail
+
+ DB 0x66
+ add r8,QWORD[rdi]
+ adc r9,QWORD[8+rdi]
+ adc r10,QWORD[16+rdi]
+ adc r11,QWORD[24+rdi]
+ adc r12,QWORD[32+rdi]
+ adc r13,QWORD[40+rdi]
+ adc r14,QWORD[48+rdi]
+ adc r15,QWORD[56+rdi]
+ sbb rsi,rsi
+
+ mov rbx,QWORD[((48+56+8))+rsp]
+ mov ecx,8
+ mov rax,QWORD[rbp]
+ jmp NEAR $L$8x_tail
+
+ALIGN 32
+$L$8x_tail:
+ mul rbx
+ add r8,rax
+ mov rax,QWORD[8+rbp]
+ mov QWORD[rdi],r8
+ mov r8,rdx
+ adc r8,0
+
+ mul rbx
+ add r9,rax
+ mov rax,QWORD[16+rbp]
+ adc rdx,0
+ add r8,r9
+ lea rdi,[8+rdi]
+ mov r9,rdx
+ adc r9,0
+
+ mul rbx
+ add r10,rax
+ mov rax,QWORD[24+rbp]
+ adc rdx,0
+ add r9,r10
+ mov r10,rdx
+ adc r10,0
+
+ mul rbx
+ add r11,rax
+ mov rax,QWORD[32+rbp]
+ adc rdx,0
+ add r10,r11
+ mov r11,rdx
+ adc r11,0
+
+ mul rbx
+ add r12,rax
+ mov rax,QWORD[40+rbp]
+ adc rdx,0
+ add r11,r12
+ mov r12,rdx
+ adc r12,0
+
+ mul rbx
+ add r13,rax
+ mov rax,QWORD[48+rbp]
+ adc rdx,0
+ add r12,r13
+ mov r13,rdx
+ adc r13,0
+
+ mul rbx
+ add r14,rax
+ mov rax,QWORD[56+rbp]
+ adc rdx,0
+ add r13,r14
+ mov r14,rdx
+ adc r14,0
+
+ mul rbx
+ mov rbx,QWORD[((48-16+8))+rcx*8+rsp]
+ add r15,rax
+ adc rdx,0
+ add r14,r15
+ mov rax,QWORD[rbp]
+ mov r15,rdx
+ adc r15,0
+
+ dec ecx
+ jnz NEAR $L$8x_tail
+
+ lea rbp,[64+rbp]
+ mov rdx,QWORD[((8+8))+rsp]
+ cmp rbp,QWORD[((0+8))+rsp]
+ jae NEAR $L$8x_tail_done
+
+ mov rbx,QWORD[((48+56+8))+rsp]
+ neg rsi
+ mov rax,QWORD[rbp]
+ adc r8,QWORD[rdi]
+ adc r9,QWORD[8+rdi]
+ adc r10,QWORD[16+rdi]
+ adc r11,QWORD[24+rdi]
+ adc r12,QWORD[32+rdi]
+ adc r13,QWORD[40+rdi]
+ adc r14,QWORD[48+rdi]
+ adc r15,QWORD[56+rdi]
+ sbb rsi,rsi
+
+ mov ecx,8
+ jmp NEAR $L$8x_tail
+
+ALIGN 32
+$L$8x_tail_done:
+ xor rax,rax
+ add r8,QWORD[rdx]
+ adc r9,0
+ adc r10,0
+ adc r11,0
+ adc r12,0
+ adc r13,0
+ adc r14,0
+ adc r15,0
+ adc rax,0
+
+ neg rsi
+$L$8x_no_tail:
+ adc r8,QWORD[rdi]
+ adc r9,QWORD[8+rdi]
+ adc r10,QWORD[16+rdi]
+ adc r11,QWORD[24+rdi]
+ adc r12,QWORD[32+rdi]
+ adc r13,QWORD[40+rdi]
+ adc r14,QWORD[48+rdi]
+ adc r15,QWORD[56+rdi]
+ adc rax,0
+ mov rcx,QWORD[((-8))+rbp]
+ xor rsi,rsi
+
+DB 102,72,15,126,213
+
+ mov QWORD[rdi],r8
+ mov QWORD[8+rdi],r9
+DB 102,73,15,126,217
+ mov QWORD[16+rdi],r10
+ mov QWORD[24+rdi],r11
+ mov QWORD[32+rdi],r12
+ mov QWORD[40+rdi],r13
+ mov QWORD[48+rdi],r14
+ mov QWORD[56+rdi],r15
+ lea rdi,[64+rdi]
+
+ cmp rdi,rdx
+ jb NEAR $L$8x_reduction_loop
+ ret
+
+
+
+ALIGN 32
+__bn_post4x_internal:
+
+ mov r12,QWORD[rbp]
+ lea rbx,[r9*1+rdi]
+ mov rcx,r9
+DB 102,72,15,126,207
+ neg rax
+DB 102,72,15,126,206
+ sar rcx,3+2
+ dec r12
+ xor r10,r10
+ mov r13,QWORD[8+rbp]
+ mov r14,QWORD[16+rbp]
+ mov r15,QWORD[24+rbp]
+ jmp NEAR $L$sqr4x_sub_entry
+
+ALIGN 16
+$L$sqr4x_sub:
+ mov r12,QWORD[rbp]
+ mov r13,QWORD[8+rbp]
+ mov r14,QWORD[16+rbp]
+ mov r15,QWORD[24+rbp]
+$L$sqr4x_sub_entry:
+ lea rbp,[32+rbp]
+ not r12
+ not r13
+ not r14
+ not r15
+ and r12,rax
+ and r13,rax
+ and r14,rax
+ and r15,rax
+
+ neg r10
+ adc r12,QWORD[rbx]
+ adc r13,QWORD[8+rbx]
+ adc r14,QWORD[16+rbx]
+ adc r15,QWORD[24+rbx]
+ mov QWORD[rdi],r12
+ lea rbx,[32+rbx]
+ mov QWORD[8+rdi],r13
+ sbb r10,r10
+ mov QWORD[16+rdi],r14
+ mov QWORD[24+rdi],r15
+ lea rdi,[32+rdi]
+
+ inc rcx
+ jnz NEAR $L$sqr4x_sub
+
+ mov r10,r9
+ neg r9
+ ret
+
+
+
+ALIGN 32
+bn_mulx4x_mont_gather5:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_bn_mulx4x_mont_gather5:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+ mov r9,QWORD[48+rsp]
+
+
+
+ mov rax,rsp
+
+$L$mulx4x_enter:
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+$L$mulx4x_prologue:
+
+ shl r9d,3
+ lea r10,[r9*2+r9]
+ neg r9
+ mov r8,QWORD[r8]
+
+
+
+
+
+
+
+
+
+
+ lea r11,[((-320))+r9*2+rsp]
+ mov rbp,rsp
+ sub r11,rdi
+ and r11,4095
+ cmp r10,r11
+ jb NEAR $L$mulx4xsp_alt
+ sub rbp,r11
+ lea rbp,[((-320))+r9*2+rbp]
+ jmp NEAR $L$mulx4xsp_done
+
+$L$mulx4xsp_alt:
+ lea r10,[((4096-320))+r9*2]
+ lea rbp,[((-320))+r9*2+rbp]
+ sub r11,r10
+ mov r10,0
+ cmovc r11,r10
+ sub rbp,r11
+$L$mulx4xsp_done:
+ and rbp,-64
+ mov r11,rsp
+ sub r11,rbp
+ and r11,-4096
+ lea rsp,[rbp*1+r11]
+ mov r10,QWORD[rsp]
+ cmp rsp,rbp
+ ja NEAR $L$mulx4x_page_walk
+ jmp NEAR $L$mulx4x_page_walk_done
+
+$L$mulx4x_page_walk:
+ lea rsp,[((-4096))+rsp]
+ mov r10,QWORD[rsp]
+ cmp rsp,rbp
+ ja NEAR $L$mulx4x_page_walk
+$L$mulx4x_page_walk_done:
+
+
+
+
+
+
+
+
+
+
+
+
+
+ mov QWORD[32+rsp],r8
+ mov QWORD[40+rsp],rax
+
+$L$mulx4x_body:
+ call mulx4x_internal
+
+ mov rsi,QWORD[40+rsp]
+
+ mov rax,1
+
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$mulx4x_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_bn_mulx4x_mont_gather5:
+
+
+ALIGN 32
+mulx4x_internal:
+
+ mov QWORD[8+rsp],r9
+ mov r10,r9
+ neg r9
+ shl r9,5
+ neg r10
+ lea r13,[128+r9*1+rdx]
+ shr r9,5+5
+ movd xmm5,DWORD[56+rax]
+ sub r9,1
+ lea rax,[$L$inc]
+ mov QWORD[((16+8))+rsp],r13
+ mov QWORD[((24+8))+rsp],r9
+ mov QWORD[((56+8))+rsp],rdi
+ movdqa xmm0,XMMWORD[rax]
+ movdqa xmm1,XMMWORD[16+rax]
+ lea r10,[((88-112))+r10*1+rsp]
+ lea rdi,[128+rdx]
+
+ pshufd xmm5,xmm5,0
+ movdqa xmm4,xmm1
+ DB 0x67
+ movdqa xmm2,xmm1
+ DB 0x67
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD[112+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD[128+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD[144+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD[160+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD[176+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD[192+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD[208+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD[224+r10],xmm3
+ movdqa xmm3,xmm4
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD[240+r10],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD[256+r10],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD[272+r10],xmm2
+ movdqa xmm2,xmm4
+
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD[288+r10],xmm3
+ movdqa xmm3,xmm4
+ DB 0x67
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD[304+r10],xmm0
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD[320+r10],xmm1
+
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD[336+r10],xmm2
+
+ pand xmm0,XMMWORD[64+rdi]
+ pand xmm1,XMMWORD[80+rdi]
+ pand xmm2,XMMWORD[96+rdi]
+ movdqa XMMWORD[352+r10],xmm3
+ pand xmm3,XMMWORD[112+rdi]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD[((-128))+rdi]
+ movdqa xmm5,XMMWORD[((-112))+rdi]
+ movdqa xmm2,XMMWORD[((-96))+rdi]
+ pand xmm4,XMMWORD[112+r10]
+ movdqa xmm3,XMMWORD[((-80))+rdi]
+ pand xmm5,XMMWORD[128+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD[144+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD[160+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD[((-64))+rdi]
+ movdqa xmm5,XMMWORD[((-48))+rdi]
+ movdqa xmm2,XMMWORD[((-32))+rdi]
+ pand xmm4,XMMWORD[176+r10]
+ movdqa xmm3,XMMWORD[((-16))+rdi]
+ pand xmm5,XMMWORD[192+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD[208+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD[224+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ movdqa xmm4,XMMWORD[rdi]
+ movdqa xmm5,XMMWORD[16+rdi]
+ movdqa xmm2,XMMWORD[32+rdi]
+ pand xmm4,XMMWORD[240+r10]
+ movdqa xmm3,XMMWORD[48+rdi]
+ pand xmm5,XMMWORD[256+r10]
+ por xmm0,xmm4
+ pand xmm2,XMMWORD[272+r10]
+ por xmm1,xmm5
+ pand xmm3,XMMWORD[288+r10]
+ por xmm0,xmm2
+ por xmm1,xmm3
+ pxor xmm0,xmm1
+
+ pshufd xmm1,xmm0,0x4e
+ por xmm0,xmm1
+ lea rdi,[256+rdi]
+DB 102,72,15,126,194
+ lea rbx,[((64+32+8))+rsp]
+
+ mov r9,rdx
+ mulx rax,r8,QWORD[rsi]
+ mulx r12,r11,QWORD[8+rsi]
+ add r11,rax
+ mulx r13,rax,QWORD[16+rsi]
+ adc r12,rax
+ adc r13,0
+ mulx r14,rax,QWORD[24+rsi]
+
+ mov r15,r8
+ imul r8,QWORD[((32+8))+rsp]
+ xor rbp,rbp
+ mov rdx,r8
+
+ mov QWORD[((8+8))+rsp],rdi
+
+ lea rsi,[32+rsi]
+ adcx r13,rax
+ adcx r14,rbp
+
+ mulx r10,rax,QWORD[rcx]
+ adcx r15,rax
+ adox r10,r11
+ mulx r11,rax,QWORD[8+rcx]
+ adcx r10,rax
+ adox r11,r12
+ mulx r12,rax,QWORD[16+rcx]
+ mov rdi,QWORD[((24+8))+rsp]
+ mov QWORD[((-32))+rbx],r10
+ adcx r11,rax
+ adox r12,r13
+ mulx r15,rax,QWORD[24+rcx]
+ mov rdx,r9
+ mov QWORD[((-24))+rbx],r11
+ adcx r12,rax
+ adox r15,rbp
+ lea rcx,[32+rcx]
+ mov QWORD[((-16))+rbx],r12
+ jmp NEAR $L$mulx4x_1st
+
+ALIGN 32
+$L$mulx4x_1st:
+ adcx r15,rbp
+ mulx rax,r10,QWORD[rsi]
+ adcx r10,r14
+ mulx r14,r11,QWORD[8+rsi]
+ adcx r11,rax
+ mulx rax,r12,QWORD[16+rsi]
+ adcx r12,r14
+ mulx r14,r13,QWORD[24+rsi]
+ DB 0x67,0x67
+ mov rdx,r8
+ adcx r13,rax
+ adcx r14,rbp
+ lea rsi,[32+rsi]
+ lea rbx,[32+rbx]
+
+ adox r10,r15
+ mulx r15,rax,QWORD[rcx]
+ adcx r10,rax
+ adox r11,r15
+ mulx r15,rax,QWORD[8+rcx]
+ adcx r11,rax
+ adox r12,r15
+ mulx r15,rax,QWORD[16+rcx]
+ mov QWORD[((-40))+rbx],r10
+ adcx r12,rax
+ mov QWORD[((-32))+rbx],r11
+ adox r13,r15
+ mulx r15,rax,QWORD[24+rcx]
+ mov rdx,r9
+ mov QWORD[((-24))+rbx],r12
+ adcx r13,rax
+ adox r15,rbp
+ lea rcx,[32+rcx]
+ mov QWORD[((-16))+rbx],r13
+
+ dec rdi
+ jnz NEAR $L$mulx4x_1st
+
+ mov rax,QWORD[8+rsp]
+ adc r15,rbp
+ lea rsi,[rax*1+rsi]
+ add r14,r15
+ mov rdi,QWORD[((8+8))+rsp]
+ adc rbp,rbp
+ mov QWORD[((-8))+rbx],r14
+ jmp NEAR $L$mulx4x_outer
+
+ALIGN 32
+$L$mulx4x_outer:
+ lea r10,[((16-256))+rbx]
+ pxor xmm4,xmm4
+ DB 0x67,0x67
+ pxor xmm5,xmm5
+ movdqa xmm0,XMMWORD[((-128))+rdi]
+ movdqa xmm1,XMMWORD[((-112))+rdi]
+ movdqa xmm2,XMMWORD[((-96))+rdi]
+ pand xmm0,XMMWORD[256+r10]
+ movdqa xmm3,XMMWORD[((-80))+rdi]
+ pand xmm1,XMMWORD[272+r10]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD[288+r10]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD[304+r10]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD[((-64))+rdi]
+ movdqa xmm1,XMMWORD[((-48))+rdi]
+ movdqa xmm2,XMMWORD[((-32))+rdi]
+ pand xmm0,XMMWORD[320+r10]
+ movdqa xmm3,XMMWORD[((-16))+rdi]
+ pand xmm1,XMMWORD[336+r10]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD[352+r10]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD[368+r10]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD[rdi]
+ movdqa xmm1,XMMWORD[16+rdi]
+ movdqa xmm2,XMMWORD[32+rdi]
+ pand xmm0,XMMWORD[384+r10]
+ movdqa xmm3,XMMWORD[48+rdi]
+ pand xmm1,XMMWORD[400+r10]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD[416+r10]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD[432+r10]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD[64+rdi]
+ movdqa xmm1,XMMWORD[80+rdi]
+ movdqa xmm2,XMMWORD[96+rdi]
+ pand xmm0,XMMWORD[448+r10]
+ movdqa xmm3,XMMWORD[112+rdi]
+ pand xmm1,XMMWORD[464+r10]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD[480+r10]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD[496+r10]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ por xmm4,xmm5
+
+ pshufd xmm0,xmm4,0x4e
+ por xmm0,xmm4
+ lea rdi,[256+rdi]
+DB 102,72,15,126,194
+
+ mov QWORD[rbx],rbp
+ lea rbx,[32+rax*1+rbx]
+ mulx r11,r8,QWORD[rsi]
+ xor rbp,rbp
+ mov r9,rdx
+ mulx r12,r14,QWORD[8+rsi]
+ adox r8,QWORD[((-32))+rbx]
+ adcx r11,r14
+ mulx r13,r15,QWORD[16+rsi]
+ adox r11,QWORD[((-24))+rbx]
+ adcx r12,r15
+ mulx r14,rdx,QWORD[24+rsi]
+ adox r12,QWORD[((-16))+rbx]
+ adcx r13,rdx
+ lea rcx,[rax*1+rcx]
+ lea rsi,[32+rsi]
+ adox r13,QWORD[((-8))+rbx]
+ adcx r14,rbp
+ adox r14,rbp
+
+ mov r15,r8
+ imul r8,QWORD[((32+8))+rsp]
+
+ mov rdx,r8
+ xor rbp,rbp
+ mov QWORD[((8+8))+rsp],rdi
+
+ mulx r10,rax,QWORD[rcx]
+ adcx r15,rax
+ adox r10,r11
+ mulx r11,rax,QWORD[8+rcx]
+ adcx r10,rax
+ adox r11,r12
+ mulx r12,rax,QWORD[16+rcx]
+ adcx r11,rax
+ adox r12,r13
+ mulx r15,rax,QWORD[24+rcx]
+ mov rdx,r9
+ mov rdi,QWORD[((24+8))+rsp]
+ mov QWORD[((-32))+rbx],r10
+ adcx r12,rax
+ mov QWORD[((-24))+rbx],r11
+ adox r15,rbp
+ mov QWORD[((-16))+rbx],r12
+ lea rcx,[32+rcx]
+ jmp NEAR $L$mulx4x_inner
+
+ALIGN 32
+$L$mulx4x_inner:
+ mulx rax,r10,QWORD[rsi]
+ adcx r15,rbp
+ adox r10,r14
+ mulx r14,r11,QWORD[8+rsi]
+ adcx r10,QWORD[rbx]
+ adox r11,rax
+ mulx rax,r12,QWORD[16+rsi]
+ adcx r11,QWORD[8+rbx]
+ adox r12,r14
+ mulx r14,r13,QWORD[24+rsi]
+ mov rdx,r8
+ adcx r12,QWORD[16+rbx]
+ adox r13,rax
+ adcx r13,QWORD[24+rbx]
+ adox r14,rbp
+ lea rsi,[32+rsi]
+ lea rbx,[32+rbx]
+ adcx r14,rbp
+
+ adox r10,r15
+ mulx r15,rax,QWORD[rcx]
+ adcx r10,rax
+ adox r11,r15
+ mulx r15,rax,QWORD[8+rcx]
+ adcx r11,rax
+ adox r12,r15
+ mulx r15,rax,QWORD[16+rcx]
+ mov QWORD[((-40))+rbx],r10
+ adcx r12,rax
+ adox r13,r15
+ mov QWORD[((-32))+rbx],r11
+ mulx r15,rax,QWORD[24+rcx]
+ mov rdx,r9
+ lea rcx,[32+rcx]
+ mov QWORD[((-24))+rbx],r12
+ adcx r13,rax
+ adox r15,rbp
+ mov QWORD[((-16))+rbx],r13
+
+ dec rdi
+ jnz NEAR $L$mulx4x_inner
+
+ mov rax,QWORD[((0+8))+rsp]
+ adc r15,rbp
+ sub rdi,QWORD[rbx]
+ mov rdi,QWORD[((8+8))+rsp]
+ mov r10,QWORD[((16+8))+rsp]
+ adc r14,r15
+ lea rsi,[rax*1+rsi]
+ adc rbp,rbp
+ mov QWORD[((-8))+rbx],r14
+
+ cmp rdi,r10
+ jb NEAR $L$mulx4x_outer
+
+ mov r10,QWORD[((-8))+rcx]
+ mov r8,rbp
+ mov r12,QWORD[rax*1+rcx]
+ lea rbp,[rax*1+rcx]
+ mov rcx,rax
+ lea rdi,[rax*1+rbx]
+ xor eax,eax
+ xor r15,r15
+ sub r10,r14
+ adc r15,r15
+ or r8,r15
+ sar rcx,3+2
+ sub rax,r8
+ mov rdx,QWORD[((56+8))+rsp]
+ dec r12
+ mov r13,QWORD[8+rbp]
+ xor r8,r8
+ mov r14,QWORD[16+rbp]
+ mov r15,QWORD[24+rbp]
+ jmp NEAR $L$sqrx4x_sub_entry
+
+
+
+ALIGN 32
+bn_powerx5:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_bn_powerx5:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+ mov r9,QWORD[48+rsp]
+
+
+
+ mov rax,rsp
+
+$L$powerx5_enter:
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+$L$powerx5_prologue:
+
+ shl r9d,3
+ lea r10,[r9*2+r9]
+ neg r9
+ mov r8,QWORD[r8]
+
+
+
+
+
+
+
+
+ lea r11,[((-320))+r9*2+rsp]
+ mov rbp,rsp
+ sub r11,rdi
+ and r11,4095
+ cmp r10,r11
+ jb NEAR $L$pwrx_sp_alt
+ sub rbp,r11
+ lea rbp,[((-320))+r9*2+rbp]
+ jmp NEAR $L$pwrx_sp_done
+
+ALIGN 32
+$L$pwrx_sp_alt:
+ lea r10,[((4096-320))+r9*2]
+ lea rbp,[((-320))+r9*2+rbp]
+ sub r11,r10
+ mov r10,0
+ cmovc r11,r10
+ sub rbp,r11
+$L$pwrx_sp_done:
+ and rbp,-64
+ mov r11,rsp
+ sub r11,rbp
+ and r11,-4096
+ lea rsp,[rbp*1+r11]
+ mov r10,QWORD[rsp]
+ cmp rsp,rbp
+ ja NEAR $L$pwrx_page_walk
+ jmp NEAR $L$pwrx_page_walk_done
+
+$L$pwrx_page_walk:
+ lea rsp,[((-4096))+rsp]
+ mov r10,QWORD[rsp]
+ cmp rsp,rbp
+ ja NEAR $L$pwrx_page_walk
+$L$pwrx_page_walk_done:
+
+ mov r10,r9
+ neg r9
+
+
+
+
+
+
+
+
+
+
+
+
+ pxor xmm0,xmm0
+DB 102,72,15,110,207
+DB 102,72,15,110,209
+DB 102,73,15,110,218
+DB 102,72,15,110,226
+ mov QWORD[32+rsp],r8
+ mov QWORD[40+rsp],rax
+
+$L$powerx5_body:
+
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+
+ mov r9,r10
+ mov rdi,rsi
+DB 102,72,15,126,209
+DB 102,72,15,126,226
+ mov rax,QWORD[40+rsp]
+
+ call mulx4x_internal
+
+ mov rsi,QWORD[40+rsp]
+
+ mov rax,1
+
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$powerx5_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_bn_powerx5:
+
+global bn_sqrx8x_internal
+
+
+ALIGN 32
+bn_sqrx8x_internal:
+__bn_sqrx8x_internal:
+
+_CET_ENDBR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ lea rdi,[((48+8))+rsp]
+ lea rbp,[r9*1+rsi]
+ mov QWORD[((0+8))+rsp],r9
+ mov QWORD[((8+8))+rsp],rbp
+ jmp NEAR $L$sqr8x_zero_start
+
+ALIGN 32
+ DB 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
+$L$sqrx8x_zero:
+ DB 0x3e
+ movdqa XMMWORD[rdi],xmm0
+ movdqa XMMWORD[16+rdi],xmm0
+ movdqa XMMWORD[32+rdi],xmm0
+ movdqa XMMWORD[48+rdi],xmm0
+$L$sqr8x_zero_start:
+ movdqa XMMWORD[64+rdi],xmm0
+ movdqa XMMWORD[80+rdi],xmm0
+ movdqa XMMWORD[96+rdi],xmm0
+ movdqa XMMWORD[112+rdi],xmm0
+ lea rdi,[128+rdi]
+ sub r9,64
+ jnz NEAR $L$sqrx8x_zero
+
+ mov rdx,QWORD[rsi]
+
+ xor r10,r10
+ xor r11,r11
+ xor r12,r12
+ xor r13,r13
+ xor r14,r14
+ xor r15,r15
+ lea rdi,[((48+8))+rsp]
+ xor rbp,rbp
+ jmp NEAR $L$sqrx8x_outer_loop
+
+ALIGN 32
+$L$sqrx8x_outer_loop:
+ mulx rax,r8,QWORD[8+rsi]
+ adcx r8,r9
+ adox r10,rax
+ mulx rax,r9,QWORD[16+rsi]
+ adcx r9,r10
+ adox r11,rax
+ DB 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
+ adcx r10,r11
+ adox r12,rax
+ DB 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
+ adcx r11,r12
+ adox r13,rax
+ mulx rax,r12,QWORD[40+rsi]
+ adcx r12,r13
+ adox r14,rax
+ mulx rax,r13,QWORD[48+rsi]
+ adcx r13,r14
+ adox rax,r15
+ mulx r15,r14,QWORD[56+rsi]
+ mov rdx,QWORD[8+rsi]
+ adcx r14,rax
+ adox r15,rbp
+ adc r15,QWORD[64+rdi]
+ mov QWORD[8+rdi],r8
+ mov QWORD[16+rdi],r9
+ sbb rcx,rcx
+ xor rbp,rbp
+
+
+ mulx rbx,r8,QWORD[16+rsi]
+ mulx rax,r9,QWORD[24+rsi]
+ adcx r8,r10
+ adox r9,rbx
+ mulx rbx,r10,QWORD[32+rsi]
+ adcx r9,r11
+ adox r10,rax
+ DB 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
+ adcx r10,r12
+ adox r11,rbx
+ DB 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
+ adcx r11,r13
+ adox r12,r14
+ DB 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
+ mov rdx,QWORD[16+rsi]
+ adcx r12,rax
+ adox r13,rbx
+ adcx r13,r15
+ adox r14,rbp
+ adcx r14,rbp
+
+ mov QWORD[24+rdi],r8
+ mov QWORD[32+rdi],r9
+
+ mulx rbx,r8,QWORD[24+rsi]
+ mulx rax,r9,QWORD[32+rsi]
+ adcx r8,r10
+ adox r9,rbx
+ mulx rbx,r10,QWORD[40+rsi]
+ adcx r9,r11
+ adox r10,rax
+ DB 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
+ adcx r10,r12
+ adox r11,r13
+ DB 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
+ DB 0x3e
+ mov rdx,QWORD[24+rsi]
+ adcx r11,rbx
+ adox r12,rax
+ adcx r12,r14
+ mov QWORD[40+rdi],r8
+ mov QWORD[48+rdi],r9
+ mulx rax,r8,QWORD[32+rsi]
+ adox r13,rbp
+ adcx r13,rbp
+
+ mulx rbx,r9,QWORD[40+rsi]
+ adcx r8,r10
+ adox r9,rax
+ mulx rax,r10,QWORD[48+rsi]
+ adcx r9,r11
+ adox r10,r12
+ mulx r12,r11,QWORD[56+rsi]
+ mov rdx,QWORD[32+rsi]
+ mov r14,QWORD[40+rsi]
+ adcx r10,rbx
+ adox r11,rax
+ mov r15,QWORD[48+rsi]
+ adcx r11,r13
+ adox r12,rbp
+ adcx r12,rbp
+
+ mov QWORD[56+rdi],r8
+ mov QWORD[64+rdi],r9
+
+ mulx rax,r9,r14
+ mov r8,QWORD[56+rsi]
+ adcx r9,r10
+ mulx rbx,r10,r15
+ adox r10,rax
+ adcx r10,r11
+ mulx rax,r11,r8
+ mov rdx,r14
+ adox r11,rbx
+ adcx r11,r12
+
+ adcx rax,rbp
+
+ mulx rbx,r14,r15
+ mulx r13,r12,r8
+ mov rdx,r15
+ lea rsi,[64+rsi]
+ adcx r11,r14
+ adox r12,rbx
+ adcx r12,rax
+ adox r13,rbp
+
+ DB 0x67,0x67
+ mulx r14,r8,r8
+ adcx r13,r8
+ adcx r14,rbp
+
+ cmp rsi,QWORD[((8+8))+rsp]
+ je NEAR $L$sqrx8x_outer_break
+
+ neg rcx
+ mov rcx,-8
+ mov r15,rbp
+ mov r8,QWORD[64+rdi]
+ adcx r9,QWORD[72+rdi]
+ adcx r10,QWORD[80+rdi]
+ adcx r11,QWORD[88+rdi]
+ adc r12,QWORD[96+rdi]
+ adc r13,QWORD[104+rdi]
+ adc r14,QWORD[112+rdi]
+ adc r15,QWORD[120+rdi]
+ lea rbp,[rsi]
+ lea rdi,[128+rdi]
+ sbb rax,rax
+
+ mov rdx,QWORD[((-64))+rsi]
+ mov QWORD[((16+8))+rsp],rax
+ mov QWORD[((24+8))+rsp],rdi
+
+
+ xor eax,eax
+ jmp NEAR $L$sqrx8x_loop
+
+ALIGN 32
+$L$sqrx8x_loop:
+ mov rbx,r8
+ mulx r8,rax,QWORD[rbp]
+ adcx rbx,rax
+ adox r8,r9
+
+ mulx r9,rax,QWORD[8+rbp]
+ adcx r8,rax
+ adox r9,r10
+
+ mulx r10,rax,QWORD[16+rbp]
+ adcx r9,rax
+ adox r10,r11
+
+ mulx r11,rax,QWORD[24+rbp]
+ adcx r10,rax
+ adox r11,r12
+
+ DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+ adcx r11,rax
+ adox r12,r13
+
+ mulx r13,rax,QWORD[40+rbp]
+ adcx r12,rax
+ adox r13,r14
+
+ mulx r14,rax,QWORD[48+rbp]
+ mov QWORD[rcx*8+rdi],rbx
+ mov ebx,0
+ adcx r13,rax
+ adox r14,r15
+
+ DB 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
+ mov rdx,QWORD[8+rcx*8+rsi]
+ adcx r14,rax
+ adox r15,rbx
+ adcx r15,rbx
+
+ DB 0x67
+ inc rcx
+ jnz NEAR $L$sqrx8x_loop
+
+ lea rbp,[64+rbp]
+ mov rcx,-8
+ cmp rbp,QWORD[((8+8))+rsp]
+ je NEAR $L$sqrx8x_break
+
+ sub rbx,QWORD[((16+8))+rsp]
+ DB 0x66
+ mov rdx,QWORD[((-64))+rsi]
+ adcx r8,QWORD[rdi]
+ adcx r9,QWORD[8+rdi]
+ adc r10,QWORD[16+rdi]
+ adc r11,QWORD[24+rdi]
+ adc r12,QWORD[32+rdi]
+ adc r13,QWORD[40+rdi]
+ adc r14,QWORD[48+rdi]
+ adc r15,QWORD[56+rdi]
+ lea rdi,[64+rdi]
+ DB 0x67
+ sbb rax,rax
+ xor ebx,ebx
+ mov QWORD[((16+8))+rsp],rax
+ jmp NEAR $L$sqrx8x_loop
+
+ALIGN 32
+$L$sqrx8x_break:
+ xor rbp,rbp
+ sub rbx,QWORD[((16+8))+rsp]
+ adcx r8,rbp
+ mov rcx,QWORD[((24+8))+rsp]
+ adcx r9,rbp
+ mov rdx,QWORD[rsi]
+ adc r10,0
+ mov QWORD[rdi],r8
+ adc r11,0
+ adc r12,0
+ adc r13,0
+ adc r14,0
+ adc r15,0
+ cmp rdi,rcx
+ je NEAR $L$sqrx8x_outer_loop
+
+ mov QWORD[8+rdi],r9
+ mov r9,QWORD[8+rcx]
+ mov QWORD[16+rdi],r10
+ mov r10,QWORD[16+rcx]
+ mov QWORD[24+rdi],r11
+ mov r11,QWORD[24+rcx]
+ mov QWORD[32+rdi],r12
+ mov r12,QWORD[32+rcx]
+ mov QWORD[40+rdi],r13
+ mov r13,QWORD[40+rcx]
+ mov QWORD[48+rdi],r14
+ mov r14,QWORD[48+rcx]
+ mov QWORD[56+rdi],r15
+ mov r15,QWORD[56+rcx]
+ mov rdi,rcx
+ jmp NEAR $L$sqrx8x_outer_loop
+
+ALIGN 32
+$L$sqrx8x_outer_break:
+ mov QWORD[72+rdi],r9
+DB 102,72,15,126,217
+ mov QWORD[80+rdi],r10
+ mov QWORD[88+rdi],r11
+ mov QWORD[96+rdi],r12
+ mov QWORD[104+rdi],r13
+ mov QWORD[112+rdi],r14
+ lea rdi,[((48+8))+rsp]
+ mov rdx,QWORD[rcx*1+rsi]
+
+ mov r11,QWORD[8+rdi]
+ xor r10,r10
+ mov r9,QWORD[((0+8))+rsp]
+ adox r11,r11
+ mov r12,QWORD[16+rdi]
+ mov r13,QWORD[24+rdi]
+
+
+ALIGN 32
+$L$sqrx4x_shift_n_add:
+ mulx rbx,rax,rdx
+ adox r12,r12
+ adcx rax,r10
+ DB 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
+ DB 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
+ adox r13,r13
+ adcx rbx,r11
+ mov r11,QWORD[40+rdi]
+ mov QWORD[rdi],rax
+ mov QWORD[8+rdi],rbx
+
+ mulx rbx,rax,rdx
+ adox r10,r10
+ adcx rax,r12
+ mov rdx,QWORD[16+rcx*1+rsi]
+ mov r12,QWORD[48+rdi]
+ adox r11,r11
+ adcx rbx,r13
+ mov r13,QWORD[56+rdi]
+ mov QWORD[16+rdi],rax
+ mov QWORD[24+rdi],rbx
+
+ mulx rbx,rax,rdx
+ adox r12,r12
+ adcx rax,r10
+ mov rdx,QWORD[24+rcx*1+rsi]
+ lea rcx,[32+rcx]
+ mov r10,QWORD[64+rdi]
+ adox r13,r13
+ adcx rbx,r11
+ mov r11,QWORD[72+rdi]
+ mov QWORD[32+rdi],rax
+ mov QWORD[40+rdi],rbx
+
+ mulx rbx,rax,rdx
+ adox r10,r10
+ adcx rax,r12
+ jrcxz $L$sqrx4x_shift_n_add_break
+ DB 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
+ adox r11,r11
+ adcx rbx,r13
+ mov r12,QWORD[80+rdi]
+ mov r13,QWORD[88+rdi]
+ mov QWORD[48+rdi],rax
+ mov QWORD[56+rdi],rbx
+ lea rdi,[64+rdi]
+ nop
+ jmp NEAR $L$sqrx4x_shift_n_add
+
+ALIGN 32
+$L$sqrx4x_shift_n_add_break:
+ adcx rbx,r13
+ mov QWORD[48+rdi],rax
+ mov QWORD[56+rdi],rbx
+ lea rdi,[64+rdi]
+DB 102,72,15,126,213
+__bn_sqrx8x_reduction:
+ xor eax,eax
+ mov rbx,QWORD[((32+8))+rsp]
+ mov rdx,QWORD[((48+8))+rsp]
+ lea rcx,[((-64))+r9*1+rbp]
+
+ mov QWORD[((0+8))+rsp],rcx
+ mov QWORD[((8+8))+rsp],rdi
+
+ lea rdi,[((48+8))+rsp]
+ jmp NEAR $L$sqrx8x_reduction_loop
+
+ALIGN 32
+$L$sqrx8x_reduction_loop:
+ mov r9,QWORD[8+rdi]
+ mov r10,QWORD[16+rdi]
+ mov r11,QWORD[24+rdi]
+ mov r12,QWORD[32+rdi]
+ mov r8,rdx
+ imul rdx,rbx
+ mov r13,QWORD[40+rdi]
+ mov r14,QWORD[48+rdi]
+ mov r15,QWORD[56+rdi]
+ mov QWORD[((24+8))+rsp],rax
+
+ lea rdi,[64+rdi]
+ xor rsi,rsi
+ mov rcx,-8
+ jmp NEAR $L$sqrx8x_reduce
+
+ALIGN 32
+$L$sqrx8x_reduce:
+ mov rbx,r8
+ mulx r8,rax,QWORD[rbp]
+ adcx rax,rbx
+ adox r8,r9
+
+ mulx r9,rbx,QWORD[8+rbp]
+ adcx r8,rbx
+ adox r9,r10
+
+ mulx r10,rbx,QWORD[16+rbp]
+ adcx r9,rbx
+ adox r10,r11
+
+ mulx r11,rbx,QWORD[24+rbp]
+ adcx r10,rbx
+ adox r11,r12
+
+ DB 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
+ mov rax,rdx
+ mov rdx,r8
+ adcx r11,rbx
+ adox r12,r13
+
+ mulx rdx,rbx,QWORD[((32+8))+rsp]
+ mov rdx,rax
+ mov QWORD[((64+48+8))+rcx*8+rsp],rax
+
+ mulx r13,rax,QWORD[40+rbp]
+ adcx r12,rax
+ adox r13,r14
+
+ mulx r14,rax,QWORD[48+rbp]
+ adcx r13,rax
+ adox r14,r15
+
+ mulx r15,rax,QWORD[56+rbp]
+ mov rdx,rbx
+ adcx r14,rax
+ adox r15,rsi
+ adcx r15,rsi
+
+ DB 0x67,0x67,0x67
+ inc rcx
+ jnz NEAR $L$sqrx8x_reduce
+
+ mov rax,rsi
+ cmp rbp,QWORD[((0+8))+rsp]
+ jae NEAR $L$sqrx8x_no_tail
+
+ mov rdx,QWORD[((48+8))+rsp]
+ add r8,QWORD[rdi]
+ lea rbp,[64+rbp]
+ mov rcx,-8
+ adcx r9,QWORD[8+rdi]
+ adcx r10,QWORD[16+rdi]
+ adc r11,QWORD[24+rdi]
+ adc r12,QWORD[32+rdi]
+ adc r13,QWORD[40+rdi]
+ adc r14,QWORD[48+rdi]
+ adc r15,QWORD[56+rdi]
+ lea rdi,[64+rdi]
+ sbb rax,rax
+
+ xor rsi,rsi
+ mov QWORD[((16+8))+rsp],rax
+ jmp NEAR $L$sqrx8x_tail
+
+ALIGN 32
+$L$sqrx8x_tail:
+ mov rbx,r8
+ mulx r8,rax,QWORD[rbp]
+ adcx rbx,rax
+ adox r8,r9
+
+ mulx r9,rax,QWORD[8+rbp]
+ adcx r8,rax
+ adox r9,r10
+
+ mulx r10,rax,QWORD[16+rbp]
+ adcx r9,rax
+ adox r10,r11
+
+ mulx r11,rax,QWORD[24+rbp]
+ adcx r10,rax
+ adox r11,r12
+
+ DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+ adcx r11,rax
+ adox r12,r13
+
+ mulx r13,rax,QWORD[40+rbp]
+ adcx r12,rax
+ adox r13,r14
+
+ mulx r14,rax,QWORD[48+rbp]
+ adcx r13,rax
+ adox r14,r15
+
+ mulx r15,rax,QWORD[56+rbp]
+ mov rdx,QWORD[((72+48+8))+rcx*8+rsp]
+ adcx r14,rax
+ adox r15,rsi
+ mov QWORD[rcx*8+rdi],rbx
+ mov rbx,r8
+ adcx r15,rsi
+
+ inc rcx
+ jnz NEAR $L$sqrx8x_tail
+
+ cmp rbp,QWORD[((0+8))+rsp]
+ jae NEAR $L$sqrx8x_tail_done
+
+ sub rsi,QWORD[((16+8))+rsp]
+ mov rdx,QWORD[((48+8))+rsp]
+ lea rbp,[64+rbp]
+ adc r8,QWORD[rdi]
+ adc r9,QWORD[8+rdi]
+ adc r10,QWORD[16+rdi]
+ adc r11,QWORD[24+rdi]
+ adc r12,QWORD[32+rdi]
+ adc r13,QWORD[40+rdi]
+ adc r14,QWORD[48+rdi]
+ adc r15,QWORD[56+rdi]
+ lea rdi,[64+rdi]
+ sbb rax,rax
+ sub rcx,8
+
+ xor rsi,rsi
+ mov QWORD[((16+8))+rsp],rax
+ jmp NEAR $L$sqrx8x_tail
+
+ALIGN 32
+$L$sqrx8x_tail_done:
+ xor rax,rax
+ add r8,QWORD[((24+8))+rsp]
+ adc r9,0
+ adc r10,0
+ adc r11,0
+ adc r12,0
+ adc r13,0
+ adc r14,0
+ adc r15,0
+ adc rax,0
+
+ sub rsi,QWORD[((16+8))+rsp]
+$L$sqrx8x_no_tail:
+ adc r8,QWORD[rdi]
+DB 102,72,15,126,217
+ adc r9,QWORD[8+rdi]
+ mov rsi,QWORD[56+rbp]
+DB 102,72,15,126,213
+ adc r10,QWORD[16+rdi]
+ adc r11,QWORD[24+rdi]
+ adc r12,QWORD[32+rdi]
+ adc r13,QWORD[40+rdi]
+ adc r14,QWORD[48+rdi]
+ adc r15,QWORD[56+rdi]
+ adc rax,0
+
+ mov rbx,QWORD[((32+8))+rsp]
+ mov rdx,QWORD[64+rcx*1+rdi]
+
+ mov QWORD[rdi],r8
+ lea r8,[64+rdi]
+ mov QWORD[8+rdi],r9
+ mov QWORD[16+rdi],r10
+ mov QWORD[24+rdi],r11
+ mov QWORD[32+rdi],r12
+ mov QWORD[40+rdi],r13
+ mov QWORD[48+rdi],r14
+ mov QWORD[56+rdi],r15
+
+ lea rdi,[64+rcx*1+rdi]
+ cmp r8,QWORD[((8+8))+rsp]
+ jb NEAR $L$sqrx8x_reduction_loop
+ ret
+
+
+ALIGN 32
+
+__bn_postx4x_internal:
+
+ mov r12,QWORD[rbp]
+ mov r10,rcx
+ mov r9,rcx
+ neg rax
+ sar rcx,3+2
+
+DB 102,72,15,126,202
+DB 102,72,15,126,206
+ dec r12
+ mov r13,QWORD[8+rbp]
+ xor r8,r8
+ mov r14,QWORD[16+rbp]
+ mov r15,QWORD[24+rbp]
+ jmp NEAR $L$sqrx4x_sub_entry
+
+ALIGN 16
+$L$sqrx4x_sub:
+ mov r12,QWORD[rbp]
+ mov r13,QWORD[8+rbp]
+ mov r14,QWORD[16+rbp]
+ mov r15,QWORD[24+rbp]
+$L$sqrx4x_sub_entry:
+ andn r12,r12,rax
+ lea rbp,[32+rbp]
+ andn r13,r13,rax
+ andn r14,r14,rax
+ andn r15,r15,rax
+
+ neg r8
+ adc r12,QWORD[rdi]
+ adc r13,QWORD[8+rdi]
+ adc r14,QWORD[16+rdi]
+ adc r15,QWORD[24+rdi]
+ mov QWORD[rdx],r12
+ lea rdi,[32+rdi]
+ mov QWORD[8+rdx],r13
+ sbb r8,r8
+ mov QWORD[16+rdx],r14
+ mov QWORD[24+rdx],r15
+ lea rdx,[32+rdx]
+
+ inc rcx
+ jnz NEAR $L$sqrx4x_sub
+
+ neg r9
+
+ ret
+
+
+global bn_scatter5
+
+ALIGN 16
+bn_scatter5:
+
+_CET_ENDBR
+ cmp edx,0
+ jz NEAR $L$scatter_epilogue
+
+
+
+
+
+
+
+
+
+ lea r8,[r9*8+r8]
+$L$scatter:
+ mov rax,QWORD[rcx]
+ lea rcx,[8+rcx]
+ mov QWORD[r8],rax
+ lea r8,[256+r8]
+ sub edx,1
+ jnz NEAR $L$scatter
+$L$scatter_epilogue:
+ ret
+
+
+
+global bn_gather5
+
+ALIGN 32
+bn_gather5:
+
+$L$SEH_begin_bn_gather5:
+_CET_ENDBR
+
+ DB 0x4c,0x8d,0x14,0x24
+
+ DB 0x48,0x81,0xec,0x08,0x01,0x00,0x00
+ lea rax,[$L$inc]
+ and rsp,-16
+
+ movd xmm5,r9d
+ movdqa xmm0,XMMWORD[rax]
+ movdqa xmm1,XMMWORD[16+rax]
+ lea r11,[128+r8]
+ lea rax,[128+rsp]
+
+ pshufd xmm5,xmm5,0
+ movdqa xmm4,xmm1
+ movdqa xmm2,xmm1
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa xmm3,xmm4
+
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD[(-128)+rax],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD[(-112)+rax],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD[(-96)+rax],xmm2
+ movdqa xmm2,xmm4
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD[(-80)+rax],xmm3
+ movdqa xmm3,xmm4
+
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD[(-64)+rax],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD[(-48)+rax],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD[(-32)+rax],xmm2
+ movdqa xmm2,xmm4
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD[(-16)+rax],xmm3
+ movdqa xmm3,xmm4
+
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD[rax],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD[16+rax],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD[32+rax],xmm2
+ movdqa xmm2,xmm4
+ paddd xmm1,xmm0
+ pcmpeqd xmm0,xmm5
+ movdqa XMMWORD[48+rax],xmm3
+ movdqa xmm3,xmm4
+
+ paddd xmm2,xmm1
+ pcmpeqd xmm1,xmm5
+ movdqa XMMWORD[64+rax],xmm0
+ movdqa xmm0,xmm4
+
+ paddd xmm3,xmm2
+ pcmpeqd xmm2,xmm5
+ movdqa XMMWORD[80+rax],xmm1
+ movdqa xmm1,xmm4
+
+ paddd xmm0,xmm3
+ pcmpeqd xmm3,xmm5
+ movdqa XMMWORD[96+rax],xmm2
+ movdqa xmm2,xmm4
+ movdqa XMMWORD[112+rax],xmm3
+ jmp NEAR $L$gather
+
+ALIGN 32
+$L$gather:
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ movdqa xmm0,XMMWORD[((-128))+r11]
+ movdqa xmm1,XMMWORD[((-112))+r11]
+ movdqa xmm2,XMMWORD[((-96))+r11]
+ pand xmm0,XMMWORD[((-128))+rax]
+ movdqa xmm3,XMMWORD[((-80))+r11]
+ pand xmm1,XMMWORD[((-112))+rax]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD[((-96))+rax]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD[((-80))+rax]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD[((-64))+r11]
+ movdqa xmm1,XMMWORD[((-48))+r11]
+ movdqa xmm2,XMMWORD[((-32))+r11]
+ pand xmm0,XMMWORD[((-64))+rax]
+ movdqa xmm3,XMMWORD[((-16))+r11]
+ pand xmm1,XMMWORD[((-48))+rax]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD[((-32))+rax]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD[((-16))+rax]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD[r11]
+ movdqa xmm1,XMMWORD[16+r11]
+ movdqa xmm2,XMMWORD[32+r11]
+ pand xmm0,XMMWORD[rax]
+ movdqa xmm3,XMMWORD[48+r11]
+ pand xmm1,XMMWORD[16+rax]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD[32+rax]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD[48+rax]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ movdqa xmm0,XMMWORD[64+r11]
+ movdqa xmm1,XMMWORD[80+r11]
+ movdqa xmm2,XMMWORD[96+r11]
+ pand xmm0,XMMWORD[64+rax]
+ movdqa xmm3,XMMWORD[112+r11]
+ pand xmm1,XMMWORD[80+rax]
+ por xmm4,xmm0
+ pand xmm2,XMMWORD[96+rax]
+ por xmm5,xmm1
+ pand xmm3,XMMWORD[112+rax]
+ por xmm4,xmm2
+ por xmm5,xmm3
+ por xmm4,xmm5
+ lea r11,[256+r11]
+
+ pshufd xmm0,xmm4,0x4e
+ por xmm0,xmm4
+ movq QWORD[rcx],xmm0
+ lea rcx,[8+rcx]
+ sub edx,1
+ jnz NEAR $L$gather
+
+ lea rsp,[r10]
+
+ ret
+$L$SEH_end_bn_gather5:
+
+
+section .rdata rdata align=8
+ALIGN 64
+$L$inc:
+ DD 0,0,1,1
+ DD 2,2,2,2
+ DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
+ DB 112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115
+ DB 99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111
+ DB 114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79
+ DB 71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111
+ DB 112,101,110,115,115,108,46,111,114,103,62,0
+section .text
+
+EXTERN __imp_RtlVirtualUnwind
+
+ALIGN 16
+mul_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ mov rsi,QWORD[8+r9]
+ mov r11,QWORD[56+r9]
+
+ mov r10d,DWORD[r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$common_seh_tail
+
+ mov r10d,DWORD[4+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$common_pop_regs
+
+ mov rax,QWORD[152+r8]
+
+ mov r10d,DWORD[8+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$common_seh_tail
+
+ lea r10,[$L$mul_epilogue]
+ cmp rbx,r10
+ ja NEAR $L$body_40
+
+ mov r10,QWORD[192+r8]
+ mov rax,QWORD[8+r10*8+rax]
+
+ jmp NEAR $L$common_pop_regs
+
+$L$body_40:
+ mov rax,QWORD[40+rax]
+$L$common_pop_regs:
+ mov rbx,QWORD[((-8))+rax]
+ mov rbp,QWORD[((-16))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov r15,QWORD[((-48))+rax]
+ mov QWORD[144+r8],rbx
+ mov QWORD[160+r8],rbp
+ mov QWORD[216+r8],r12
+ mov QWORD[224+r8],r13
+ mov QWORD[232+r8],r14
+ mov QWORD[240+r8],r15
+
+$L$common_seh_tail:
+ mov rdi,QWORD[8+rax]
+ mov rsi,QWORD[16+rax]
+ mov QWORD[152+r8],rax
+ mov QWORD[168+r8],rsi
+ mov QWORD[176+r8],rdi
+
+ mov rdi,QWORD[40+r9]
+ mov rsi,r8
+ mov ecx,154
+ DD 0xa548f3fc
+
+ mov rsi,r9
+ xor rcx,rcx
+ mov rdx,QWORD[8+rsi]
+ mov r8,QWORD[rsi]
+ mov r9,QWORD[16+rsi]
+ mov r10,QWORD[40+rsi]
+ lea r11,[56+rsi]
+ lea r12,[24+rsi]
+ mov QWORD[32+rsp],r10
+ mov QWORD[40+rsp],r11
+ mov QWORD[48+rsp],r12
+ mov QWORD[56+rsp],rcx
+ call QWORD[__imp_RtlVirtualUnwind]
+
+ mov eax,1
+ add rsp,64
+ popfq
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ ret
+
+
+section .pdata rdata align=4
+ALIGN 4
+ DD $L$SEH_begin_bn_mul_mont_gather5 wrt ..imagebase
+ DD $L$SEH_end_bn_mul_mont_gather5 wrt ..imagebase
+ DD $L$SEH_info_bn_mul_mont_gather5 wrt ..imagebase
+
+ DD $L$SEH_begin_bn_mul4x_mont_gather5 wrt ..imagebase
+ DD $L$SEH_end_bn_mul4x_mont_gather5 wrt ..imagebase
+ DD $L$SEH_info_bn_mul4x_mont_gather5 wrt ..imagebase
+
+ DD $L$SEH_begin_bn_power5 wrt ..imagebase
+ DD $L$SEH_end_bn_power5 wrt ..imagebase
+ DD $L$SEH_info_bn_power5 wrt ..imagebase
+ DD $L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase
+ DD $L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase
+ DD $L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase
+
+ DD $L$SEH_begin_bn_powerx5 wrt ..imagebase
+ DD $L$SEH_end_bn_powerx5 wrt ..imagebase
+ DD $L$SEH_info_bn_powerx5 wrt ..imagebase
+ DD $L$SEH_begin_bn_gather5 wrt ..imagebase
+ DD $L$SEH_end_bn_gather5 wrt ..imagebase
+ DD $L$SEH_info_bn_gather5 wrt ..imagebase
+
+section .xdata rdata align=8
+ALIGN 8
+$L$SEH_info_bn_mul_mont_gather5:
+ DB 9,0,0,0
+ DD mul_handler wrt ..imagebase
+ DD $L$mul_body wrt ..imagebase,$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
+ALIGN 8
+$L$SEH_info_bn_mul4x_mont_gather5:
+ DB 9,0,0,0
+ DD mul_handler wrt ..imagebase
+ DD $L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
+ALIGN 8
+$L$SEH_info_bn_power5:
+ DB 9,0,0,0
+ DD mul_handler wrt ..imagebase
+ DD $L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase
+ALIGN 8
+$L$SEH_info_bn_mulx4x_mont_gather5:
+ DB 9,0,0,0
+ DD mul_handler wrt ..imagebase
+ DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase
+ALIGN 8
+$L$SEH_info_bn_powerx5:
+ DB 9,0,0,0
+ DD mul_handler wrt ..imagebase
+ DD $L$powerx5_prologue wrt ..imagebase,$L$powerx5_body wrt ..imagebase,$L$powerx5_epilogue wrt ..imagebase
+ALIGN 8
+$L$SEH_info_bn_gather5:
+ DB 0x01,0x0b,0x03,0x0a
+ DB 0x0b,0x01,0x21,0x00
+ DB 0x04,0xa3,0x00,0x00
+ALIGN 8
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/crypto/aes128gcmsiv-x86_64-apple.S b/gen/crypto/aes128gcmsiv-x86_64-apple.S
new file mode 100644
index 0000000..81e2f07
--- /dev/null
+++ b/gen/crypto/aes128gcmsiv-x86_64-apple.S
@@ -0,0 +1,3081 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.section __DATA,__const
+
+.p2align 4
+one:
+.quad 1,0
+two:
+.quad 2,0
+three:
+.quad 3,0
+four:
+.quad 4,0
+five:
+.quad 5,0
+six:
+.quad 6,0
+seven:
+.quad 7,0
+eight:
+.quad 8,0
+
+OR_MASK:
+.long 0x00000000,0x00000000,0x00000000,0x80000000
+poly:
+.quad 0x1, 0xc200000000000000
+mask:
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+con1:
+.long 1,1,1,1
+con2:
+.long 0x1b,0x1b,0x1b,0x1b
+con3:
+.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
+and_mask:
+.long 0,0xffffffff, 0xffffffff, 0xffffffff
+.text
+
+.p2align 4
+GFMUL:
+
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm5
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm3,%xmm5,%xmm5
+
+ vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3
+ vpshufd $78,%xmm2,%xmm4
+ vpxor %xmm4,%xmm3,%xmm2
+
+ vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3
+ vpshufd $78,%xmm2,%xmm4
+ vpxor %xmm4,%xmm3,%xmm2
+
+ vpxor %xmm5,%xmm2,%xmm0
+ ret
+
+
+.globl _aesgcmsiv_htable_init
+.private_extern _aesgcmsiv_htable_init
+
+.p2align 4
+_aesgcmsiv_htable_init:
+
+_CET_ENDBR
+ vmovdqa (%rsi),%xmm0
+ vmovdqa %xmm0,%xmm1
+ vmovdqa %xmm0,(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,16(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,32(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,48(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,64(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,80(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,96(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,112(%rdi)
+ ret
+
+
+.globl _aesgcmsiv_htable6_init
+.private_extern _aesgcmsiv_htable6_init
+
+.p2align 4
+_aesgcmsiv_htable6_init:
+
+_CET_ENDBR
+ vmovdqa (%rsi),%xmm0
+ vmovdqa %xmm0,%xmm1
+ vmovdqa %xmm0,(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,16(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,32(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,48(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,64(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,80(%rdi)
+ ret
+
+
+.globl _aesgcmsiv_htable_polyval
+.private_extern _aesgcmsiv_htable_polyval
+
+.p2align 4
+_aesgcmsiv_htable_polyval:
+
+_CET_ENDBR
+ testq %rdx,%rdx
+ jnz L$htable_polyval_start
+ ret
+
+L$htable_polyval_start:
+ vzeroall
+
+
+
+ movq %rdx,%r11
+ andq $127,%r11
+
+ jz L$htable_polyval_no_prefix
+
+ vpxor %xmm9,%xmm9,%xmm9
+ vmovdqa (%rcx),%xmm1
+ subq %r11,%rdx
+
+ subq $16,%r11
+
+
+ vmovdqu (%rsi),%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+
+ vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm5
+ vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm3
+ vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm4
+ vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+ leaq 16(%rsi),%rsi
+ testq %r11,%r11
+ jnz L$htable_polyval_prefix_loop
+ jmp L$htable_polyval_prefix_complete
+
+
+.p2align 6
+L$htable_polyval_prefix_loop:
+ subq $16,%r11
+
+ vmovdqu (%rsi),%xmm0
+
+ vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+ testq %r11,%r11
+
+ leaq 16(%rsi),%rsi
+
+ jnz L$htable_polyval_prefix_loop
+
+L$htable_polyval_prefix_complete:
+ vpsrldq $8,%xmm5,%xmm6
+ vpslldq $8,%xmm5,%xmm5
+
+ vpxor %xmm6,%xmm4,%xmm9
+ vpxor %xmm5,%xmm3,%xmm1
+
+ jmp L$htable_polyval_main_loop
+
+L$htable_polyval_no_prefix:
+
+
+
+
+ vpxor %xmm1,%xmm1,%xmm1
+ vmovdqa (%rcx),%xmm9
+
+.p2align 6
+L$htable_polyval_main_loop:
+ subq $0x80,%rdx
+ jb L$htable_polyval_out
+
+ vmovdqu 112(%rsi),%xmm0
+
+ vpclmulqdq $0x01,(%rdi),%xmm0,%xmm5
+ vpclmulqdq $0x00,(%rdi),%xmm0,%xmm3
+ vpclmulqdq $0x11,(%rdi),%xmm0,%xmm4
+ vpclmulqdq $0x10,(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vmovdqu 96(%rsi),%xmm0
+ vpclmulqdq $0x01,16(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,16(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,16(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,16(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+
+ vmovdqu 80(%rsi),%xmm0
+
+ vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7
+ vpalignr $8,%xmm1,%xmm1,%xmm1
+
+ vpclmulqdq $0x01,32(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,32(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,32(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,32(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vpxor %xmm7,%xmm1,%xmm1
+
+ vmovdqu 64(%rsi),%xmm0
+
+ vpclmulqdq $0x01,48(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,48(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,48(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,48(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vmovdqu 48(%rsi),%xmm0
+
+ vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7
+ vpalignr $8,%xmm1,%xmm1,%xmm1
+
+ vpclmulqdq $0x01,64(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,64(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,64(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,64(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vpxor %xmm7,%xmm1,%xmm1
+
+ vmovdqu 32(%rsi),%xmm0
+
+ vpclmulqdq $0x01,80(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,80(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,80(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,80(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vpxor %xmm9,%xmm1,%xmm1
+
+ vmovdqu 16(%rsi),%xmm0
+
+ vpclmulqdq $0x01,96(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,96(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,96(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,96(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vmovdqu 0(%rsi),%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+
+ vpclmulqdq $0x01,112(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,112(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,112(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,112(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vpsrldq $8,%xmm5,%xmm6
+ vpslldq $8,%xmm5,%xmm5
+
+ vpxor %xmm6,%xmm4,%xmm9
+ vpxor %xmm5,%xmm3,%xmm1
+
+ leaq 128(%rsi),%rsi
+ jmp L$htable_polyval_main_loop
+
+
+
+L$htable_polyval_out:
+ vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6
+ vpalignr $8,%xmm1,%xmm1,%xmm1
+ vpxor %xmm6,%xmm1,%xmm1
+
+ vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6
+ vpalignr $8,%xmm1,%xmm1,%xmm1
+ vpxor %xmm6,%xmm1,%xmm1
+ vpxor %xmm9,%xmm1,%xmm1
+
+ vmovdqu %xmm1,(%rcx)
+ vzeroupper
+ ret
+
+
+.globl _aesgcmsiv_polyval_horner
+.private_extern _aesgcmsiv_polyval_horner
+
+.p2align 4
+_aesgcmsiv_polyval_horner:
+
+_CET_ENDBR
+ testq %rcx,%rcx
+ jnz L$polyval_horner_start
+ ret
+
+L$polyval_horner_start:
+
+
+
+ xorq %r10,%r10
+ shlq $4,%rcx
+
+ vmovdqa (%rsi),%xmm1
+ vmovdqa (%rdi),%xmm0
+
+L$polyval_horner_loop:
+ vpxor (%rdx,%r10,1),%xmm0,%xmm0
+ call GFMUL
+
+ addq $16,%r10
+ cmpq %r10,%rcx
+ jne L$polyval_horner_loop
+
+
+ vmovdqa %xmm0,(%rdi)
+ ret
+
+
+.globl _aes128gcmsiv_aes_ks
+.private_extern _aes128gcmsiv_aes_ks
+
+.p2align 4
+_aes128gcmsiv_aes_ks:
+
+_CET_ENDBR
+ vmovdqu (%rdi),%xmm1
+ vmovdqa %xmm1,(%rsi)
+
+ vmovdqa con1(%rip),%xmm0
+ vmovdqa mask(%rip),%xmm15
+
+ movq $8,%rax
+
+L$ks128_loop:
+ addq $16,%rsi
+ subq $1,%rax
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpslldq $4,%xmm3,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpslldq $4,%xmm3,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm1,(%rsi)
+ jne L$ks128_loop
+
+ vmovdqa con2(%rip),%xmm0
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpslldq $4,%xmm3,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpslldq $4,%xmm3,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm1,16(%rsi)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslldq $4,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpslldq $4,%xmm3,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpslldq $4,%xmm3,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm1,32(%rsi)
+ ret
+
+
+.globl _aes256gcmsiv_aes_ks
+.private_extern _aes256gcmsiv_aes_ks
+
+.p2align 4
+_aes256gcmsiv_aes_ks:
+
+_CET_ENDBR
+ vmovdqu (%rdi),%xmm1
+ vmovdqu 16(%rdi),%xmm3
+ vmovdqa %xmm1,(%rsi)
+ vmovdqa %xmm3,16(%rsi)
+ vmovdqa con1(%rip),%xmm0
+ vmovdqa mask(%rip),%xmm15
+ vpxor %xmm14,%xmm14,%xmm14
+ movq $6,%rax
+
+L$ks256_loop:
+ addq $32,%rsi
+ subq $1,%rax
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm1,(%rsi)
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpsllq $32,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpshufb con3(%rip),%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vmovdqa %xmm3,16(%rsi)
+ jne L$ks256_loop
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpsllq $32,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm1,32(%rsi)
+ ret
+
+.globl _aes128gcmsiv_aes_ks_enc_x1
+.private_extern _aes128gcmsiv_aes_ks_enc_x1
+
+.p2align 4
+_aes128gcmsiv_aes_ks_enc_x1:
+
+_CET_ENDBR
+ vmovdqa (%rcx),%xmm1
+ vmovdqa 0(%rdi),%xmm4
+
+ vmovdqa %xmm1,(%rdx)
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vmovdqa con1(%rip),%xmm0
+ vmovdqa mask(%rip),%xmm15
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,16(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,32(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,48(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,64(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,80(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,96(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,112(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,128(%rdx)
+
+
+ vmovdqa con2(%rip),%xmm0
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,144(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenclast %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,160(%rdx)
+
+
+ vmovdqa %xmm4,0(%rsi)
+ ret
+
+
+.globl _aes128gcmsiv_kdf
+.private_extern _aes128gcmsiv_kdf
+
+.p2align 4
+_aes128gcmsiv_kdf:
+
+_CET_ENDBR
+
+
+
+
+ vmovdqa (%rdx),%xmm1
+ vmovdqa 0(%rdi),%xmm9
+ vmovdqa and_mask(%rip),%xmm12
+ vmovdqa one(%rip),%xmm13
+ vpshufd $0x90,%xmm9,%xmm9
+ vpand %xmm12,%xmm9,%xmm9
+ vpaddd %xmm13,%xmm9,%xmm10
+ vpaddd %xmm13,%xmm10,%xmm11
+ vpaddd %xmm13,%xmm11,%xmm12
+
+ vpxor %xmm1,%xmm9,%xmm9
+ vpxor %xmm1,%xmm10,%xmm10
+ vpxor %xmm1,%xmm11,%xmm11
+ vpxor %xmm1,%xmm12,%xmm12
+
+ vmovdqa 16(%rdx),%xmm1
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+
+ vmovdqa 32(%rdx),%xmm2
+ vaesenc %xmm2,%xmm9,%xmm9
+ vaesenc %xmm2,%xmm10,%xmm10
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+
+ vmovdqa 48(%rdx),%xmm1
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+
+ vmovdqa 64(%rdx),%xmm2
+ vaesenc %xmm2,%xmm9,%xmm9
+ vaesenc %xmm2,%xmm10,%xmm10
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+
+ vmovdqa 80(%rdx),%xmm1
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+
+ vmovdqa 96(%rdx),%xmm2
+ vaesenc %xmm2,%xmm9,%xmm9
+ vaesenc %xmm2,%xmm10,%xmm10
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+
+ vmovdqa 112(%rdx),%xmm1
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+
+ vmovdqa 128(%rdx),%xmm2
+ vaesenc %xmm2,%xmm9,%xmm9
+ vaesenc %xmm2,%xmm10,%xmm10
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+
+ vmovdqa 144(%rdx),%xmm1
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+
+ vmovdqa 160(%rdx),%xmm2
+ vaesenclast %xmm2,%xmm9,%xmm9
+ vaesenclast %xmm2,%xmm10,%xmm10
+ vaesenclast %xmm2,%xmm11,%xmm11
+ vaesenclast %xmm2,%xmm12,%xmm12
+
+
+ vmovdqa %xmm9,0(%rsi)
+ vmovdqa %xmm10,16(%rsi)
+ vmovdqa %xmm11,32(%rsi)
+ vmovdqa %xmm12,48(%rsi)
+ ret
+
+
+.globl _aes128gcmsiv_enc_msg_x4
+.private_extern _aes128gcmsiv_enc_msg_x4
+
+.p2align 4
+_aes128gcmsiv_enc_msg_x4:
+
+_CET_ENDBR
+ testq %r8,%r8
+ jnz L$128_enc_msg_x4_start
+ ret
+
+L$128_enc_msg_x4_start:
+ pushq %r12
+
+ pushq %r13
+
+
+ shrq $4,%r8
+ movq %r8,%r10
+ shlq $62,%r10
+ shrq $62,%r10
+
+
+ vmovdqa (%rdx),%xmm15
+ vpor OR_MASK(%rip),%xmm15,%xmm15
+
+ vmovdqu four(%rip),%xmm4
+ vmovdqa %xmm15,%xmm0
+ vpaddd one(%rip),%xmm15,%xmm1
+ vpaddd two(%rip),%xmm15,%xmm2
+ vpaddd three(%rip),%xmm15,%xmm3
+
+ shrq $2,%r8
+ je L$128_enc_msg_x4_check_remainder
+
+ subq $64,%rsi
+ subq $64,%rdi
+
+L$128_enc_msg_x4_loop1:
+ addq $64,%rsi
+ addq $64,%rdi
+
+ vmovdqa %xmm0,%xmm5
+ vmovdqa %xmm1,%xmm6
+ vmovdqa %xmm2,%xmm7
+ vmovdqa %xmm3,%xmm8
+
+ vpxor (%rcx),%xmm5,%xmm5
+ vpxor (%rcx),%xmm6,%xmm6
+ vpxor (%rcx),%xmm7,%xmm7
+ vpxor (%rcx),%xmm8,%xmm8
+
+ vmovdqu 16(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm0,%xmm0
+ vmovdqu 32(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm1,%xmm1
+ vmovdqu 48(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm2,%xmm2
+ vmovdqu 64(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm3,%xmm3
+
+ vmovdqu 80(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 96(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 112(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 128(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 144(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 160(%rcx),%xmm12
+ vaesenclast %xmm12,%xmm5,%xmm5
+ vaesenclast %xmm12,%xmm6,%xmm6
+ vaesenclast %xmm12,%xmm7,%xmm7
+ vaesenclast %xmm12,%xmm8,%xmm8
+
+
+
+ vpxor 0(%rdi),%xmm5,%xmm5
+ vpxor 16(%rdi),%xmm6,%xmm6
+ vpxor 32(%rdi),%xmm7,%xmm7
+ vpxor 48(%rdi),%xmm8,%xmm8
+
+ subq $1,%r8
+
+ vmovdqu %xmm5,0(%rsi)
+ vmovdqu %xmm6,16(%rsi)
+ vmovdqu %xmm7,32(%rsi)
+ vmovdqu %xmm8,48(%rsi)
+
+ jne L$128_enc_msg_x4_loop1
+
+ addq $64,%rsi
+ addq $64,%rdi
+
+L$128_enc_msg_x4_check_remainder:
+ cmpq $0,%r10
+ je L$128_enc_msg_x4_out
+
+L$128_enc_msg_x4_loop2:
+
+
+ vmovdqa %xmm0,%xmm5
+ vpaddd one(%rip),%xmm0,%xmm0
+
+ vpxor (%rcx),%xmm5,%xmm5
+ vaesenc 16(%rcx),%xmm5,%xmm5
+ vaesenc 32(%rcx),%xmm5,%xmm5
+ vaesenc 48(%rcx),%xmm5,%xmm5
+ vaesenc 64(%rcx),%xmm5,%xmm5
+ vaesenc 80(%rcx),%xmm5,%xmm5
+ vaesenc 96(%rcx),%xmm5,%xmm5
+ vaesenc 112(%rcx),%xmm5,%xmm5
+ vaesenc 128(%rcx),%xmm5,%xmm5
+ vaesenc 144(%rcx),%xmm5,%xmm5
+ vaesenclast 160(%rcx),%xmm5,%xmm5
+
+
+ vpxor (%rdi),%xmm5,%xmm5
+ vmovdqu %xmm5,(%rsi)
+
+ addq $16,%rdi
+ addq $16,%rsi
+
+ subq $1,%r10
+ jne L$128_enc_msg_x4_loop2
+
+L$128_enc_msg_x4_out:
+ popq %r13
+
+ popq %r12
+
+ ret
+
+
+.globl _aes128gcmsiv_enc_msg_x8
+.private_extern _aes128gcmsiv_enc_msg_x8
+
+.p2align 4
+_aes128gcmsiv_enc_msg_x8:
+
+_CET_ENDBR
+ testq %r8,%r8
+ jnz L$128_enc_msg_x8_start
+ ret
+
+L$128_enc_msg_x8_start:
+ pushq %r12
+
+ pushq %r13
+
+ pushq %rbp
+
+ movq %rsp,%rbp
+
+
+
+ subq $128,%rsp
+ andq $-64,%rsp
+
+ shrq $4,%r8
+ movq %r8,%r10
+ shlq $61,%r10
+ shrq $61,%r10
+
+
+ vmovdqu (%rdx),%xmm1
+ vpor OR_MASK(%rip),%xmm1,%xmm1
+
+
+ vpaddd seven(%rip),%xmm1,%xmm0
+ vmovdqu %xmm0,(%rsp)
+ vpaddd one(%rip),%xmm1,%xmm9
+ vpaddd two(%rip),%xmm1,%xmm10
+ vpaddd three(%rip),%xmm1,%xmm11
+ vpaddd four(%rip),%xmm1,%xmm12
+ vpaddd five(%rip),%xmm1,%xmm13
+ vpaddd six(%rip),%xmm1,%xmm14
+ vmovdqa %xmm1,%xmm0
+
+ shrq $3,%r8
+ je L$128_enc_msg_x8_check_remainder
+
+ subq $128,%rsi
+ subq $128,%rdi
+
+L$128_enc_msg_x8_loop1:
+ addq $128,%rsi
+ addq $128,%rdi
+
+ vmovdqa %xmm0,%xmm1
+ vmovdqa %xmm9,%xmm2
+ vmovdqa %xmm10,%xmm3
+ vmovdqa %xmm11,%xmm4
+ vmovdqa %xmm12,%xmm5
+ vmovdqa %xmm13,%xmm6
+ vmovdqa %xmm14,%xmm7
+
+ vmovdqu (%rsp),%xmm8
+
+ vpxor (%rcx),%xmm1,%xmm1
+ vpxor (%rcx),%xmm2,%xmm2
+ vpxor (%rcx),%xmm3,%xmm3
+ vpxor (%rcx),%xmm4,%xmm4
+ vpxor (%rcx),%xmm5,%xmm5
+ vpxor (%rcx),%xmm6,%xmm6
+ vpxor (%rcx),%xmm7,%xmm7
+ vpxor (%rcx),%xmm8,%xmm8
+
+ vmovdqu 16(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu (%rsp),%xmm14
+ vpaddd eight(%rip),%xmm14,%xmm14
+ vmovdqu %xmm14,(%rsp)
+ vmovdqu 32(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpsubd one(%rip),%xmm14,%xmm14
+ vmovdqu 48(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm0,%xmm0
+ vmovdqu 64(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm9,%xmm9
+ vmovdqu 80(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm10,%xmm10
+ vmovdqu 96(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm11,%xmm11
+ vmovdqu 112(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm12,%xmm12
+ vmovdqu 128(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm13,%xmm13
+ vmovdqu 144(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu 160(%rcx),%xmm15
+ vaesenclast %xmm15,%xmm1,%xmm1
+ vaesenclast %xmm15,%xmm2,%xmm2
+ vaesenclast %xmm15,%xmm3,%xmm3
+ vaesenclast %xmm15,%xmm4,%xmm4
+ vaesenclast %xmm15,%xmm5,%xmm5
+ vaesenclast %xmm15,%xmm6,%xmm6
+ vaesenclast %xmm15,%xmm7,%xmm7
+ vaesenclast %xmm15,%xmm8,%xmm8
+
+
+
+ vpxor 0(%rdi),%xmm1,%xmm1
+ vpxor 16(%rdi),%xmm2,%xmm2
+ vpxor 32(%rdi),%xmm3,%xmm3
+ vpxor 48(%rdi),%xmm4,%xmm4
+ vpxor 64(%rdi),%xmm5,%xmm5
+ vpxor 80(%rdi),%xmm6,%xmm6
+ vpxor 96(%rdi),%xmm7,%xmm7
+ vpxor 112(%rdi),%xmm8,%xmm8
+
+ decq %r8
+
+ vmovdqu %xmm1,0(%rsi)
+ vmovdqu %xmm2,16(%rsi)
+ vmovdqu %xmm3,32(%rsi)
+ vmovdqu %xmm4,48(%rsi)
+ vmovdqu %xmm5,64(%rsi)
+ vmovdqu %xmm6,80(%rsi)
+ vmovdqu %xmm7,96(%rsi)
+ vmovdqu %xmm8,112(%rsi)
+
+ jne L$128_enc_msg_x8_loop1
+
+ addq $128,%rsi
+ addq $128,%rdi
+
+L$128_enc_msg_x8_check_remainder:
+ cmpq $0,%r10
+ je L$128_enc_msg_x8_out
+
+L$128_enc_msg_x8_loop2:
+
+
+ vmovdqa %xmm0,%xmm1
+ vpaddd one(%rip),%xmm0,%xmm0
+
+ vpxor (%rcx),%xmm1,%xmm1
+ vaesenc 16(%rcx),%xmm1,%xmm1
+ vaesenc 32(%rcx),%xmm1,%xmm1
+ vaesenc 48(%rcx),%xmm1,%xmm1
+ vaesenc 64(%rcx),%xmm1,%xmm1
+ vaesenc 80(%rcx),%xmm1,%xmm1
+ vaesenc 96(%rcx),%xmm1,%xmm1
+ vaesenc 112(%rcx),%xmm1,%xmm1
+ vaesenc 128(%rcx),%xmm1,%xmm1
+ vaesenc 144(%rcx),%xmm1,%xmm1
+ vaesenclast 160(%rcx),%xmm1,%xmm1
+
+
+ vpxor (%rdi),%xmm1,%xmm1
+
+ vmovdqu %xmm1,(%rsi)
+
+ addq $16,%rdi
+ addq $16,%rsi
+
+ decq %r10
+ jne L$128_enc_msg_x8_loop2
+
+L$128_enc_msg_x8_out:
+ movq %rbp,%rsp
+
+ popq %rbp
+
+ popq %r13
+
+ popq %r12
+
+ ret
+
+
+.globl _aes128gcmsiv_dec
+.private_extern _aes128gcmsiv_dec
+
+.p2align 4
+_aes128gcmsiv_dec:
+
+_CET_ENDBR
+ testq $~15,%r9
+ jnz L$128_dec_start
+ ret
+
+L$128_dec_start:
+ vzeroupper
+ vmovdqa (%rdx),%xmm0
+
+
+ vmovdqu 16(%rdx),%xmm15
+ vpor OR_MASK(%rip),%xmm15,%xmm15
+ movq %rdx,%rax
+
+ leaq 32(%rax),%rax
+ leaq 32(%rcx),%rcx
+
+ andq $~15,%r9
+
+
+ cmpq $96,%r9
+ jb L$128_dec_loop2
+
+
+ subq $96,%r9
+ vmovdqa %xmm15,%xmm7
+ vpaddd one(%rip),%xmm7,%xmm8
+ vpaddd two(%rip),%xmm7,%xmm9
+ vpaddd one(%rip),%xmm9,%xmm10
+ vpaddd two(%rip),%xmm9,%xmm11
+ vpaddd one(%rip),%xmm11,%xmm12
+ vpaddd two(%rip),%xmm11,%xmm15
+
+ vpxor (%r8),%xmm7,%xmm7
+ vpxor (%r8),%xmm8,%xmm8
+ vpxor (%r8),%xmm9,%xmm9
+ vpxor (%r8),%xmm10,%xmm10
+ vpxor (%r8),%xmm11,%xmm11
+ vpxor (%r8),%xmm12,%xmm12
+
+ vmovdqu 16(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 32(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 48(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 64(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 80(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 96(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 112(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 128(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 144(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 160(%r8),%xmm4
+ vaesenclast %xmm4,%xmm7,%xmm7
+ vaesenclast %xmm4,%xmm8,%xmm8
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vaesenclast %xmm4,%xmm10,%xmm10
+ vaesenclast %xmm4,%xmm11,%xmm11
+ vaesenclast %xmm4,%xmm12,%xmm12
+
+
+ vpxor 0(%rdi),%xmm7,%xmm7
+ vpxor 16(%rdi),%xmm8,%xmm8
+ vpxor 32(%rdi),%xmm9,%xmm9
+ vpxor 48(%rdi),%xmm10,%xmm10
+ vpxor 64(%rdi),%xmm11,%xmm11
+ vpxor 80(%rdi),%xmm12,%xmm12
+
+ vmovdqu %xmm7,0(%rsi)
+ vmovdqu %xmm8,16(%rsi)
+ vmovdqu %xmm9,32(%rsi)
+ vmovdqu %xmm10,48(%rsi)
+ vmovdqu %xmm11,64(%rsi)
+ vmovdqu %xmm12,80(%rsi)
+
+ addq $96,%rdi
+ addq $96,%rsi
+ jmp L$128_dec_loop1
+
+
+.p2align 6
+L$128_dec_loop1:
+ cmpq $96,%r9
+ jb L$128_dec_finish_96
+ subq $96,%r9
+
+ vmovdqa %xmm12,%xmm6
+ vmovdqa %xmm11,16-32(%rax)
+ vmovdqa %xmm10,32-32(%rax)
+ vmovdqa %xmm9,48-32(%rax)
+ vmovdqa %xmm8,64-32(%rax)
+ vmovdqa %xmm7,80-32(%rax)
+
+ vmovdqa %xmm15,%xmm7
+ vpaddd one(%rip),%xmm7,%xmm8
+ vpaddd two(%rip),%xmm7,%xmm9
+ vpaddd one(%rip),%xmm9,%xmm10
+ vpaddd two(%rip),%xmm9,%xmm11
+ vpaddd one(%rip),%xmm11,%xmm12
+ vpaddd two(%rip),%xmm11,%xmm15
+
+ vmovdqa (%r8),%xmm4
+ vpxor %xmm4,%xmm7,%xmm7
+ vpxor %xmm4,%xmm8,%xmm8
+ vpxor %xmm4,%xmm9,%xmm9
+ vpxor %xmm4,%xmm10,%xmm10
+ vpxor %xmm4,%xmm11,%xmm11
+ vpxor %xmm4,%xmm12,%xmm12
+
+ vmovdqu 0-32(%rcx),%xmm4
+ vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3
+ vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1
+ vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 16(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu -16(%rax),%xmm6
+ vmovdqu -16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 32(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 0(%rax),%xmm6
+ vmovdqu 0(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 48(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 16(%rax),%xmm6
+ vmovdqu 16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 64(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 32(%rax),%xmm6
+ vmovdqu 32(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 80(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 96(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 112(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+
+ vmovdqa 80-32(%rax),%xmm6
+ vpxor %xmm0,%xmm6,%xmm6
+ vmovdqu 80-32(%rcx),%xmm5
+
+ vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 128(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+
+ vpsrldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm2,%xmm5
+ vpslldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm0
+
+ vmovdqa poly(%rip),%xmm3
+
+ vmovdqu 144(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 160(%r8),%xmm6
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vpxor 0(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm7,%xmm7
+ vpxor 16(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm8,%xmm8
+ vpxor 32(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vpxor 48(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm10,%xmm10
+ vpxor 64(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm11,%xmm11
+ vpxor 80(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm12,%xmm12
+
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vmovdqu %xmm7,0(%rsi)
+ vmovdqu %xmm8,16(%rsi)
+ vmovdqu %xmm9,32(%rsi)
+ vmovdqu %xmm10,48(%rsi)
+ vmovdqu %xmm11,64(%rsi)
+ vmovdqu %xmm12,80(%rsi)
+
+ vpxor %xmm5,%xmm0,%xmm0
+
+ leaq 96(%rdi),%rdi
+ leaq 96(%rsi),%rsi
+ jmp L$128_dec_loop1
+
+L$128_dec_finish_96:
+ vmovdqa %xmm12,%xmm6
+ vmovdqa %xmm11,16-32(%rax)
+ vmovdqa %xmm10,32-32(%rax)
+ vmovdqa %xmm9,48-32(%rax)
+ vmovdqa %xmm8,64-32(%rax)
+ vmovdqa %xmm7,80-32(%rax)
+
+ vmovdqu 0-32(%rcx),%xmm4
+ vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1
+ vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3
+ vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu -16(%rax),%xmm6
+ vmovdqu -16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 0(%rax),%xmm6
+ vmovdqu 0(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 16(%rax),%xmm6
+ vmovdqu 16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 32(%rax),%xmm6
+ vmovdqu 32(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 80-32(%rax),%xmm6
+ vpxor %xmm0,%xmm6,%xmm6
+ vmovdqu 80-32(%rcx),%xmm5
+ vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm2,%xmm5
+ vpslldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm0
+
+ vmovdqa poly(%rip),%xmm3
+
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vpxor %xmm5,%xmm0,%xmm0
+
+L$128_dec_loop2:
+
+
+
+ cmpq $16,%r9
+ jb L$128_dec_out
+ subq $16,%r9
+
+ vmovdqa %xmm15,%xmm2
+ vpaddd one(%rip),%xmm15,%xmm15
+
+ vpxor 0(%r8),%xmm2,%xmm2
+ vaesenc 16(%r8),%xmm2,%xmm2
+ vaesenc 32(%r8),%xmm2,%xmm2
+ vaesenc 48(%r8),%xmm2,%xmm2
+ vaesenc 64(%r8),%xmm2,%xmm2
+ vaesenc 80(%r8),%xmm2,%xmm2
+ vaesenc 96(%r8),%xmm2,%xmm2
+ vaesenc 112(%r8),%xmm2,%xmm2
+ vaesenc 128(%r8),%xmm2,%xmm2
+ vaesenc 144(%r8),%xmm2,%xmm2
+ vaesenclast 160(%r8),%xmm2,%xmm2
+ vpxor (%rdi),%xmm2,%xmm2
+ vmovdqu %xmm2,(%rsi)
+ addq $16,%rdi
+ addq $16,%rsi
+
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa -32(%rcx),%xmm1
+ call GFMUL
+
+ jmp L$128_dec_loop2
+
+L$128_dec_out:
+ vmovdqu %xmm0,(%rdx)
+ ret
+
+
+.globl _aes128gcmsiv_ecb_enc_block
+.private_extern _aes128gcmsiv_ecb_enc_block
+
+.p2align 4
+_aes128gcmsiv_ecb_enc_block:
+
+_CET_ENDBR
+ vmovdqa (%rdi),%xmm1
+
+ vpxor (%rdx),%xmm1,%xmm1
+ vaesenc 16(%rdx),%xmm1,%xmm1
+ vaesenc 32(%rdx),%xmm1,%xmm1
+ vaesenc 48(%rdx),%xmm1,%xmm1
+ vaesenc 64(%rdx),%xmm1,%xmm1
+ vaesenc 80(%rdx),%xmm1,%xmm1
+ vaesenc 96(%rdx),%xmm1,%xmm1
+ vaesenc 112(%rdx),%xmm1,%xmm1
+ vaesenc 128(%rdx),%xmm1,%xmm1
+ vaesenc 144(%rdx),%xmm1,%xmm1
+ vaesenclast 160(%rdx),%xmm1,%xmm1
+
+ vmovdqa %xmm1,(%rsi)
+
+ ret
+
+
+.globl _aes256gcmsiv_aes_ks_enc_x1
+.private_extern _aes256gcmsiv_aes_ks_enc_x1
+
+.p2align 4
+_aes256gcmsiv_aes_ks_enc_x1:
+
+_CET_ENDBR
+ vmovdqa con1(%rip),%xmm0
+ vmovdqa mask(%rip),%xmm15
+ vmovdqa (%rdi),%xmm8
+ vmovdqa (%rcx),%xmm1
+ vmovdqa 16(%rcx),%xmm3
+ vpxor %xmm1,%xmm8,%xmm8
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm1,(%rdx)
+ vmovdqu %xmm3,16(%rdx)
+ vpxor %xmm14,%xmm14,%xmm14
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenc %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,32(%rdx)
+
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpslldq $4,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm3,48(%rdx)
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenc %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,64(%rdx)
+
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpslldq $4,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm3,80(%rdx)
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenc %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,96(%rdx)
+
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpslldq $4,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm3,112(%rdx)
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenc %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,128(%rdx)
+
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpslldq $4,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm3,144(%rdx)
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenc %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,160(%rdx)
+
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpslldq $4,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm3,176(%rdx)
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenc %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,192(%rdx)
+
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpslldq $4,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm3,208(%rdx)
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenclast %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,224(%rdx)
+
+ vmovdqa %xmm8,(%rsi)
+ ret
+
+
+.globl _aes256gcmsiv_ecb_enc_block
+.private_extern _aes256gcmsiv_ecb_enc_block
+
+.p2align 4
+_aes256gcmsiv_ecb_enc_block:
+
+_CET_ENDBR
+ vmovdqa (%rdi),%xmm1
+ vpxor (%rdx),%xmm1,%xmm1
+ vaesenc 16(%rdx),%xmm1,%xmm1
+ vaesenc 32(%rdx),%xmm1,%xmm1
+ vaesenc 48(%rdx),%xmm1,%xmm1
+ vaesenc 64(%rdx),%xmm1,%xmm1
+ vaesenc 80(%rdx),%xmm1,%xmm1
+ vaesenc 96(%rdx),%xmm1,%xmm1
+ vaesenc 112(%rdx),%xmm1,%xmm1
+ vaesenc 128(%rdx),%xmm1,%xmm1
+ vaesenc 144(%rdx),%xmm1,%xmm1
+ vaesenc 160(%rdx),%xmm1,%xmm1
+ vaesenc 176(%rdx),%xmm1,%xmm1
+ vaesenc 192(%rdx),%xmm1,%xmm1
+ vaesenc 208(%rdx),%xmm1,%xmm1
+ vaesenclast 224(%rdx),%xmm1,%xmm1
+ vmovdqa %xmm1,(%rsi)
+ ret
+
+
+.globl _aes256gcmsiv_enc_msg_x4
+.private_extern _aes256gcmsiv_enc_msg_x4
+
+.p2align 4
+_aes256gcmsiv_enc_msg_x4:
+
+_CET_ENDBR
+ testq %r8,%r8
+ jnz L$256_enc_msg_x4_start
+ ret
+
+L$256_enc_msg_x4_start:
+ movq %r8,%r10
+ shrq $4,%r8
+ shlq $60,%r10
+ jz L$256_enc_msg_x4_start2
+ addq $1,%r8
+
+L$256_enc_msg_x4_start2:
+ movq %r8,%r10
+ shlq $62,%r10
+ shrq $62,%r10
+
+
+ vmovdqa (%rdx),%xmm15
+ vpor OR_MASK(%rip),%xmm15,%xmm15
+
+ vmovdqa four(%rip),%xmm4
+ vmovdqa %xmm15,%xmm0
+ vpaddd one(%rip),%xmm15,%xmm1
+ vpaddd two(%rip),%xmm15,%xmm2
+ vpaddd three(%rip),%xmm15,%xmm3
+
+ shrq $2,%r8
+ je L$256_enc_msg_x4_check_remainder
+
+ subq $64,%rsi
+ subq $64,%rdi
+
+L$256_enc_msg_x4_loop1:
+ addq $64,%rsi
+ addq $64,%rdi
+
+ vmovdqa %xmm0,%xmm5
+ vmovdqa %xmm1,%xmm6
+ vmovdqa %xmm2,%xmm7
+ vmovdqa %xmm3,%xmm8
+
+ vpxor (%rcx),%xmm5,%xmm5
+ vpxor (%rcx),%xmm6,%xmm6
+ vpxor (%rcx),%xmm7,%xmm7
+ vpxor (%rcx),%xmm8,%xmm8
+
+ vmovdqu 16(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm0,%xmm0
+ vmovdqu 32(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm1,%xmm1
+ vmovdqu 48(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm2,%xmm2
+ vmovdqu 64(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm3,%xmm3
+
+ vmovdqu 80(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 96(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 112(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 128(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 144(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 160(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 176(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 192(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 208(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 224(%rcx),%xmm12
+ vaesenclast %xmm12,%xmm5,%xmm5
+ vaesenclast %xmm12,%xmm6,%xmm6
+ vaesenclast %xmm12,%xmm7,%xmm7
+ vaesenclast %xmm12,%xmm8,%xmm8
+
+
+
+ vpxor 0(%rdi),%xmm5,%xmm5
+ vpxor 16(%rdi),%xmm6,%xmm6
+ vpxor 32(%rdi),%xmm7,%xmm7
+ vpxor 48(%rdi),%xmm8,%xmm8
+
+ subq $1,%r8
+
+ vmovdqu %xmm5,0(%rsi)
+ vmovdqu %xmm6,16(%rsi)
+ vmovdqu %xmm7,32(%rsi)
+ vmovdqu %xmm8,48(%rsi)
+
+ jne L$256_enc_msg_x4_loop1
+
+ addq $64,%rsi
+ addq $64,%rdi
+
+L$256_enc_msg_x4_check_remainder:
+ cmpq $0,%r10
+ je L$256_enc_msg_x4_out
+
+L$256_enc_msg_x4_loop2:
+
+
+
+ vmovdqa %xmm0,%xmm5
+ vpaddd one(%rip),%xmm0,%xmm0
+ vpxor (%rcx),%xmm5,%xmm5
+ vaesenc 16(%rcx),%xmm5,%xmm5
+ vaesenc 32(%rcx),%xmm5,%xmm5
+ vaesenc 48(%rcx),%xmm5,%xmm5
+ vaesenc 64(%rcx),%xmm5,%xmm5
+ vaesenc 80(%rcx),%xmm5,%xmm5
+ vaesenc 96(%rcx),%xmm5,%xmm5
+ vaesenc 112(%rcx),%xmm5,%xmm5
+ vaesenc 128(%rcx),%xmm5,%xmm5
+ vaesenc 144(%rcx),%xmm5,%xmm5
+ vaesenc 160(%rcx),%xmm5,%xmm5
+ vaesenc 176(%rcx),%xmm5,%xmm5
+ vaesenc 192(%rcx),%xmm5,%xmm5
+ vaesenc 208(%rcx),%xmm5,%xmm5
+ vaesenclast 224(%rcx),%xmm5,%xmm5
+
+
+ vpxor (%rdi),%xmm5,%xmm5
+
+ vmovdqu %xmm5,(%rsi)
+
+ addq $16,%rdi
+ addq $16,%rsi
+
+ subq $1,%r10
+ jne L$256_enc_msg_x4_loop2
+
+L$256_enc_msg_x4_out:
+ ret
+
+
+.globl _aes256gcmsiv_enc_msg_x8
+.private_extern _aes256gcmsiv_enc_msg_x8
+
+.p2align 4
+_aes256gcmsiv_enc_msg_x8:
+
+_CET_ENDBR
+ testq %r8,%r8
+ jnz L$256_enc_msg_x8_start
+ ret
+
+L$256_enc_msg_x8_start:
+
+ movq %rsp,%r11
+ subq $16,%r11
+ andq $-64,%r11
+
+ movq %r8,%r10
+ shrq $4,%r8
+ shlq $60,%r10
+ jz L$256_enc_msg_x8_start2
+ addq $1,%r8
+
+L$256_enc_msg_x8_start2:
+ movq %r8,%r10
+ shlq $61,%r10
+ shrq $61,%r10
+
+
+ vmovdqa (%rdx),%xmm1
+ vpor OR_MASK(%rip),%xmm1,%xmm1
+
+
+ vpaddd seven(%rip),%xmm1,%xmm0
+ vmovdqa %xmm0,(%r11)
+ vpaddd one(%rip),%xmm1,%xmm9
+ vpaddd two(%rip),%xmm1,%xmm10
+ vpaddd three(%rip),%xmm1,%xmm11
+ vpaddd four(%rip),%xmm1,%xmm12
+ vpaddd five(%rip),%xmm1,%xmm13
+ vpaddd six(%rip),%xmm1,%xmm14
+ vmovdqa %xmm1,%xmm0
+
+ shrq $3,%r8
+ jz L$256_enc_msg_x8_check_remainder
+
+ subq $128,%rsi
+ subq $128,%rdi
+
+L$256_enc_msg_x8_loop1:
+ addq $128,%rsi
+ addq $128,%rdi
+
+ vmovdqa %xmm0,%xmm1
+ vmovdqa %xmm9,%xmm2
+ vmovdqa %xmm10,%xmm3
+ vmovdqa %xmm11,%xmm4
+ vmovdqa %xmm12,%xmm5
+ vmovdqa %xmm13,%xmm6
+ vmovdqa %xmm14,%xmm7
+
+ vmovdqa (%r11),%xmm8
+
+ vpxor (%rcx),%xmm1,%xmm1
+ vpxor (%rcx),%xmm2,%xmm2
+ vpxor (%rcx),%xmm3,%xmm3
+ vpxor (%rcx),%xmm4,%xmm4
+ vpxor (%rcx),%xmm5,%xmm5
+ vpxor (%rcx),%xmm6,%xmm6
+ vpxor (%rcx),%xmm7,%xmm7
+ vpxor (%rcx),%xmm8,%xmm8
+
+ vmovdqu 16(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqa (%r11),%xmm14
+ vpaddd eight(%rip),%xmm14,%xmm14
+ vmovdqa %xmm14,(%r11)
+ vmovdqu 32(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpsubd one(%rip),%xmm14,%xmm14
+ vmovdqu 48(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm0,%xmm0
+ vmovdqu 64(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm9,%xmm9
+ vmovdqu 80(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm10,%xmm10
+ vmovdqu 96(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm11,%xmm11
+ vmovdqu 112(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm12,%xmm12
+ vmovdqu 128(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm13,%xmm13
+ vmovdqu 144(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu 160(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu 176(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu 192(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu 208(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu 224(%rcx),%xmm15
+ vaesenclast %xmm15,%xmm1,%xmm1
+ vaesenclast %xmm15,%xmm2,%xmm2
+ vaesenclast %xmm15,%xmm3,%xmm3
+ vaesenclast %xmm15,%xmm4,%xmm4
+ vaesenclast %xmm15,%xmm5,%xmm5
+ vaesenclast %xmm15,%xmm6,%xmm6
+ vaesenclast %xmm15,%xmm7,%xmm7
+ vaesenclast %xmm15,%xmm8,%xmm8
+
+
+
+ vpxor 0(%rdi),%xmm1,%xmm1
+ vpxor 16(%rdi),%xmm2,%xmm2
+ vpxor 32(%rdi),%xmm3,%xmm3
+ vpxor 48(%rdi),%xmm4,%xmm4
+ vpxor 64(%rdi),%xmm5,%xmm5
+ vpxor 80(%rdi),%xmm6,%xmm6
+ vpxor 96(%rdi),%xmm7,%xmm7
+ vpxor 112(%rdi),%xmm8,%xmm8
+
+ subq $1,%r8
+
+ vmovdqu %xmm1,0(%rsi)
+ vmovdqu %xmm2,16(%rsi)
+ vmovdqu %xmm3,32(%rsi)
+ vmovdqu %xmm4,48(%rsi)
+ vmovdqu %xmm5,64(%rsi)
+ vmovdqu %xmm6,80(%rsi)
+ vmovdqu %xmm7,96(%rsi)
+ vmovdqu %xmm8,112(%rsi)
+
+ jne L$256_enc_msg_x8_loop1
+
+ addq $128,%rsi
+ addq $128,%rdi
+
+L$256_enc_msg_x8_check_remainder:
+ cmpq $0,%r10
+ je L$256_enc_msg_x8_out
+
+L$256_enc_msg_x8_loop2:
+
+
+ vmovdqa %xmm0,%xmm1
+ vpaddd one(%rip),%xmm0,%xmm0
+
+ vpxor (%rcx),%xmm1,%xmm1
+ vaesenc 16(%rcx),%xmm1,%xmm1
+ vaesenc 32(%rcx),%xmm1,%xmm1
+ vaesenc 48(%rcx),%xmm1,%xmm1
+ vaesenc 64(%rcx),%xmm1,%xmm1
+ vaesenc 80(%rcx),%xmm1,%xmm1
+ vaesenc 96(%rcx),%xmm1,%xmm1
+ vaesenc 112(%rcx),%xmm1,%xmm1
+ vaesenc 128(%rcx),%xmm1,%xmm1
+ vaesenc 144(%rcx),%xmm1,%xmm1
+ vaesenc 160(%rcx),%xmm1,%xmm1
+ vaesenc 176(%rcx),%xmm1,%xmm1
+ vaesenc 192(%rcx),%xmm1,%xmm1
+ vaesenc 208(%rcx),%xmm1,%xmm1
+ vaesenclast 224(%rcx),%xmm1,%xmm1
+
+
+ vpxor (%rdi),%xmm1,%xmm1
+
+ vmovdqu %xmm1,(%rsi)
+
+ addq $16,%rdi
+ addq $16,%rsi
+ subq $1,%r10
+ jnz L$256_enc_msg_x8_loop2
+
+L$256_enc_msg_x8_out:
+ ret
+
+
+
+.globl _aes256gcmsiv_dec
+.private_extern _aes256gcmsiv_dec
+
+.p2align 4
+_aes256gcmsiv_dec:
+
+_CET_ENDBR
+ testq $~15,%r9
+ jnz L$256_dec_start
+ ret
+
+L$256_dec_start:
+ vzeroupper
+ vmovdqa (%rdx),%xmm0
+
+
+ vmovdqu 16(%rdx),%xmm15
+ vpor OR_MASK(%rip),%xmm15,%xmm15
+ movq %rdx,%rax
+
+ leaq 32(%rax),%rax
+ leaq 32(%rcx),%rcx
+
+ andq $~15,%r9
+
+
+ cmpq $96,%r9
+ jb L$256_dec_loop2
+
+
+ subq $96,%r9
+ vmovdqa %xmm15,%xmm7
+ vpaddd one(%rip),%xmm7,%xmm8
+ vpaddd two(%rip),%xmm7,%xmm9
+ vpaddd one(%rip),%xmm9,%xmm10
+ vpaddd two(%rip),%xmm9,%xmm11
+ vpaddd one(%rip),%xmm11,%xmm12
+ vpaddd two(%rip),%xmm11,%xmm15
+
+ vpxor (%r8),%xmm7,%xmm7
+ vpxor (%r8),%xmm8,%xmm8
+ vpxor (%r8),%xmm9,%xmm9
+ vpxor (%r8),%xmm10,%xmm10
+ vpxor (%r8),%xmm11,%xmm11
+ vpxor (%r8),%xmm12,%xmm12
+
+ vmovdqu 16(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 32(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 48(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 64(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 80(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 96(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 112(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 128(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 144(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 160(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 176(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 192(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 208(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 224(%r8),%xmm4
+ vaesenclast %xmm4,%xmm7,%xmm7
+ vaesenclast %xmm4,%xmm8,%xmm8
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vaesenclast %xmm4,%xmm10,%xmm10
+ vaesenclast %xmm4,%xmm11,%xmm11
+ vaesenclast %xmm4,%xmm12,%xmm12
+
+
+ vpxor 0(%rdi),%xmm7,%xmm7
+ vpxor 16(%rdi),%xmm8,%xmm8
+ vpxor 32(%rdi),%xmm9,%xmm9
+ vpxor 48(%rdi),%xmm10,%xmm10
+ vpxor 64(%rdi),%xmm11,%xmm11
+ vpxor 80(%rdi),%xmm12,%xmm12
+
+ vmovdqu %xmm7,0(%rsi)
+ vmovdqu %xmm8,16(%rsi)
+ vmovdqu %xmm9,32(%rsi)
+ vmovdqu %xmm10,48(%rsi)
+ vmovdqu %xmm11,64(%rsi)
+ vmovdqu %xmm12,80(%rsi)
+
+ addq $96,%rdi
+ addq $96,%rsi
+ jmp L$256_dec_loop1
+
+
+.p2align 6
+L$256_dec_loop1:
+ cmpq $96,%r9
+ jb L$256_dec_finish_96
+ subq $96,%r9
+
+ vmovdqa %xmm12,%xmm6
+ vmovdqa %xmm11,16-32(%rax)
+ vmovdqa %xmm10,32-32(%rax)
+ vmovdqa %xmm9,48-32(%rax)
+ vmovdqa %xmm8,64-32(%rax)
+ vmovdqa %xmm7,80-32(%rax)
+
+ vmovdqa %xmm15,%xmm7
+ vpaddd one(%rip),%xmm7,%xmm8
+ vpaddd two(%rip),%xmm7,%xmm9
+ vpaddd one(%rip),%xmm9,%xmm10
+ vpaddd two(%rip),%xmm9,%xmm11
+ vpaddd one(%rip),%xmm11,%xmm12
+ vpaddd two(%rip),%xmm11,%xmm15
+
+ vmovdqa (%r8),%xmm4
+ vpxor %xmm4,%xmm7,%xmm7
+ vpxor %xmm4,%xmm8,%xmm8
+ vpxor %xmm4,%xmm9,%xmm9
+ vpxor %xmm4,%xmm10,%xmm10
+ vpxor %xmm4,%xmm11,%xmm11
+ vpxor %xmm4,%xmm12,%xmm12
+
+ vmovdqu 0-32(%rcx),%xmm4
+ vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3
+ vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1
+ vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 16(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu -16(%rax),%xmm6
+ vmovdqu -16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 32(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 0(%rax),%xmm6
+ vmovdqu 0(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 48(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 16(%rax),%xmm6
+ vmovdqu 16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 64(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 32(%rax),%xmm6
+ vmovdqu 32(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 80(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 96(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 112(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+
+ vmovdqa 80-32(%rax),%xmm6
+ vpxor %xmm0,%xmm6,%xmm6
+ vmovdqu 80-32(%rcx),%xmm5
+
+ vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 128(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+
+ vpsrldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm2,%xmm5
+ vpslldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm0
+
+ vmovdqa poly(%rip),%xmm3
+
+ vmovdqu 144(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 160(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 176(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 192(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 208(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 224(%r8),%xmm6
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vpxor 0(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm7,%xmm7
+ vpxor 16(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm8,%xmm8
+ vpxor 32(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vpxor 48(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm10,%xmm10
+ vpxor 64(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm11,%xmm11
+ vpxor 80(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm12,%xmm12
+
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vmovdqu %xmm7,0(%rsi)
+ vmovdqu %xmm8,16(%rsi)
+ vmovdqu %xmm9,32(%rsi)
+ vmovdqu %xmm10,48(%rsi)
+ vmovdqu %xmm11,64(%rsi)
+ vmovdqu %xmm12,80(%rsi)
+
+ vpxor %xmm5,%xmm0,%xmm0
+
+ leaq 96(%rdi),%rdi
+ leaq 96(%rsi),%rsi
+ jmp L$256_dec_loop1
+
+L$256_dec_finish_96:
+ vmovdqa %xmm12,%xmm6
+ vmovdqa %xmm11,16-32(%rax)
+ vmovdqa %xmm10,32-32(%rax)
+ vmovdqa %xmm9,48-32(%rax)
+ vmovdqa %xmm8,64-32(%rax)
+ vmovdqa %xmm7,80-32(%rax)
+
+ vmovdqu 0-32(%rcx),%xmm4
+ vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1
+ vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3
+ vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu -16(%rax),%xmm6
+ vmovdqu -16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 0(%rax),%xmm6
+ vmovdqu 0(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 16(%rax),%xmm6
+ vmovdqu 16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 32(%rax),%xmm6
+ vmovdqu 32(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 80-32(%rax),%xmm6
+ vpxor %xmm0,%xmm6,%xmm6
+ vmovdqu 80-32(%rcx),%xmm5
+ vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm2,%xmm5
+ vpslldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm0
+
+ vmovdqa poly(%rip),%xmm3
+
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vpxor %xmm5,%xmm0,%xmm0
+
+L$256_dec_loop2:
+
+
+
+ cmpq $16,%r9
+ jb L$256_dec_out
+ subq $16,%r9
+
+ vmovdqa %xmm15,%xmm2
+ vpaddd one(%rip),%xmm15,%xmm15
+
+ vpxor 0(%r8),%xmm2,%xmm2
+ vaesenc 16(%r8),%xmm2,%xmm2
+ vaesenc 32(%r8),%xmm2,%xmm2
+ vaesenc 48(%r8),%xmm2,%xmm2
+ vaesenc 64(%r8),%xmm2,%xmm2
+ vaesenc 80(%r8),%xmm2,%xmm2
+ vaesenc 96(%r8),%xmm2,%xmm2
+ vaesenc 112(%r8),%xmm2,%xmm2
+ vaesenc 128(%r8),%xmm2,%xmm2
+ vaesenc 144(%r8),%xmm2,%xmm2
+ vaesenc 160(%r8),%xmm2,%xmm2
+ vaesenc 176(%r8),%xmm2,%xmm2
+ vaesenc 192(%r8),%xmm2,%xmm2
+ vaesenc 208(%r8),%xmm2,%xmm2
+ vaesenclast 224(%r8),%xmm2,%xmm2
+ vpxor (%rdi),%xmm2,%xmm2
+ vmovdqu %xmm2,(%rsi)
+ addq $16,%rdi
+ addq $16,%rsi
+
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa -32(%rcx),%xmm1
+ call GFMUL
+
+ jmp L$256_dec_loop2
+
+L$256_dec_out:
+ vmovdqu %xmm0,(%rdx)
+ ret
+
+
+.globl _aes256gcmsiv_kdf
+.private_extern _aes256gcmsiv_kdf
+
+.p2align 4
+_aes256gcmsiv_kdf:
+
+_CET_ENDBR
+
+
+
+
+ vmovdqa (%rdx),%xmm1
+ vmovdqa 0(%rdi),%xmm4
+ vmovdqa and_mask(%rip),%xmm11
+ vmovdqa one(%rip),%xmm8
+ vpshufd $0x90,%xmm4,%xmm4
+ vpand %xmm11,%xmm4,%xmm4
+ vpaddd %xmm8,%xmm4,%xmm6
+ vpaddd %xmm8,%xmm6,%xmm7
+ vpaddd %xmm8,%xmm7,%xmm11
+ vpaddd %xmm8,%xmm11,%xmm12
+ vpaddd %xmm8,%xmm12,%xmm13
+
+ vpxor %xmm1,%xmm4,%xmm4
+ vpxor %xmm1,%xmm6,%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm1,%xmm11,%xmm11
+ vpxor %xmm1,%xmm12,%xmm12
+ vpxor %xmm1,%xmm13,%xmm13
+
+ vmovdqa 16(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 32(%rdx),%xmm2
+ vaesenc %xmm2,%xmm4,%xmm4
+ vaesenc %xmm2,%xmm6,%xmm6
+ vaesenc %xmm2,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+ vaesenc %xmm2,%xmm13,%xmm13
+
+ vmovdqa 48(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 64(%rdx),%xmm2
+ vaesenc %xmm2,%xmm4,%xmm4
+ vaesenc %xmm2,%xmm6,%xmm6
+ vaesenc %xmm2,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+ vaesenc %xmm2,%xmm13,%xmm13
+
+ vmovdqa 80(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 96(%rdx),%xmm2
+ vaesenc %xmm2,%xmm4,%xmm4
+ vaesenc %xmm2,%xmm6,%xmm6
+ vaesenc %xmm2,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+ vaesenc %xmm2,%xmm13,%xmm13
+
+ vmovdqa 112(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 128(%rdx),%xmm2
+ vaesenc %xmm2,%xmm4,%xmm4
+ vaesenc %xmm2,%xmm6,%xmm6
+ vaesenc %xmm2,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+ vaesenc %xmm2,%xmm13,%xmm13
+
+ vmovdqa 144(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 160(%rdx),%xmm2
+ vaesenc %xmm2,%xmm4,%xmm4
+ vaesenc %xmm2,%xmm6,%xmm6
+ vaesenc %xmm2,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+ vaesenc %xmm2,%xmm13,%xmm13
+
+ vmovdqa 176(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 192(%rdx),%xmm2
+ vaesenc %xmm2,%xmm4,%xmm4
+ vaesenc %xmm2,%xmm6,%xmm6
+ vaesenc %xmm2,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+ vaesenc %xmm2,%xmm13,%xmm13
+
+ vmovdqa 208(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 224(%rdx),%xmm2
+ vaesenclast %xmm2,%xmm4,%xmm4
+ vaesenclast %xmm2,%xmm6,%xmm6
+ vaesenclast %xmm2,%xmm7,%xmm7
+ vaesenclast %xmm2,%xmm11,%xmm11
+ vaesenclast %xmm2,%xmm12,%xmm12
+ vaesenclast %xmm2,%xmm13,%xmm13
+
+
+ vmovdqa %xmm4,0(%rsi)
+ vmovdqa %xmm6,16(%rsi)
+ vmovdqa %xmm7,32(%rsi)
+ vmovdqa %xmm11,48(%rsi)
+ vmovdqa %xmm12,64(%rsi)
+ vmovdqa %xmm13,80(%rsi)
+ ret
+
+
+#endif
diff --git a/gen/crypto/aes128gcmsiv-x86_64-linux.S b/gen/crypto/aes128gcmsiv-x86_64-linux.S
new file mode 100644
index 0000000..a8de4a9
--- /dev/null
+++ b/gen/crypto/aes128gcmsiv-x86_64-linux.S
@@ -0,0 +1,3091 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.section .rodata
+
+.align 16
+one:
+.quad 1,0
+two:
+.quad 2,0
+three:
+.quad 3,0
+four:
+.quad 4,0
+five:
+.quad 5,0
+six:
+.quad 6,0
+seven:
+.quad 7,0
+eight:
+.quad 8,0
+
+OR_MASK:
+.long 0x00000000,0x00000000,0x00000000,0x80000000
+poly:
+.quad 0x1, 0xc200000000000000
+mask:
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+con1:
+.long 1,1,1,1
+con2:
+.long 0x1b,0x1b,0x1b,0x1b
+con3:
+.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
+and_mask:
+.long 0,0xffffffff, 0xffffffff, 0xffffffff
+.text
+.type GFMUL,@function
+.align 16
+GFMUL:
+.cfi_startproc
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm5
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm3,%xmm5,%xmm5
+
+ vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3
+ vpshufd $78,%xmm2,%xmm4
+ vpxor %xmm4,%xmm3,%xmm2
+
+ vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3
+ vpshufd $78,%xmm2,%xmm4
+ vpxor %xmm4,%xmm3,%xmm2
+
+ vpxor %xmm5,%xmm2,%xmm0
+ ret
+.cfi_endproc
+.size GFMUL, .-GFMUL
+.globl aesgcmsiv_htable_init
+.hidden aesgcmsiv_htable_init
+.type aesgcmsiv_htable_init,@function
+.align 16
+aesgcmsiv_htable_init:
+.cfi_startproc
+_CET_ENDBR
+ vmovdqa (%rsi),%xmm0
+ vmovdqa %xmm0,%xmm1
+ vmovdqa %xmm0,(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,16(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,32(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,48(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,64(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,80(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,96(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,112(%rdi)
+ ret
+.cfi_endproc
+.size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init
+.globl aesgcmsiv_htable6_init
+.hidden aesgcmsiv_htable6_init
+.type aesgcmsiv_htable6_init,@function
+.align 16
+aesgcmsiv_htable6_init:
+.cfi_startproc
+_CET_ENDBR
+ vmovdqa (%rsi),%xmm0
+ vmovdqa %xmm0,%xmm1
+ vmovdqa %xmm0,(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,16(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,32(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,48(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,64(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,80(%rdi)
+ ret
+.cfi_endproc
+.size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init
+.globl aesgcmsiv_htable_polyval
+.hidden aesgcmsiv_htable_polyval
+.type aesgcmsiv_htable_polyval,@function
+.align 16
+aesgcmsiv_htable_polyval:
+.cfi_startproc
+_CET_ENDBR
+ testq %rdx,%rdx
+ jnz .Lhtable_polyval_start
+ ret
+
+.Lhtable_polyval_start:
+ vzeroall
+
+
+
+ movq %rdx,%r11
+ andq $127,%r11
+
+ jz .Lhtable_polyval_no_prefix
+
+ vpxor %xmm9,%xmm9,%xmm9
+ vmovdqa (%rcx),%xmm1
+ subq %r11,%rdx
+
+ subq $16,%r11
+
+
+ vmovdqu (%rsi),%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+
+ vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm5
+ vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm3
+ vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm4
+ vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+ leaq 16(%rsi),%rsi
+ testq %r11,%r11
+ jnz .Lhtable_polyval_prefix_loop
+ jmp .Lhtable_polyval_prefix_complete
+
+
+.align 64
+.Lhtable_polyval_prefix_loop:
+ subq $16,%r11
+
+ vmovdqu (%rsi),%xmm0
+
+ vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+ testq %r11,%r11
+
+ leaq 16(%rsi),%rsi
+
+ jnz .Lhtable_polyval_prefix_loop
+
+.Lhtable_polyval_prefix_complete:
+ vpsrldq $8,%xmm5,%xmm6
+ vpslldq $8,%xmm5,%xmm5
+
+ vpxor %xmm6,%xmm4,%xmm9
+ vpxor %xmm5,%xmm3,%xmm1
+
+ jmp .Lhtable_polyval_main_loop
+
+.Lhtable_polyval_no_prefix:
+
+
+
+
+ vpxor %xmm1,%xmm1,%xmm1
+ vmovdqa (%rcx),%xmm9
+
+.align 64
+.Lhtable_polyval_main_loop:
+ subq $0x80,%rdx
+ jb .Lhtable_polyval_out
+
+ vmovdqu 112(%rsi),%xmm0
+
+ vpclmulqdq $0x01,(%rdi),%xmm0,%xmm5
+ vpclmulqdq $0x00,(%rdi),%xmm0,%xmm3
+ vpclmulqdq $0x11,(%rdi),%xmm0,%xmm4
+ vpclmulqdq $0x10,(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vmovdqu 96(%rsi),%xmm0
+ vpclmulqdq $0x01,16(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,16(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,16(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,16(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+
+ vmovdqu 80(%rsi),%xmm0
+
+ vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7
+ vpalignr $8,%xmm1,%xmm1,%xmm1
+
+ vpclmulqdq $0x01,32(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,32(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,32(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,32(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vpxor %xmm7,%xmm1,%xmm1
+
+ vmovdqu 64(%rsi),%xmm0
+
+ vpclmulqdq $0x01,48(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,48(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,48(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,48(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vmovdqu 48(%rsi),%xmm0
+
+ vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7
+ vpalignr $8,%xmm1,%xmm1,%xmm1
+
+ vpclmulqdq $0x01,64(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,64(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,64(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,64(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vpxor %xmm7,%xmm1,%xmm1
+
+ vmovdqu 32(%rsi),%xmm0
+
+ vpclmulqdq $0x01,80(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,80(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,80(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,80(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vpxor %xmm9,%xmm1,%xmm1
+
+ vmovdqu 16(%rsi),%xmm0
+
+ vpclmulqdq $0x01,96(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,96(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,96(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,96(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vmovdqu 0(%rsi),%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+
+ vpclmulqdq $0x01,112(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,112(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,112(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,112(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vpsrldq $8,%xmm5,%xmm6
+ vpslldq $8,%xmm5,%xmm5
+
+ vpxor %xmm6,%xmm4,%xmm9
+ vpxor %xmm5,%xmm3,%xmm1
+
+ leaq 128(%rsi),%rsi
+ jmp .Lhtable_polyval_main_loop
+
+
+
+.Lhtable_polyval_out:
+ vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6
+ vpalignr $8,%xmm1,%xmm1,%xmm1
+ vpxor %xmm6,%xmm1,%xmm1
+
+ vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6
+ vpalignr $8,%xmm1,%xmm1,%xmm1
+ vpxor %xmm6,%xmm1,%xmm1
+ vpxor %xmm9,%xmm1,%xmm1
+
+ vmovdqu %xmm1,(%rcx)
+ vzeroupper
+ ret
+.cfi_endproc
+.size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval
+.globl aesgcmsiv_polyval_horner
+.hidden aesgcmsiv_polyval_horner
+.type aesgcmsiv_polyval_horner,@function
+.align 16
+aesgcmsiv_polyval_horner:
+.cfi_startproc
+_CET_ENDBR
+ testq %rcx,%rcx
+ jnz .Lpolyval_horner_start
+ ret
+
+.Lpolyval_horner_start:
+
+
+
+ xorq %r10,%r10
+ shlq $4,%rcx
+
+ vmovdqa (%rsi),%xmm1
+ vmovdqa (%rdi),%xmm0
+
+.Lpolyval_horner_loop:
+ vpxor (%rdx,%r10,1),%xmm0,%xmm0
+ call GFMUL
+
+ addq $16,%r10
+ cmpq %r10,%rcx
+ jne .Lpolyval_horner_loop
+
+
+ vmovdqa %xmm0,(%rdi)
+ ret
+.cfi_endproc
+.size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner
+.globl aes128gcmsiv_aes_ks
+.hidden aes128gcmsiv_aes_ks
+.type aes128gcmsiv_aes_ks,@function
+.align 16
+aes128gcmsiv_aes_ks:
+.cfi_startproc
+_CET_ENDBR
+ vmovdqu (%rdi),%xmm1
+ vmovdqa %xmm1,(%rsi)
+
+ vmovdqa con1(%rip),%xmm0
+ vmovdqa mask(%rip),%xmm15
+
+ movq $8,%rax
+
+.Lks128_loop:
+ addq $16,%rsi
+ subq $1,%rax
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpslldq $4,%xmm3,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpslldq $4,%xmm3,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm1,(%rsi)
+ jne .Lks128_loop
+
+ vmovdqa con2(%rip),%xmm0
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpslldq $4,%xmm3,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpslldq $4,%xmm3,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm1,16(%rsi)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslldq $4,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpslldq $4,%xmm3,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpslldq $4,%xmm3,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm1,32(%rsi)
+ ret
+.cfi_endproc
+.size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks
+.globl aes256gcmsiv_aes_ks
+.hidden aes256gcmsiv_aes_ks
+.type aes256gcmsiv_aes_ks,@function
+.align 16
+aes256gcmsiv_aes_ks:
+.cfi_startproc
+_CET_ENDBR
+ vmovdqu (%rdi),%xmm1
+ vmovdqu 16(%rdi),%xmm3
+ vmovdqa %xmm1,(%rsi)
+ vmovdqa %xmm3,16(%rsi)
+ vmovdqa con1(%rip),%xmm0
+ vmovdqa mask(%rip),%xmm15
+ vpxor %xmm14,%xmm14,%xmm14
+ movq $6,%rax
+
+.Lks256_loop:
+ addq $32,%rsi
+ subq $1,%rax
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm1,(%rsi)
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpsllq $32,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpshufb con3(%rip),%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vmovdqa %xmm3,16(%rsi)
+ jne .Lks256_loop
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpsllq $32,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm1,32(%rsi)
+ ret
+.cfi_endproc
+.globl aes128gcmsiv_aes_ks_enc_x1
+.hidden aes128gcmsiv_aes_ks_enc_x1
+.type aes128gcmsiv_aes_ks_enc_x1,@function
+.align 16
+aes128gcmsiv_aes_ks_enc_x1:
+.cfi_startproc
+_CET_ENDBR
+ vmovdqa (%rcx),%xmm1
+ vmovdqa 0(%rdi),%xmm4
+
+ vmovdqa %xmm1,(%rdx)
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vmovdqa con1(%rip),%xmm0
+ vmovdqa mask(%rip),%xmm15
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,16(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,32(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,48(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,64(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,80(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,96(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,112(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,128(%rdx)
+
+
+ vmovdqa con2(%rip),%xmm0
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,144(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenclast %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,160(%rdx)
+
+
+ vmovdqa %xmm4,0(%rsi)
+ ret
+.cfi_endproc
+.size aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1
+.globl aes128gcmsiv_kdf
+.hidden aes128gcmsiv_kdf
+.type aes128gcmsiv_kdf,@function
+.align 16
+aes128gcmsiv_kdf:
+.cfi_startproc
+_CET_ENDBR
+
+
+
+
+ vmovdqa (%rdx),%xmm1
+ vmovdqa 0(%rdi),%xmm9
+ vmovdqa and_mask(%rip),%xmm12
+ vmovdqa one(%rip),%xmm13
+ vpshufd $0x90,%xmm9,%xmm9
+ vpand %xmm12,%xmm9,%xmm9
+ vpaddd %xmm13,%xmm9,%xmm10
+ vpaddd %xmm13,%xmm10,%xmm11
+ vpaddd %xmm13,%xmm11,%xmm12
+
+ vpxor %xmm1,%xmm9,%xmm9
+ vpxor %xmm1,%xmm10,%xmm10
+ vpxor %xmm1,%xmm11,%xmm11
+ vpxor %xmm1,%xmm12,%xmm12
+
+ vmovdqa 16(%rdx),%xmm1
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+
+ vmovdqa 32(%rdx),%xmm2
+ vaesenc %xmm2,%xmm9,%xmm9
+ vaesenc %xmm2,%xmm10,%xmm10
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+
+ vmovdqa 48(%rdx),%xmm1
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+
+ vmovdqa 64(%rdx),%xmm2
+ vaesenc %xmm2,%xmm9,%xmm9
+ vaesenc %xmm2,%xmm10,%xmm10
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+
+ vmovdqa 80(%rdx),%xmm1
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+
+ vmovdqa 96(%rdx),%xmm2
+ vaesenc %xmm2,%xmm9,%xmm9
+ vaesenc %xmm2,%xmm10,%xmm10
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+
+ vmovdqa 112(%rdx),%xmm1
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+
+ vmovdqa 128(%rdx),%xmm2
+ vaesenc %xmm2,%xmm9,%xmm9
+ vaesenc %xmm2,%xmm10,%xmm10
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+
+ vmovdqa 144(%rdx),%xmm1
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+
+ vmovdqa 160(%rdx),%xmm2
+ vaesenclast %xmm2,%xmm9,%xmm9
+ vaesenclast %xmm2,%xmm10,%xmm10
+ vaesenclast %xmm2,%xmm11,%xmm11
+ vaesenclast %xmm2,%xmm12,%xmm12
+
+
+ vmovdqa %xmm9,0(%rsi)
+ vmovdqa %xmm10,16(%rsi)
+ vmovdqa %xmm11,32(%rsi)
+ vmovdqa %xmm12,48(%rsi)
+ ret
+.cfi_endproc
+.size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf
+.globl aes128gcmsiv_enc_msg_x4
+.hidden aes128gcmsiv_enc_msg_x4
+.type aes128gcmsiv_enc_msg_x4,@function
+.align 16
+aes128gcmsiv_enc_msg_x4:
+.cfi_startproc
+_CET_ENDBR
+ testq %r8,%r8
+ jnz .L128_enc_msg_x4_start
+ ret
+
+.L128_enc_msg_x4_start:
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-16
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-24
+
+ shrq $4,%r8
+ movq %r8,%r10
+ shlq $62,%r10
+ shrq $62,%r10
+
+
+ vmovdqa (%rdx),%xmm15
+ vpor OR_MASK(%rip),%xmm15,%xmm15
+
+ vmovdqu four(%rip),%xmm4
+ vmovdqa %xmm15,%xmm0
+ vpaddd one(%rip),%xmm15,%xmm1
+ vpaddd two(%rip),%xmm15,%xmm2
+ vpaddd three(%rip),%xmm15,%xmm3
+
+ shrq $2,%r8
+ je .L128_enc_msg_x4_check_remainder
+
+ subq $64,%rsi
+ subq $64,%rdi
+
+.L128_enc_msg_x4_loop1:
+ addq $64,%rsi
+ addq $64,%rdi
+
+ vmovdqa %xmm0,%xmm5
+ vmovdqa %xmm1,%xmm6
+ vmovdqa %xmm2,%xmm7
+ vmovdqa %xmm3,%xmm8
+
+ vpxor (%rcx),%xmm5,%xmm5
+ vpxor (%rcx),%xmm6,%xmm6
+ vpxor (%rcx),%xmm7,%xmm7
+ vpxor (%rcx),%xmm8,%xmm8
+
+ vmovdqu 16(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm0,%xmm0
+ vmovdqu 32(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm1,%xmm1
+ vmovdqu 48(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm2,%xmm2
+ vmovdqu 64(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm3,%xmm3
+
+ vmovdqu 80(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 96(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 112(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 128(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 144(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 160(%rcx),%xmm12
+ vaesenclast %xmm12,%xmm5,%xmm5
+ vaesenclast %xmm12,%xmm6,%xmm6
+ vaesenclast %xmm12,%xmm7,%xmm7
+ vaesenclast %xmm12,%xmm8,%xmm8
+
+
+
+ vpxor 0(%rdi),%xmm5,%xmm5
+ vpxor 16(%rdi),%xmm6,%xmm6
+ vpxor 32(%rdi),%xmm7,%xmm7
+ vpxor 48(%rdi),%xmm8,%xmm8
+
+ subq $1,%r8
+
+ vmovdqu %xmm5,0(%rsi)
+ vmovdqu %xmm6,16(%rsi)
+ vmovdqu %xmm7,32(%rsi)
+ vmovdqu %xmm8,48(%rsi)
+
+ jne .L128_enc_msg_x4_loop1
+
+ addq $64,%rsi
+ addq $64,%rdi
+
+.L128_enc_msg_x4_check_remainder:
+ cmpq $0,%r10
+ je .L128_enc_msg_x4_out
+
+.L128_enc_msg_x4_loop2:
+
+
+ vmovdqa %xmm0,%xmm5
+ vpaddd one(%rip),%xmm0,%xmm0
+
+ vpxor (%rcx),%xmm5,%xmm5
+ vaesenc 16(%rcx),%xmm5,%xmm5
+ vaesenc 32(%rcx),%xmm5,%xmm5
+ vaesenc 48(%rcx),%xmm5,%xmm5
+ vaesenc 64(%rcx),%xmm5,%xmm5
+ vaesenc 80(%rcx),%xmm5,%xmm5
+ vaesenc 96(%rcx),%xmm5,%xmm5
+ vaesenc 112(%rcx),%xmm5,%xmm5
+ vaesenc 128(%rcx),%xmm5,%xmm5
+ vaesenc 144(%rcx),%xmm5,%xmm5
+ vaesenclast 160(%rcx),%xmm5,%xmm5
+
+
+ vpxor (%rdi),%xmm5,%xmm5
+ vmovdqu %xmm5,(%rsi)
+
+ addq $16,%rdi
+ addq $16,%rsi
+
+ subq $1,%r10
+ jne .L128_enc_msg_x4_loop2
+
+.L128_enc_msg_x4_out:
+ popq %r13
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r13
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ ret
+.cfi_endproc
+.size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4
+.globl aes128gcmsiv_enc_msg_x8
+.hidden aes128gcmsiv_enc_msg_x8
+.type aes128gcmsiv_enc_msg_x8,@function
+.align 16
+aes128gcmsiv_enc_msg_x8:
+.cfi_startproc
+_CET_ENDBR
+ testq %r8,%r8
+ jnz .L128_enc_msg_x8_start
+ ret
+
+.L128_enc_msg_x8_start:
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-16
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-24
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-32
+ movq %rsp,%rbp
+.cfi_def_cfa_register rbp
+
+
+ subq $128,%rsp
+ andq $-64,%rsp
+
+ shrq $4,%r8
+ movq %r8,%r10
+ shlq $61,%r10
+ shrq $61,%r10
+
+
+ vmovdqu (%rdx),%xmm1
+ vpor OR_MASK(%rip),%xmm1,%xmm1
+
+
+ vpaddd seven(%rip),%xmm1,%xmm0
+ vmovdqu %xmm0,(%rsp)
+ vpaddd one(%rip),%xmm1,%xmm9
+ vpaddd two(%rip),%xmm1,%xmm10
+ vpaddd three(%rip),%xmm1,%xmm11
+ vpaddd four(%rip),%xmm1,%xmm12
+ vpaddd five(%rip),%xmm1,%xmm13
+ vpaddd six(%rip),%xmm1,%xmm14
+ vmovdqa %xmm1,%xmm0
+
+ shrq $3,%r8
+ je .L128_enc_msg_x8_check_remainder
+
+ subq $128,%rsi
+ subq $128,%rdi
+
+.L128_enc_msg_x8_loop1:
+ addq $128,%rsi
+ addq $128,%rdi
+
+ vmovdqa %xmm0,%xmm1
+ vmovdqa %xmm9,%xmm2
+ vmovdqa %xmm10,%xmm3
+ vmovdqa %xmm11,%xmm4
+ vmovdqa %xmm12,%xmm5
+ vmovdqa %xmm13,%xmm6
+ vmovdqa %xmm14,%xmm7
+
+ vmovdqu (%rsp),%xmm8
+
+ vpxor (%rcx),%xmm1,%xmm1
+ vpxor (%rcx),%xmm2,%xmm2
+ vpxor (%rcx),%xmm3,%xmm3
+ vpxor (%rcx),%xmm4,%xmm4
+ vpxor (%rcx),%xmm5,%xmm5
+ vpxor (%rcx),%xmm6,%xmm6
+ vpxor (%rcx),%xmm7,%xmm7
+ vpxor (%rcx),%xmm8,%xmm8
+
+ vmovdqu 16(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu (%rsp),%xmm14
+ vpaddd eight(%rip),%xmm14,%xmm14
+ vmovdqu %xmm14,(%rsp)
+ vmovdqu 32(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpsubd one(%rip),%xmm14,%xmm14
+ vmovdqu 48(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm0,%xmm0
+ vmovdqu 64(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm9,%xmm9
+ vmovdqu 80(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm10,%xmm10
+ vmovdqu 96(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm11,%xmm11
+ vmovdqu 112(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm12,%xmm12
+ vmovdqu 128(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm13,%xmm13
+ vmovdqu 144(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu 160(%rcx),%xmm15
+ vaesenclast %xmm15,%xmm1,%xmm1
+ vaesenclast %xmm15,%xmm2,%xmm2
+ vaesenclast %xmm15,%xmm3,%xmm3
+ vaesenclast %xmm15,%xmm4,%xmm4
+ vaesenclast %xmm15,%xmm5,%xmm5
+ vaesenclast %xmm15,%xmm6,%xmm6
+ vaesenclast %xmm15,%xmm7,%xmm7
+ vaesenclast %xmm15,%xmm8,%xmm8
+
+
+
+ vpxor 0(%rdi),%xmm1,%xmm1
+ vpxor 16(%rdi),%xmm2,%xmm2
+ vpxor 32(%rdi),%xmm3,%xmm3
+ vpxor 48(%rdi),%xmm4,%xmm4
+ vpxor 64(%rdi),%xmm5,%xmm5
+ vpxor 80(%rdi),%xmm6,%xmm6
+ vpxor 96(%rdi),%xmm7,%xmm7
+ vpxor 112(%rdi),%xmm8,%xmm8
+
+ decq %r8
+
+ vmovdqu %xmm1,0(%rsi)
+ vmovdqu %xmm2,16(%rsi)
+ vmovdqu %xmm3,32(%rsi)
+ vmovdqu %xmm4,48(%rsi)
+ vmovdqu %xmm5,64(%rsi)
+ vmovdqu %xmm6,80(%rsi)
+ vmovdqu %xmm7,96(%rsi)
+ vmovdqu %xmm8,112(%rsi)
+
+ jne .L128_enc_msg_x8_loop1
+
+ addq $128,%rsi
+ addq $128,%rdi
+
+.L128_enc_msg_x8_check_remainder:
+ cmpq $0,%r10
+ je .L128_enc_msg_x8_out
+
+.L128_enc_msg_x8_loop2:
+
+
+ vmovdqa %xmm0,%xmm1
+ vpaddd one(%rip),%xmm0,%xmm0
+
+ vpxor (%rcx),%xmm1,%xmm1
+ vaesenc 16(%rcx),%xmm1,%xmm1
+ vaesenc 32(%rcx),%xmm1,%xmm1
+ vaesenc 48(%rcx),%xmm1,%xmm1
+ vaesenc 64(%rcx),%xmm1,%xmm1
+ vaesenc 80(%rcx),%xmm1,%xmm1
+ vaesenc 96(%rcx),%xmm1,%xmm1
+ vaesenc 112(%rcx),%xmm1,%xmm1
+ vaesenc 128(%rcx),%xmm1,%xmm1
+ vaesenc 144(%rcx),%xmm1,%xmm1
+ vaesenclast 160(%rcx),%xmm1,%xmm1
+
+
+ vpxor (%rdi),%xmm1,%xmm1
+
+ vmovdqu %xmm1,(%rsi)
+
+ addq $16,%rdi
+ addq $16,%rsi
+
+ decq %r10
+ jne .L128_enc_msg_x8_loop2
+
+.L128_enc_msg_x8_out:
+ movq %rbp,%rsp
+.cfi_def_cfa_register %rsp
+ popq %rbp
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbp
+ popq %r13
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r13
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ ret
+.cfi_endproc
+.size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8
+.globl aes128gcmsiv_dec
+.hidden aes128gcmsiv_dec
+.type aes128gcmsiv_dec,@function
+.align 16
+aes128gcmsiv_dec:
+.cfi_startproc
+_CET_ENDBR
+ testq $~15,%r9
+ jnz .L128_dec_start
+ ret
+
+.L128_dec_start:
+ vzeroupper
+ vmovdqa (%rdx),%xmm0
+
+
+ vmovdqu 16(%rdx),%xmm15
+ vpor OR_MASK(%rip),%xmm15,%xmm15
+ movq %rdx,%rax
+
+ leaq 32(%rax),%rax
+ leaq 32(%rcx),%rcx
+
+ andq $~15,%r9
+
+
+ cmpq $96,%r9
+ jb .L128_dec_loop2
+
+
+ subq $96,%r9
+ vmovdqa %xmm15,%xmm7
+ vpaddd one(%rip),%xmm7,%xmm8
+ vpaddd two(%rip),%xmm7,%xmm9
+ vpaddd one(%rip),%xmm9,%xmm10
+ vpaddd two(%rip),%xmm9,%xmm11
+ vpaddd one(%rip),%xmm11,%xmm12
+ vpaddd two(%rip),%xmm11,%xmm15
+
+ vpxor (%r8),%xmm7,%xmm7
+ vpxor (%r8),%xmm8,%xmm8
+ vpxor (%r8),%xmm9,%xmm9
+ vpxor (%r8),%xmm10,%xmm10
+ vpxor (%r8),%xmm11,%xmm11
+ vpxor (%r8),%xmm12,%xmm12
+
+ vmovdqu 16(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 32(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 48(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 64(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 80(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 96(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 112(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 128(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 144(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 160(%r8),%xmm4
+ vaesenclast %xmm4,%xmm7,%xmm7
+ vaesenclast %xmm4,%xmm8,%xmm8
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vaesenclast %xmm4,%xmm10,%xmm10
+ vaesenclast %xmm4,%xmm11,%xmm11
+ vaesenclast %xmm4,%xmm12,%xmm12
+
+
+ vpxor 0(%rdi),%xmm7,%xmm7
+ vpxor 16(%rdi),%xmm8,%xmm8
+ vpxor 32(%rdi),%xmm9,%xmm9
+ vpxor 48(%rdi),%xmm10,%xmm10
+ vpxor 64(%rdi),%xmm11,%xmm11
+ vpxor 80(%rdi),%xmm12,%xmm12
+
+ vmovdqu %xmm7,0(%rsi)
+ vmovdqu %xmm8,16(%rsi)
+ vmovdqu %xmm9,32(%rsi)
+ vmovdqu %xmm10,48(%rsi)
+ vmovdqu %xmm11,64(%rsi)
+ vmovdqu %xmm12,80(%rsi)
+
+ addq $96,%rdi
+ addq $96,%rsi
+ jmp .L128_dec_loop1
+
+
+.align 64
+.L128_dec_loop1:
+ cmpq $96,%r9
+ jb .L128_dec_finish_96
+ subq $96,%r9
+
+ vmovdqa %xmm12,%xmm6
+ vmovdqa %xmm11,16-32(%rax)
+ vmovdqa %xmm10,32-32(%rax)
+ vmovdqa %xmm9,48-32(%rax)
+ vmovdqa %xmm8,64-32(%rax)
+ vmovdqa %xmm7,80-32(%rax)
+
+ vmovdqa %xmm15,%xmm7
+ vpaddd one(%rip),%xmm7,%xmm8
+ vpaddd two(%rip),%xmm7,%xmm9
+ vpaddd one(%rip),%xmm9,%xmm10
+ vpaddd two(%rip),%xmm9,%xmm11
+ vpaddd one(%rip),%xmm11,%xmm12
+ vpaddd two(%rip),%xmm11,%xmm15
+
+ vmovdqa (%r8),%xmm4
+ vpxor %xmm4,%xmm7,%xmm7
+ vpxor %xmm4,%xmm8,%xmm8
+ vpxor %xmm4,%xmm9,%xmm9
+ vpxor %xmm4,%xmm10,%xmm10
+ vpxor %xmm4,%xmm11,%xmm11
+ vpxor %xmm4,%xmm12,%xmm12
+
+ vmovdqu 0-32(%rcx),%xmm4
+ vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3
+ vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1
+ vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 16(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu -16(%rax),%xmm6
+ vmovdqu -16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 32(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 0(%rax),%xmm6
+ vmovdqu 0(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 48(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 16(%rax),%xmm6
+ vmovdqu 16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 64(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 32(%rax),%xmm6
+ vmovdqu 32(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 80(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 96(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 112(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+
+ vmovdqa 80-32(%rax),%xmm6
+ vpxor %xmm0,%xmm6,%xmm6
+ vmovdqu 80-32(%rcx),%xmm5
+
+ vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 128(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+
+ vpsrldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm2,%xmm5
+ vpslldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm0
+
+ vmovdqa poly(%rip),%xmm3
+
+ vmovdqu 144(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 160(%r8),%xmm6
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vpxor 0(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm7,%xmm7
+ vpxor 16(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm8,%xmm8
+ vpxor 32(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vpxor 48(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm10,%xmm10
+ vpxor 64(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm11,%xmm11
+ vpxor 80(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm12,%xmm12
+
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vmovdqu %xmm7,0(%rsi)
+ vmovdqu %xmm8,16(%rsi)
+ vmovdqu %xmm9,32(%rsi)
+ vmovdqu %xmm10,48(%rsi)
+ vmovdqu %xmm11,64(%rsi)
+ vmovdqu %xmm12,80(%rsi)
+
+ vpxor %xmm5,%xmm0,%xmm0
+
+ leaq 96(%rdi),%rdi
+ leaq 96(%rsi),%rsi
+ jmp .L128_dec_loop1
+
+.L128_dec_finish_96:
+ vmovdqa %xmm12,%xmm6
+ vmovdqa %xmm11,16-32(%rax)
+ vmovdqa %xmm10,32-32(%rax)
+ vmovdqa %xmm9,48-32(%rax)
+ vmovdqa %xmm8,64-32(%rax)
+ vmovdqa %xmm7,80-32(%rax)
+
+ vmovdqu 0-32(%rcx),%xmm4
+ vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1
+ vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3
+ vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu -16(%rax),%xmm6
+ vmovdqu -16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 0(%rax),%xmm6
+ vmovdqu 0(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 16(%rax),%xmm6
+ vmovdqu 16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 32(%rax),%xmm6
+ vmovdqu 32(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 80-32(%rax),%xmm6
+ vpxor %xmm0,%xmm6,%xmm6
+ vmovdqu 80-32(%rcx),%xmm5
+ vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm2,%xmm5
+ vpslldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm0
+
+ vmovdqa poly(%rip),%xmm3
+
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vpxor %xmm5,%xmm0,%xmm0
+
+.L128_dec_loop2:
+
+
+
+ cmpq $16,%r9
+ jb .L128_dec_out
+ subq $16,%r9
+
+ vmovdqa %xmm15,%xmm2
+ vpaddd one(%rip),%xmm15,%xmm15
+
+ vpxor 0(%r8),%xmm2,%xmm2
+ vaesenc 16(%r8),%xmm2,%xmm2
+ vaesenc 32(%r8),%xmm2,%xmm2
+ vaesenc 48(%r8),%xmm2,%xmm2
+ vaesenc 64(%r8),%xmm2,%xmm2
+ vaesenc 80(%r8),%xmm2,%xmm2
+ vaesenc 96(%r8),%xmm2,%xmm2
+ vaesenc 112(%r8),%xmm2,%xmm2
+ vaesenc 128(%r8),%xmm2,%xmm2
+ vaesenc 144(%r8),%xmm2,%xmm2
+ vaesenclast 160(%r8),%xmm2,%xmm2
+ vpxor (%rdi),%xmm2,%xmm2
+ vmovdqu %xmm2,(%rsi)
+ addq $16,%rdi
+ addq $16,%rsi
+
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa -32(%rcx),%xmm1
+ call GFMUL
+
+ jmp .L128_dec_loop2
+
+.L128_dec_out:
+ vmovdqu %xmm0,(%rdx)
+ ret
+.cfi_endproc
+.size aes128gcmsiv_dec, .-aes128gcmsiv_dec
+.globl aes128gcmsiv_ecb_enc_block
+.hidden aes128gcmsiv_ecb_enc_block
+.type aes128gcmsiv_ecb_enc_block,@function
+.align 16
+aes128gcmsiv_ecb_enc_block:
+.cfi_startproc
+_CET_ENDBR
+ vmovdqa (%rdi),%xmm1
+
+ vpxor (%rdx),%xmm1,%xmm1
+ vaesenc 16(%rdx),%xmm1,%xmm1
+ vaesenc 32(%rdx),%xmm1,%xmm1
+ vaesenc 48(%rdx),%xmm1,%xmm1
+ vaesenc 64(%rdx),%xmm1,%xmm1
+ vaesenc 80(%rdx),%xmm1,%xmm1
+ vaesenc 96(%rdx),%xmm1,%xmm1
+ vaesenc 112(%rdx),%xmm1,%xmm1
+ vaesenc 128(%rdx),%xmm1,%xmm1
+ vaesenc 144(%rdx),%xmm1,%xmm1
+ vaesenclast 160(%rdx),%xmm1,%xmm1
+
+ vmovdqa %xmm1,(%rsi)
+
+ ret
+.cfi_endproc
+.size aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block
+.globl aes256gcmsiv_aes_ks_enc_x1
+.hidden aes256gcmsiv_aes_ks_enc_x1
+.type aes256gcmsiv_aes_ks_enc_x1,@function
+.align 16
+aes256gcmsiv_aes_ks_enc_x1:
+.cfi_startproc
+_CET_ENDBR
+ vmovdqa con1(%rip),%xmm0
+ vmovdqa mask(%rip),%xmm15
+ vmovdqa (%rdi),%xmm8
+ vmovdqa (%rcx),%xmm1
+ vmovdqa 16(%rcx),%xmm3
+ vpxor %xmm1,%xmm8,%xmm8
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm1,(%rdx)
+ vmovdqu %xmm3,16(%rdx)
+ vpxor %xmm14,%xmm14,%xmm14
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenc %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,32(%rdx)
+
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpslldq $4,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm3,48(%rdx)
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenc %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,64(%rdx)
+
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpslldq $4,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm3,80(%rdx)
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenc %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,96(%rdx)
+
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpslldq $4,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm3,112(%rdx)
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenc %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,128(%rdx)
+
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpslldq $4,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm3,144(%rdx)
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenc %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,160(%rdx)
+
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpslldq $4,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm3,176(%rdx)
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenc %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,192(%rdx)
+
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpslldq $4,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm3,208(%rdx)
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenclast %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,224(%rdx)
+
+ vmovdqa %xmm8,(%rsi)
+ ret
+.cfi_endproc
+.size aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1
+.globl aes256gcmsiv_ecb_enc_block
+.hidden aes256gcmsiv_ecb_enc_block
+.type aes256gcmsiv_ecb_enc_block,@function
+.align 16
+aes256gcmsiv_ecb_enc_block:
+.cfi_startproc
+_CET_ENDBR
+ vmovdqa (%rdi),%xmm1
+ vpxor (%rdx),%xmm1,%xmm1
+ vaesenc 16(%rdx),%xmm1,%xmm1
+ vaesenc 32(%rdx),%xmm1,%xmm1
+ vaesenc 48(%rdx),%xmm1,%xmm1
+ vaesenc 64(%rdx),%xmm1,%xmm1
+ vaesenc 80(%rdx),%xmm1,%xmm1
+ vaesenc 96(%rdx),%xmm1,%xmm1
+ vaesenc 112(%rdx),%xmm1,%xmm1
+ vaesenc 128(%rdx),%xmm1,%xmm1
+ vaesenc 144(%rdx),%xmm1,%xmm1
+ vaesenc 160(%rdx),%xmm1,%xmm1
+ vaesenc 176(%rdx),%xmm1,%xmm1
+ vaesenc 192(%rdx),%xmm1,%xmm1
+ vaesenc 208(%rdx),%xmm1,%xmm1
+ vaesenclast 224(%rdx),%xmm1,%xmm1
+ vmovdqa %xmm1,(%rsi)
+ ret
+.cfi_endproc
+.size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block
+.globl aes256gcmsiv_enc_msg_x4
+.hidden aes256gcmsiv_enc_msg_x4
+.type aes256gcmsiv_enc_msg_x4,@function
+.align 16
+aes256gcmsiv_enc_msg_x4:
+.cfi_startproc
+_CET_ENDBR
+ testq %r8,%r8
+ jnz .L256_enc_msg_x4_start
+ ret
+
+.L256_enc_msg_x4_start:
+ movq %r8,%r10
+ shrq $4,%r8
+ shlq $60,%r10
+ jz .L256_enc_msg_x4_start2
+ addq $1,%r8
+
+.L256_enc_msg_x4_start2:
+ movq %r8,%r10
+ shlq $62,%r10
+ shrq $62,%r10
+
+
+ vmovdqa (%rdx),%xmm15
+ vpor OR_MASK(%rip),%xmm15,%xmm15
+
+ vmovdqa four(%rip),%xmm4
+ vmovdqa %xmm15,%xmm0
+ vpaddd one(%rip),%xmm15,%xmm1
+ vpaddd two(%rip),%xmm15,%xmm2
+ vpaddd three(%rip),%xmm15,%xmm3
+
+ shrq $2,%r8
+ je .L256_enc_msg_x4_check_remainder
+
+ subq $64,%rsi
+ subq $64,%rdi
+
+.L256_enc_msg_x4_loop1:
+ addq $64,%rsi
+ addq $64,%rdi
+
+ vmovdqa %xmm0,%xmm5
+ vmovdqa %xmm1,%xmm6
+ vmovdqa %xmm2,%xmm7
+ vmovdqa %xmm3,%xmm8
+
+ vpxor (%rcx),%xmm5,%xmm5
+ vpxor (%rcx),%xmm6,%xmm6
+ vpxor (%rcx),%xmm7,%xmm7
+ vpxor (%rcx),%xmm8,%xmm8
+
+ vmovdqu 16(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm0,%xmm0
+ vmovdqu 32(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm1,%xmm1
+ vmovdqu 48(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm2,%xmm2
+ vmovdqu 64(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm3,%xmm3
+
+ vmovdqu 80(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 96(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 112(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 128(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 144(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 160(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 176(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 192(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 208(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 224(%rcx),%xmm12
+ vaesenclast %xmm12,%xmm5,%xmm5
+ vaesenclast %xmm12,%xmm6,%xmm6
+ vaesenclast %xmm12,%xmm7,%xmm7
+ vaesenclast %xmm12,%xmm8,%xmm8
+
+
+
+ vpxor 0(%rdi),%xmm5,%xmm5
+ vpxor 16(%rdi),%xmm6,%xmm6
+ vpxor 32(%rdi),%xmm7,%xmm7
+ vpxor 48(%rdi),%xmm8,%xmm8
+
+ subq $1,%r8
+
+ vmovdqu %xmm5,0(%rsi)
+ vmovdqu %xmm6,16(%rsi)
+ vmovdqu %xmm7,32(%rsi)
+ vmovdqu %xmm8,48(%rsi)
+
+ jne .L256_enc_msg_x4_loop1
+
+ addq $64,%rsi
+ addq $64,%rdi
+
+.L256_enc_msg_x4_check_remainder:
+ cmpq $0,%r10
+ je .L256_enc_msg_x4_out
+
+.L256_enc_msg_x4_loop2:
+
+
+
+ vmovdqa %xmm0,%xmm5
+ vpaddd one(%rip),%xmm0,%xmm0
+ vpxor (%rcx),%xmm5,%xmm5
+ vaesenc 16(%rcx),%xmm5,%xmm5
+ vaesenc 32(%rcx),%xmm5,%xmm5
+ vaesenc 48(%rcx),%xmm5,%xmm5
+ vaesenc 64(%rcx),%xmm5,%xmm5
+ vaesenc 80(%rcx),%xmm5,%xmm5
+ vaesenc 96(%rcx),%xmm5,%xmm5
+ vaesenc 112(%rcx),%xmm5,%xmm5
+ vaesenc 128(%rcx),%xmm5,%xmm5
+ vaesenc 144(%rcx),%xmm5,%xmm5
+ vaesenc 160(%rcx),%xmm5,%xmm5
+ vaesenc 176(%rcx),%xmm5,%xmm5
+ vaesenc 192(%rcx),%xmm5,%xmm5
+ vaesenc 208(%rcx),%xmm5,%xmm5
+ vaesenclast 224(%rcx),%xmm5,%xmm5
+
+
+ vpxor (%rdi),%xmm5,%xmm5
+
+ vmovdqu %xmm5,(%rsi)
+
+ addq $16,%rdi
+ addq $16,%rsi
+
+ subq $1,%r10
+ jne .L256_enc_msg_x4_loop2
+
+.L256_enc_msg_x4_out:
+ ret
+.cfi_endproc
+.size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4
+.globl aes256gcmsiv_enc_msg_x8
+.hidden aes256gcmsiv_enc_msg_x8
+.type aes256gcmsiv_enc_msg_x8,@function
+.align 16
+aes256gcmsiv_enc_msg_x8:
+.cfi_startproc
+_CET_ENDBR
+ testq %r8,%r8
+ jnz .L256_enc_msg_x8_start
+ ret
+
+.L256_enc_msg_x8_start:
+
+ movq %rsp,%r11
+ subq $16,%r11
+ andq $-64,%r11
+
+ movq %r8,%r10
+ shrq $4,%r8
+ shlq $60,%r10
+ jz .L256_enc_msg_x8_start2
+ addq $1,%r8
+
+.L256_enc_msg_x8_start2:
+ movq %r8,%r10
+ shlq $61,%r10
+ shrq $61,%r10
+
+
+ vmovdqa (%rdx),%xmm1
+ vpor OR_MASK(%rip),%xmm1,%xmm1
+
+
+ vpaddd seven(%rip),%xmm1,%xmm0
+ vmovdqa %xmm0,(%r11)
+ vpaddd one(%rip),%xmm1,%xmm9
+ vpaddd two(%rip),%xmm1,%xmm10
+ vpaddd three(%rip),%xmm1,%xmm11
+ vpaddd four(%rip),%xmm1,%xmm12
+ vpaddd five(%rip),%xmm1,%xmm13
+ vpaddd six(%rip),%xmm1,%xmm14
+ vmovdqa %xmm1,%xmm0
+
+ shrq $3,%r8
+ jz .L256_enc_msg_x8_check_remainder
+
+ subq $128,%rsi
+ subq $128,%rdi
+
+.L256_enc_msg_x8_loop1:
+ addq $128,%rsi
+ addq $128,%rdi
+
+ vmovdqa %xmm0,%xmm1
+ vmovdqa %xmm9,%xmm2
+ vmovdqa %xmm10,%xmm3
+ vmovdqa %xmm11,%xmm4
+ vmovdqa %xmm12,%xmm5
+ vmovdqa %xmm13,%xmm6
+ vmovdqa %xmm14,%xmm7
+
+ vmovdqa (%r11),%xmm8
+
+ vpxor (%rcx),%xmm1,%xmm1
+ vpxor (%rcx),%xmm2,%xmm2
+ vpxor (%rcx),%xmm3,%xmm3
+ vpxor (%rcx),%xmm4,%xmm4
+ vpxor (%rcx),%xmm5,%xmm5
+ vpxor (%rcx),%xmm6,%xmm6
+ vpxor (%rcx),%xmm7,%xmm7
+ vpxor (%rcx),%xmm8,%xmm8
+
+ vmovdqu 16(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqa (%r11),%xmm14
+ vpaddd eight(%rip),%xmm14,%xmm14
+ vmovdqa %xmm14,(%r11)
+ vmovdqu 32(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpsubd one(%rip),%xmm14,%xmm14
+ vmovdqu 48(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm0,%xmm0
+ vmovdqu 64(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm9,%xmm9
+ vmovdqu 80(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm10,%xmm10
+ vmovdqu 96(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm11,%xmm11
+ vmovdqu 112(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm12,%xmm12
+ vmovdqu 128(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm13,%xmm13
+ vmovdqu 144(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu 160(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu 176(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu 192(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu 208(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu 224(%rcx),%xmm15
+ vaesenclast %xmm15,%xmm1,%xmm1
+ vaesenclast %xmm15,%xmm2,%xmm2
+ vaesenclast %xmm15,%xmm3,%xmm3
+ vaesenclast %xmm15,%xmm4,%xmm4
+ vaesenclast %xmm15,%xmm5,%xmm5
+ vaesenclast %xmm15,%xmm6,%xmm6
+ vaesenclast %xmm15,%xmm7,%xmm7
+ vaesenclast %xmm15,%xmm8,%xmm8
+
+
+
+ vpxor 0(%rdi),%xmm1,%xmm1
+ vpxor 16(%rdi),%xmm2,%xmm2
+ vpxor 32(%rdi),%xmm3,%xmm3
+ vpxor 48(%rdi),%xmm4,%xmm4
+ vpxor 64(%rdi),%xmm5,%xmm5
+ vpxor 80(%rdi),%xmm6,%xmm6
+ vpxor 96(%rdi),%xmm7,%xmm7
+ vpxor 112(%rdi),%xmm8,%xmm8
+
+ subq $1,%r8
+
+ vmovdqu %xmm1,0(%rsi)
+ vmovdqu %xmm2,16(%rsi)
+ vmovdqu %xmm3,32(%rsi)
+ vmovdqu %xmm4,48(%rsi)
+ vmovdqu %xmm5,64(%rsi)
+ vmovdqu %xmm6,80(%rsi)
+ vmovdqu %xmm7,96(%rsi)
+ vmovdqu %xmm8,112(%rsi)
+
+ jne .L256_enc_msg_x8_loop1
+
+ addq $128,%rsi
+ addq $128,%rdi
+
+.L256_enc_msg_x8_check_remainder:
+ cmpq $0,%r10
+ je .L256_enc_msg_x8_out
+
+.L256_enc_msg_x8_loop2:
+
+
+ vmovdqa %xmm0,%xmm1
+ vpaddd one(%rip),%xmm0,%xmm0
+
+ vpxor (%rcx),%xmm1,%xmm1
+ vaesenc 16(%rcx),%xmm1,%xmm1
+ vaesenc 32(%rcx),%xmm1,%xmm1
+ vaesenc 48(%rcx),%xmm1,%xmm1
+ vaesenc 64(%rcx),%xmm1,%xmm1
+ vaesenc 80(%rcx),%xmm1,%xmm1
+ vaesenc 96(%rcx),%xmm1,%xmm1
+ vaesenc 112(%rcx),%xmm1,%xmm1
+ vaesenc 128(%rcx),%xmm1,%xmm1
+ vaesenc 144(%rcx),%xmm1,%xmm1
+ vaesenc 160(%rcx),%xmm1,%xmm1
+ vaesenc 176(%rcx),%xmm1,%xmm1
+ vaesenc 192(%rcx),%xmm1,%xmm1
+ vaesenc 208(%rcx),%xmm1,%xmm1
+ vaesenclast 224(%rcx),%xmm1,%xmm1
+
+
+ vpxor (%rdi),%xmm1,%xmm1
+
+ vmovdqu %xmm1,(%rsi)
+
+ addq $16,%rdi
+ addq $16,%rsi
+ subq $1,%r10
+ jnz .L256_enc_msg_x8_loop2
+
+.L256_enc_msg_x8_out:
+ ret
+
+.cfi_endproc
+.size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8
+.globl aes256gcmsiv_dec
+.hidden aes256gcmsiv_dec
+.type aes256gcmsiv_dec,@function
+.align 16
+aes256gcmsiv_dec:
+.cfi_startproc
+_CET_ENDBR
+ testq $~15,%r9
+ jnz .L256_dec_start
+ ret
+
+.L256_dec_start:
+ vzeroupper
+ vmovdqa (%rdx),%xmm0
+
+
+ vmovdqu 16(%rdx),%xmm15
+ vpor OR_MASK(%rip),%xmm15,%xmm15
+ movq %rdx,%rax
+
+ leaq 32(%rax),%rax
+ leaq 32(%rcx),%rcx
+
+ andq $~15,%r9
+
+
+ cmpq $96,%r9
+ jb .L256_dec_loop2
+
+
+ subq $96,%r9
+ vmovdqa %xmm15,%xmm7
+ vpaddd one(%rip),%xmm7,%xmm8
+ vpaddd two(%rip),%xmm7,%xmm9
+ vpaddd one(%rip),%xmm9,%xmm10
+ vpaddd two(%rip),%xmm9,%xmm11
+ vpaddd one(%rip),%xmm11,%xmm12
+ vpaddd two(%rip),%xmm11,%xmm15
+
+ vpxor (%r8),%xmm7,%xmm7
+ vpxor (%r8),%xmm8,%xmm8
+ vpxor (%r8),%xmm9,%xmm9
+ vpxor (%r8),%xmm10,%xmm10
+ vpxor (%r8),%xmm11,%xmm11
+ vpxor (%r8),%xmm12,%xmm12
+
+ vmovdqu 16(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 32(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 48(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 64(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 80(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 96(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 112(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 128(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 144(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 160(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 176(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 192(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 208(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 224(%r8),%xmm4
+ vaesenclast %xmm4,%xmm7,%xmm7
+ vaesenclast %xmm4,%xmm8,%xmm8
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vaesenclast %xmm4,%xmm10,%xmm10
+ vaesenclast %xmm4,%xmm11,%xmm11
+ vaesenclast %xmm4,%xmm12,%xmm12
+
+
+ vpxor 0(%rdi),%xmm7,%xmm7
+ vpxor 16(%rdi),%xmm8,%xmm8
+ vpxor 32(%rdi),%xmm9,%xmm9
+ vpxor 48(%rdi),%xmm10,%xmm10
+ vpxor 64(%rdi),%xmm11,%xmm11
+ vpxor 80(%rdi),%xmm12,%xmm12
+
+ vmovdqu %xmm7,0(%rsi)
+ vmovdqu %xmm8,16(%rsi)
+ vmovdqu %xmm9,32(%rsi)
+ vmovdqu %xmm10,48(%rsi)
+ vmovdqu %xmm11,64(%rsi)
+ vmovdqu %xmm12,80(%rsi)
+
+ addq $96,%rdi
+ addq $96,%rsi
+ jmp .L256_dec_loop1
+
+
+.align 64
+.L256_dec_loop1:
+ cmpq $96,%r9
+ jb .L256_dec_finish_96
+ subq $96,%r9
+
+ vmovdqa %xmm12,%xmm6
+ vmovdqa %xmm11,16-32(%rax)
+ vmovdqa %xmm10,32-32(%rax)
+ vmovdqa %xmm9,48-32(%rax)
+ vmovdqa %xmm8,64-32(%rax)
+ vmovdqa %xmm7,80-32(%rax)
+
+ vmovdqa %xmm15,%xmm7
+ vpaddd one(%rip),%xmm7,%xmm8
+ vpaddd two(%rip),%xmm7,%xmm9
+ vpaddd one(%rip),%xmm9,%xmm10
+ vpaddd two(%rip),%xmm9,%xmm11
+ vpaddd one(%rip),%xmm11,%xmm12
+ vpaddd two(%rip),%xmm11,%xmm15
+
+ vmovdqa (%r8),%xmm4
+ vpxor %xmm4,%xmm7,%xmm7
+ vpxor %xmm4,%xmm8,%xmm8
+ vpxor %xmm4,%xmm9,%xmm9
+ vpxor %xmm4,%xmm10,%xmm10
+ vpxor %xmm4,%xmm11,%xmm11
+ vpxor %xmm4,%xmm12,%xmm12
+
+ vmovdqu 0-32(%rcx),%xmm4
+ vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3
+ vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1
+ vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 16(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu -16(%rax),%xmm6
+ vmovdqu -16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 32(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 0(%rax),%xmm6
+ vmovdqu 0(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 48(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 16(%rax),%xmm6
+ vmovdqu 16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 64(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 32(%rax),%xmm6
+ vmovdqu 32(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 80(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 96(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 112(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+
+ vmovdqa 80-32(%rax),%xmm6
+ vpxor %xmm0,%xmm6,%xmm6
+ vmovdqu 80-32(%rcx),%xmm5
+
+ vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 128(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+
+ vpsrldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm2,%xmm5
+ vpslldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm0
+
+ vmovdqa poly(%rip),%xmm3
+
+ vmovdqu 144(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 160(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 176(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 192(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 208(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 224(%r8),%xmm6
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vpxor 0(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm7,%xmm7
+ vpxor 16(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm8,%xmm8
+ vpxor 32(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vpxor 48(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm10,%xmm10
+ vpxor 64(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm11,%xmm11
+ vpxor 80(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm12,%xmm12
+
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vmovdqu %xmm7,0(%rsi)
+ vmovdqu %xmm8,16(%rsi)
+ vmovdqu %xmm9,32(%rsi)
+ vmovdqu %xmm10,48(%rsi)
+ vmovdqu %xmm11,64(%rsi)
+ vmovdqu %xmm12,80(%rsi)
+
+ vpxor %xmm5,%xmm0,%xmm0
+
+ leaq 96(%rdi),%rdi
+ leaq 96(%rsi),%rsi
+ jmp .L256_dec_loop1
+
+.L256_dec_finish_96:
+ vmovdqa %xmm12,%xmm6
+ vmovdqa %xmm11,16-32(%rax)
+ vmovdqa %xmm10,32-32(%rax)
+ vmovdqa %xmm9,48-32(%rax)
+ vmovdqa %xmm8,64-32(%rax)
+ vmovdqa %xmm7,80-32(%rax)
+
+ vmovdqu 0-32(%rcx),%xmm4
+ vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1
+ vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3
+ vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu -16(%rax),%xmm6
+ vmovdqu -16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 0(%rax),%xmm6
+ vmovdqu 0(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 16(%rax),%xmm6
+ vmovdqu 16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 32(%rax),%xmm6
+ vmovdqu 32(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 80-32(%rax),%xmm6
+ vpxor %xmm0,%xmm6,%xmm6
+ vmovdqu 80-32(%rcx),%xmm5
+ vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm2,%xmm5
+ vpslldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm0
+
+ vmovdqa poly(%rip),%xmm3
+
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vpxor %xmm5,%xmm0,%xmm0
+
+.L256_dec_loop2:
+
+
+
+ cmpq $16,%r9
+ jb .L256_dec_out
+ subq $16,%r9
+
+ vmovdqa %xmm15,%xmm2
+ vpaddd one(%rip),%xmm15,%xmm15
+
+ vpxor 0(%r8),%xmm2,%xmm2
+ vaesenc 16(%r8),%xmm2,%xmm2
+ vaesenc 32(%r8),%xmm2,%xmm2
+ vaesenc 48(%r8),%xmm2,%xmm2
+ vaesenc 64(%r8),%xmm2,%xmm2
+ vaesenc 80(%r8),%xmm2,%xmm2
+ vaesenc 96(%r8),%xmm2,%xmm2
+ vaesenc 112(%r8),%xmm2,%xmm2
+ vaesenc 128(%r8),%xmm2,%xmm2
+ vaesenc 144(%r8),%xmm2,%xmm2
+ vaesenc 160(%r8),%xmm2,%xmm2
+ vaesenc 176(%r8),%xmm2,%xmm2
+ vaesenc 192(%r8),%xmm2,%xmm2
+ vaesenc 208(%r8),%xmm2,%xmm2
+ vaesenclast 224(%r8),%xmm2,%xmm2
+ vpxor (%rdi),%xmm2,%xmm2
+ vmovdqu %xmm2,(%rsi)
+ addq $16,%rdi
+ addq $16,%rsi
+
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa -32(%rcx),%xmm1
+ call GFMUL
+
+ jmp .L256_dec_loop2
+
+.L256_dec_out:
+ vmovdqu %xmm0,(%rdx)
+ ret
+.cfi_endproc
+.size aes256gcmsiv_dec, .-aes256gcmsiv_dec
+.globl aes256gcmsiv_kdf
+.hidden aes256gcmsiv_kdf
+.type aes256gcmsiv_kdf,@function
+.align 16
+aes256gcmsiv_kdf:
+.cfi_startproc
+_CET_ENDBR
+
+
+
+
+ vmovdqa (%rdx),%xmm1
+ vmovdqa 0(%rdi),%xmm4
+ vmovdqa and_mask(%rip),%xmm11
+ vmovdqa one(%rip),%xmm8
+ vpshufd $0x90,%xmm4,%xmm4
+ vpand %xmm11,%xmm4,%xmm4
+ vpaddd %xmm8,%xmm4,%xmm6
+ vpaddd %xmm8,%xmm6,%xmm7
+ vpaddd %xmm8,%xmm7,%xmm11
+ vpaddd %xmm8,%xmm11,%xmm12
+ vpaddd %xmm8,%xmm12,%xmm13
+
+ vpxor %xmm1,%xmm4,%xmm4
+ vpxor %xmm1,%xmm6,%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm1,%xmm11,%xmm11
+ vpxor %xmm1,%xmm12,%xmm12
+ vpxor %xmm1,%xmm13,%xmm13
+
+ vmovdqa 16(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 32(%rdx),%xmm2
+ vaesenc %xmm2,%xmm4,%xmm4
+ vaesenc %xmm2,%xmm6,%xmm6
+ vaesenc %xmm2,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+ vaesenc %xmm2,%xmm13,%xmm13
+
+ vmovdqa 48(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 64(%rdx),%xmm2
+ vaesenc %xmm2,%xmm4,%xmm4
+ vaesenc %xmm2,%xmm6,%xmm6
+ vaesenc %xmm2,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+ vaesenc %xmm2,%xmm13,%xmm13
+
+ vmovdqa 80(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 96(%rdx),%xmm2
+ vaesenc %xmm2,%xmm4,%xmm4
+ vaesenc %xmm2,%xmm6,%xmm6
+ vaesenc %xmm2,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+ vaesenc %xmm2,%xmm13,%xmm13
+
+ vmovdqa 112(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 128(%rdx),%xmm2
+ vaesenc %xmm2,%xmm4,%xmm4
+ vaesenc %xmm2,%xmm6,%xmm6
+ vaesenc %xmm2,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+ vaesenc %xmm2,%xmm13,%xmm13
+
+ vmovdqa 144(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 160(%rdx),%xmm2
+ vaesenc %xmm2,%xmm4,%xmm4
+ vaesenc %xmm2,%xmm6,%xmm6
+ vaesenc %xmm2,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+ vaesenc %xmm2,%xmm13,%xmm13
+
+ vmovdqa 176(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 192(%rdx),%xmm2
+ vaesenc %xmm2,%xmm4,%xmm4
+ vaesenc %xmm2,%xmm6,%xmm6
+ vaesenc %xmm2,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+ vaesenc %xmm2,%xmm13,%xmm13
+
+ vmovdqa 208(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 224(%rdx),%xmm2
+ vaesenclast %xmm2,%xmm4,%xmm4
+ vaesenclast %xmm2,%xmm6,%xmm6
+ vaesenclast %xmm2,%xmm7,%xmm7
+ vaesenclast %xmm2,%xmm11,%xmm11
+ vaesenclast %xmm2,%xmm12,%xmm12
+ vaesenclast %xmm2,%xmm13,%xmm13
+
+
+ vmovdqa %xmm4,0(%rsi)
+ vmovdqa %xmm6,16(%rsi)
+ vmovdqa %xmm7,32(%rsi)
+ vmovdqa %xmm11,48(%rsi)
+ vmovdqa %xmm12,64(%rsi)
+ vmovdqa %xmm13,80(%rsi)
+ ret
+.cfi_endproc
+.size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf
+#endif
diff --git a/gen/crypto/aes128gcmsiv-x86_64-win.asm b/gen/crypto/aes128gcmsiv-x86_64-win.asm
new file mode 100644
index 0000000..6691a2d
--- /dev/null
+++ b/gen/crypto/aes128gcmsiv-x86_64-win.asm
@@ -0,0 +1,3302 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .rdata rdata align=8
+
+ALIGN 16
+one:
+ DQ 1,0
+two:
+ DQ 2,0
+three:
+ DQ 3,0
+four:
+ DQ 4,0
+five:
+ DQ 5,0
+six:
+ DQ 6,0
+seven:
+ DQ 7,0
+eight:
+ DQ 8,0
+
+OR_MASK:
+ DD 0x00000000,0x00000000,0x00000000,0x80000000
+poly:
+ DQ 0x1,0xc200000000000000
+mask:
+ DD 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+con1:
+ DD 1,1,1,1
+con2:
+ DD 0x1b,0x1b,0x1b,0x1b
+con3:
+ DB -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
+and_mask:
+ DD 0,0xffffffff,0xffffffff,0xffffffff
+section .text code align=64
+
+
+ALIGN 16
+GFMUL:
+
+ vpclmulqdq xmm2,xmm0,xmm1,0x00
+ vpclmulqdq xmm5,xmm0,xmm1,0x11
+ vpclmulqdq xmm3,xmm0,xmm1,0x10
+ vpclmulqdq xmm4,xmm0,xmm1,0x01
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm3,8
+ vpsrldq xmm3,xmm3,8
+ vpxor xmm2,xmm2,xmm4
+ vpxor xmm5,xmm5,xmm3
+
+ vpclmulqdq xmm3,xmm2,XMMWORD[poly],0x10
+ vpshufd xmm4,xmm2,78
+ vpxor xmm2,xmm3,xmm4
+
+ vpclmulqdq xmm3,xmm2,XMMWORD[poly],0x10
+ vpshufd xmm4,xmm2,78
+ vpxor xmm2,xmm3,xmm4
+
+ vpxor xmm0,xmm2,xmm5
+ ret
+
+
+global aesgcmsiv_htable_init
+
+ALIGN 16
+aesgcmsiv_htable_init:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aesgcmsiv_htable_init:
+ mov rdi,rcx
+ mov rsi,rdx
+
+
+
+_CET_ENDBR
+ vmovdqa xmm0,XMMWORD[rsi]
+ vmovdqa xmm1,xmm0
+ vmovdqa XMMWORD[rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[16+rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[32+rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[48+rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[64+rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[80+rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[96+rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[112+rdi],xmm0
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_aesgcmsiv_htable_init:
+global aesgcmsiv_htable6_init
+
+ALIGN 16
+aesgcmsiv_htable6_init:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aesgcmsiv_htable6_init:
+ mov rdi,rcx
+ mov rsi,rdx
+
+
+
+_CET_ENDBR
+ vmovdqa xmm0,XMMWORD[rsi]
+ vmovdqa xmm1,xmm0
+ vmovdqa XMMWORD[rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[16+rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[32+rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[48+rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[64+rdi],xmm0
+ call GFMUL
+ vmovdqa XMMWORD[80+rdi],xmm0
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_aesgcmsiv_htable6_init:
+global aesgcmsiv_htable_polyval
+
+ALIGN 16
+aesgcmsiv_htable_polyval:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aesgcmsiv_htable_polyval:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+
+
+
+_CET_ENDBR
+ test rdx,rdx
+ jnz NEAR $L$htable_polyval_start
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$htable_polyval_start:
+ vzeroall
+
+
+
+ mov r11,rdx
+ and r11,127
+
+ jz NEAR $L$htable_polyval_no_prefix
+
+ vpxor xmm9,xmm9,xmm9
+ vmovdqa xmm1,XMMWORD[rcx]
+ sub rdx,r11
+
+ sub r11,16
+
+
+ vmovdqu xmm0,XMMWORD[rsi]
+ vpxor xmm0,xmm0,xmm1
+
+ vpclmulqdq xmm5,xmm0,XMMWORD[r11*1+rdi],0x01
+ vpclmulqdq xmm3,xmm0,XMMWORD[r11*1+rdi],0x00
+ vpclmulqdq xmm4,xmm0,XMMWORD[r11*1+rdi],0x11
+ vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x10
+ vpxor xmm5,xmm5,xmm6
+
+ lea rsi,[16+rsi]
+ test r11,r11
+ jnz NEAR $L$htable_polyval_prefix_loop
+ jmp NEAR $L$htable_polyval_prefix_complete
+
+
+ALIGN 64
+$L$htable_polyval_prefix_loop:
+ sub r11,16
+
+ vmovdqu xmm0,XMMWORD[rsi]
+
+ vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x00
+ vpxor xmm3,xmm3,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x11
+ vpxor xmm4,xmm4,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x01
+ vpxor xmm5,xmm5,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x10
+ vpxor xmm5,xmm5,xmm6
+
+ test r11,r11
+
+ lea rsi,[16+rsi]
+
+ jnz NEAR $L$htable_polyval_prefix_loop
+
+$L$htable_polyval_prefix_complete:
+ vpsrldq xmm6,xmm5,8
+ vpslldq xmm5,xmm5,8
+
+ vpxor xmm9,xmm4,xmm6
+ vpxor xmm1,xmm3,xmm5
+
+ jmp NEAR $L$htable_polyval_main_loop
+
+$L$htable_polyval_no_prefix:
+
+
+
+
+ vpxor xmm1,xmm1,xmm1
+ vmovdqa xmm9,XMMWORD[rcx]
+
+ALIGN 64
+$L$htable_polyval_main_loop:
+ sub rdx,0x80
+ jb NEAR $L$htable_polyval_out
+
+ vmovdqu xmm0,XMMWORD[112+rsi]
+
+ vpclmulqdq xmm5,xmm0,XMMWORD[rdi],0x01
+ vpclmulqdq xmm3,xmm0,XMMWORD[rdi],0x00
+ vpclmulqdq xmm4,xmm0,XMMWORD[rdi],0x11
+ vpclmulqdq xmm6,xmm0,XMMWORD[rdi],0x10
+ vpxor xmm5,xmm5,xmm6
+
+
+ vmovdqu xmm0,XMMWORD[96+rsi]
+ vpclmulqdq xmm6,xmm0,XMMWORD[16+rdi],0x01
+ vpxor xmm5,xmm5,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[16+rdi],0x00
+ vpxor xmm3,xmm3,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[16+rdi],0x11
+ vpxor xmm4,xmm4,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[16+rdi],0x10
+ vpxor xmm5,xmm5,xmm6
+
+
+
+ vmovdqu xmm0,XMMWORD[80+rsi]
+
+ vpclmulqdq xmm7,xmm1,XMMWORD[poly],0x10
+ vpalignr xmm1,xmm1,xmm1,8
+
+ vpclmulqdq xmm6,xmm0,XMMWORD[32+rdi],0x01
+ vpxor xmm5,xmm5,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[32+rdi],0x00
+ vpxor xmm3,xmm3,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[32+rdi],0x11
+ vpxor xmm4,xmm4,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[32+rdi],0x10
+ vpxor xmm5,xmm5,xmm6
+
+
+ vpxor xmm1,xmm1,xmm7
+
+ vmovdqu xmm0,XMMWORD[64+rsi]
+
+ vpclmulqdq xmm6,xmm0,XMMWORD[48+rdi],0x01
+ vpxor xmm5,xmm5,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[48+rdi],0x00
+ vpxor xmm3,xmm3,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[48+rdi],0x11
+ vpxor xmm4,xmm4,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[48+rdi],0x10
+ vpxor xmm5,xmm5,xmm6
+
+
+ vmovdqu xmm0,XMMWORD[48+rsi]
+
+ vpclmulqdq xmm7,xmm1,XMMWORD[poly],0x10
+ vpalignr xmm1,xmm1,xmm1,8
+
+ vpclmulqdq xmm6,xmm0,XMMWORD[64+rdi],0x01
+ vpxor xmm5,xmm5,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[64+rdi],0x00
+ vpxor xmm3,xmm3,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[64+rdi],0x11
+ vpxor xmm4,xmm4,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[64+rdi],0x10
+ vpxor xmm5,xmm5,xmm6
+
+
+ vpxor xmm1,xmm1,xmm7
+
+ vmovdqu xmm0,XMMWORD[32+rsi]
+
+ vpclmulqdq xmm6,xmm0,XMMWORD[80+rdi],0x01
+ vpxor xmm5,xmm5,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[80+rdi],0x00
+ vpxor xmm3,xmm3,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[80+rdi],0x11
+ vpxor xmm4,xmm4,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[80+rdi],0x10
+ vpxor xmm5,xmm5,xmm6
+
+
+ vpxor xmm1,xmm1,xmm9
+
+ vmovdqu xmm0,XMMWORD[16+rsi]
+
+ vpclmulqdq xmm6,xmm0,XMMWORD[96+rdi],0x01
+ vpxor xmm5,xmm5,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[96+rdi],0x00
+ vpxor xmm3,xmm3,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[96+rdi],0x11
+ vpxor xmm4,xmm4,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[96+rdi],0x10
+ vpxor xmm5,xmm5,xmm6
+
+
+ vmovdqu xmm0,XMMWORD[rsi]
+ vpxor xmm0,xmm0,xmm1
+
+ vpclmulqdq xmm6,xmm0,XMMWORD[112+rdi],0x01
+ vpxor xmm5,xmm5,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[112+rdi],0x00
+ vpxor xmm3,xmm3,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[112+rdi],0x11
+ vpxor xmm4,xmm4,xmm6
+ vpclmulqdq xmm6,xmm0,XMMWORD[112+rdi],0x10
+ vpxor xmm5,xmm5,xmm6
+
+
+ vpsrldq xmm6,xmm5,8
+ vpslldq xmm5,xmm5,8
+
+ vpxor xmm9,xmm4,xmm6
+ vpxor xmm1,xmm3,xmm5
+
+ lea rsi,[128+rsi]
+ jmp NEAR $L$htable_polyval_main_loop
+
+
+
+$L$htable_polyval_out:
+ vpclmulqdq xmm6,xmm1,XMMWORD[poly],0x10
+ vpalignr xmm1,xmm1,xmm1,8
+ vpxor xmm1,xmm1,xmm6
+
+ vpclmulqdq xmm6,xmm1,XMMWORD[poly],0x10
+ vpalignr xmm1,xmm1,xmm1,8
+ vpxor xmm1,xmm1,xmm6
+ vpxor xmm1,xmm1,xmm9
+
+ vmovdqu XMMWORD[rcx],xmm1
+ vzeroupper
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_aesgcmsiv_htable_polyval:
+global aesgcmsiv_polyval_horner
+
+ALIGN 16
+aesgcmsiv_polyval_horner:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aesgcmsiv_polyval_horner:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+
+
+
+_CET_ENDBR
+ test rcx,rcx
+ jnz NEAR $L$polyval_horner_start
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$polyval_horner_start:
+
+
+
+ xor r10,r10
+ shl rcx,4
+
+ vmovdqa xmm1,XMMWORD[rsi]
+ vmovdqa xmm0,XMMWORD[rdi]
+
+$L$polyval_horner_loop:
+ vpxor xmm0,xmm0,XMMWORD[r10*1+rdx]
+ call GFMUL
+
+ add r10,16
+ cmp rcx,r10
+ jne NEAR $L$polyval_horner_loop
+
+
+ vmovdqa XMMWORD[rdi],xmm0
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_aesgcmsiv_polyval_horner:
+global aes128gcmsiv_aes_ks
+
+ALIGN 16
+aes128gcmsiv_aes_ks:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes128gcmsiv_aes_ks:
+ mov rdi,rcx
+ mov rsi,rdx
+
+
+
+_CET_ENDBR
+ vmovdqu xmm1,XMMWORD[rdi]
+ vmovdqa XMMWORD[rsi],xmm1
+
+ vmovdqa xmm0,XMMWORD[con1]
+ vmovdqa xmm15,XMMWORD[mask]
+
+ mov rax,8
+
+$L$ks128_loop:
+ add rsi,16
+ sub rax,1
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpslldq xmm3,xmm1,4
+ vpxor xmm1,xmm1,xmm3
+ vpslldq xmm3,xmm3,4
+ vpxor xmm1,xmm1,xmm3
+ vpslldq xmm3,xmm3,4
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+ vmovdqa XMMWORD[rsi],xmm1
+ jne NEAR $L$ks128_loop
+
+ vmovdqa xmm0,XMMWORD[con2]
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpslldq xmm3,xmm1,4
+ vpxor xmm1,xmm1,xmm3
+ vpslldq xmm3,xmm3,4
+ vpxor xmm1,xmm1,xmm3
+ vpslldq xmm3,xmm3,4
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+ vmovdqa XMMWORD[16+rsi],xmm1
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslldq xmm3,xmm1,4
+ vpxor xmm1,xmm1,xmm3
+ vpslldq xmm3,xmm3,4
+ vpxor xmm1,xmm1,xmm3
+ vpslldq xmm3,xmm3,4
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+ vmovdqa XMMWORD[32+rsi],xmm1
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_aes128gcmsiv_aes_ks:
+global aes256gcmsiv_aes_ks
+
+ALIGN 16
+aes256gcmsiv_aes_ks:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes256gcmsiv_aes_ks:
+ mov rdi,rcx
+ mov rsi,rdx
+
+
+
+_CET_ENDBR
+ vmovdqu xmm1,XMMWORD[rdi]
+ vmovdqu xmm3,XMMWORD[16+rdi]
+ vmovdqa XMMWORD[rsi],xmm1
+ vmovdqa XMMWORD[16+rsi],xmm3
+ vmovdqa xmm0,XMMWORD[con1]
+ vmovdqa xmm15,XMMWORD[mask]
+ vpxor xmm14,xmm14,xmm14
+ mov rax,6
+
+$L$ks256_loop:
+ add rsi,32
+ sub rax,1
+ vpshufb xmm2,xmm3,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpsllq xmm4,xmm1,32
+ vpxor xmm1,xmm1,xmm4
+ vpshufb xmm4,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm4
+ vpxor xmm1,xmm1,xmm2
+ vmovdqa XMMWORD[rsi],xmm1
+ vpshufd xmm2,xmm1,0xff
+ vaesenclast xmm2,xmm2,xmm14
+ vpsllq xmm4,xmm3,32
+ vpxor xmm3,xmm3,xmm4
+ vpshufb xmm4,xmm3,XMMWORD[con3]
+ vpxor xmm3,xmm3,xmm4
+ vpxor xmm3,xmm3,xmm2
+ vmovdqa XMMWORD[16+rsi],xmm3
+ jne NEAR $L$ks256_loop
+
+ vpshufb xmm2,xmm3,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpsllq xmm4,xmm1,32
+ vpxor xmm1,xmm1,xmm4
+ vpshufb xmm4,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm4
+ vpxor xmm1,xmm1,xmm2
+ vmovdqa XMMWORD[32+rsi],xmm1
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+global aes128gcmsiv_aes_ks_enc_x1
+
+ALIGN 16
+aes128gcmsiv_aes_ks_enc_x1:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes128gcmsiv_aes_ks_enc_x1:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+
+
+
+_CET_ENDBR
+ vmovdqa xmm1,XMMWORD[rcx]
+ vmovdqa xmm4,XMMWORD[rdi]
+
+ vmovdqa XMMWORD[rdx],xmm1
+ vpxor xmm4,xmm4,xmm1
+
+ vmovdqa xmm0,XMMWORD[con1]
+ vmovdqa xmm15,XMMWORD[mask]
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpsllq xmm3,xmm1,32
+ vpxor xmm1,xmm1,xmm3
+ vpshufb xmm3,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+
+ vaesenc xmm4,xmm4,xmm1
+ vmovdqa XMMWORD[16+rdx],xmm1
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpsllq xmm3,xmm1,32
+ vpxor xmm1,xmm1,xmm3
+ vpshufb xmm3,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+
+ vaesenc xmm4,xmm4,xmm1
+ vmovdqa XMMWORD[32+rdx],xmm1
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpsllq xmm3,xmm1,32
+ vpxor xmm1,xmm1,xmm3
+ vpshufb xmm3,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+
+ vaesenc xmm4,xmm4,xmm1
+ vmovdqa XMMWORD[48+rdx],xmm1
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpsllq xmm3,xmm1,32
+ vpxor xmm1,xmm1,xmm3
+ vpshufb xmm3,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+
+ vaesenc xmm4,xmm4,xmm1
+ vmovdqa XMMWORD[64+rdx],xmm1
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpsllq xmm3,xmm1,32
+ vpxor xmm1,xmm1,xmm3
+ vpshufb xmm3,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+
+ vaesenc xmm4,xmm4,xmm1
+ vmovdqa XMMWORD[80+rdx],xmm1
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpsllq xmm3,xmm1,32
+ vpxor xmm1,xmm1,xmm3
+ vpshufb xmm3,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+
+ vaesenc xmm4,xmm4,xmm1
+ vmovdqa XMMWORD[96+rdx],xmm1
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpsllq xmm3,xmm1,32
+ vpxor xmm1,xmm1,xmm3
+ vpshufb xmm3,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+
+ vaesenc xmm4,xmm4,xmm1
+ vmovdqa XMMWORD[112+rdx],xmm1
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpsllq xmm3,xmm1,32
+ vpxor xmm1,xmm1,xmm3
+ vpshufb xmm3,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+
+ vaesenc xmm4,xmm4,xmm1
+ vmovdqa XMMWORD[128+rdx],xmm1
+
+
+ vmovdqa xmm0,XMMWORD[con2]
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpsllq xmm3,xmm1,32
+ vpxor xmm1,xmm1,xmm3
+ vpshufb xmm3,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+
+ vaesenc xmm4,xmm4,xmm1
+ vmovdqa XMMWORD[144+rdx],xmm1
+
+ vpshufb xmm2,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpsllq xmm3,xmm1,32
+ vpxor xmm1,xmm1,xmm3
+ vpshufb xmm3,xmm1,XMMWORD[con3]
+ vpxor xmm1,xmm1,xmm3
+ vpxor xmm1,xmm1,xmm2
+
+ vaesenclast xmm4,xmm4,xmm1
+ vmovdqa XMMWORD[160+rdx],xmm1
+
+
+ vmovdqa XMMWORD[rsi],xmm4
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_aes128gcmsiv_aes_ks_enc_x1:
+global aes128gcmsiv_kdf
+
+ALIGN 16
+aes128gcmsiv_kdf:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes128gcmsiv_kdf:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+
+
+
+
+ vmovdqa xmm1,XMMWORD[rdx]
+ vmovdqa xmm9,XMMWORD[rdi]
+ vmovdqa xmm12,XMMWORD[and_mask]
+ vmovdqa xmm13,XMMWORD[one]
+ vpshufd xmm9,xmm9,0x90
+ vpand xmm9,xmm9,xmm12
+ vpaddd xmm10,xmm9,xmm13
+ vpaddd xmm11,xmm10,xmm13
+ vpaddd xmm12,xmm11,xmm13
+
+ vpxor xmm9,xmm9,xmm1
+ vpxor xmm10,xmm10,xmm1
+ vpxor xmm11,xmm11,xmm1
+ vpxor xmm12,xmm12,xmm1
+
+ vmovdqa xmm1,XMMWORD[16+rdx]
+ vaesenc xmm9,xmm9,xmm1
+ vaesenc xmm10,xmm10,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+
+ vmovdqa xmm2,XMMWORD[32+rdx]
+ vaesenc xmm9,xmm9,xmm2
+ vaesenc xmm10,xmm10,xmm2
+ vaesenc xmm11,xmm11,xmm2
+ vaesenc xmm12,xmm12,xmm2
+
+ vmovdqa xmm1,XMMWORD[48+rdx]
+ vaesenc xmm9,xmm9,xmm1
+ vaesenc xmm10,xmm10,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+
+ vmovdqa xmm2,XMMWORD[64+rdx]
+ vaesenc xmm9,xmm9,xmm2
+ vaesenc xmm10,xmm10,xmm2
+ vaesenc xmm11,xmm11,xmm2
+ vaesenc xmm12,xmm12,xmm2
+
+ vmovdqa xmm1,XMMWORD[80+rdx]
+ vaesenc xmm9,xmm9,xmm1
+ vaesenc xmm10,xmm10,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+
+ vmovdqa xmm2,XMMWORD[96+rdx]
+ vaesenc xmm9,xmm9,xmm2
+ vaesenc xmm10,xmm10,xmm2
+ vaesenc xmm11,xmm11,xmm2
+ vaesenc xmm12,xmm12,xmm2
+
+ vmovdqa xmm1,XMMWORD[112+rdx]
+ vaesenc xmm9,xmm9,xmm1
+ vaesenc xmm10,xmm10,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+
+ vmovdqa xmm2,XMMWORD[128+rdx]
+ vaesenc xmm9,xmm9,xmm2
+ vaesenc xmm10,xmm10,xmm2
+ vaesenc xmm11,xmm11,xmm2
+ vaesenc xmm12,xmm12,xmm2
+
+ vmovdqa xmm1,XMMWORD[144+rdx]
+ vaesenc xmm9,xmm9,xmm1
+ vaesenc xmm10,xmm10,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+
+ vmovdqa xmm2,XMMWORD[160+rdx]
+ vaesenclast xmm9,xmm9,xmm2
+ vaesenclast xmm10,xmm10,xmm2
+ vaesenclast xmm11,xmm11,xmm2
+ vaesenclast xmm12,xmm12,xmm2
+
+
+ vmovdqa XMMWORD[rsi],xmm9
+ vmovdqa XMMWORD[16+rsi],xmm10
+ vmovdqa XMMWORD[32+rsi],xmm11
+ vmovdqa XMMWORD[48+rsi],xmm12
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_aes128gcmsiv_kdf:
+global aes128gcmsiv_enc_msg_x4
+
+ALIGN 16
+aes128gcmsiv_enc_msg_x4:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes128gcmsiv_enc_msg_x4:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+ test r8,r8
+ jnz NEAR $L$128_enc_msg_x4_start
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$128_enc_msg_x4_start:
+ push r12
+
+ push r13
+
+
+ shr r8,4
+ mov r10,r8
+ shl r10,62
+ shr r10,62
+
+
+ vmovdqa xmm15,XMMWORD[rdx]
+ vpor xmm15,xmm15,XMMWORD[OR_MASK]
+
+ vmovdqu xmm4,XMMWORD[four]
+ vmovdqa xmm0,xmm15
+ vpaddd xmm1,xmm15,XMMWORD[one]
+ vpaddd xmm2,xmm15,XMMWORD[two]
+ vpaddd xmm3,xmm15,XMMWORD[three]
+
+ shr r8,2
+ je NEAR $L$128_enc_msg_x4_check_remainder
+
+ sub rsi,64
+ sub rdi,64
+
+$L$128_enc_msg_x4_loop1:
+ add rsi,64
+ add rdi,64
+
+ vmovdqa xmm5,xmm0
+ vmovdqa xmm6,xmm1
+ vmovdqa xmm7,xmm2
+ vmovdqa xmm8,xmm3
+
+ vpxor xmm5,xmm5,XMMWORD[rcx]
+ vpxor xmm6,xmm6,XMMWORD[rcx]
+ vpxor xmm7,xmm7,XMMWORD[rcx]
+ vpxor xmm8,xmm8,XMMWORD[rcx]
+
+ vmovdqu xmm12,XMMWORD[16+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vpaddd xmm0,xmm0,xmm4
+ vmovdqu xmm12,XMMWORD[32+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vpaddd xmm1,xmm1,xmm4
+ vmovdqu xmm12,XMMWORD[48+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vpaddd xmm2,xmm2,xmm4
+ vmovdqu xmm12,XMMWORD[64+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vpaddd xmm3,xmm3,xmm4
+
+ vmovdqu xmm12,XMMWORD[80+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[96+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[112+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[128+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[144+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[160+rcx]
+ vaesenclast xmm5,xmm5,xmm12
+ vaesenclast xmm6,xmm6,xmm12
+ vaesenclast xmm7,xmm7,xmm12
+ vaesenclast xmm8,xmm8,xmm12
+
+
+
+ vpxor xmm5,xmm5,XMMWORD[rdi]
+ vpxor xmm6,xmm6,XMMWORD[16+rdi]
+ vpxor xmm7,xmm7,XMMWORD[32+rdi]
+ vpxor xmm8,xmm8,XMMWORD[48+rdi]
+
+ sub r8,1
+
+ vmovdqu XMMWORD[rsi],xmm5
+ vmovdqu XMMWORD[16+rsi],xmm6
+ vmovdqu XMMWORD[32+rsi],xmm7
+ vmovdqu XMMWORD[48+rsi],xmm8
+
+ jne NEAR $L$128_enc_msg_x4_loop1
+
+ add rsi,64
+ add rdi,64
+
+$L$128_enc_msg_x4_check_remainder:
+ cmp r10,0
+ je NEAR $L$128_enc_msg_x4_out
+
+$L$128_enc_msg_x4_loop2:
+
+
+ vmovdqa xmm5,xmm0
+ vpaddd xmm0,xmm0,XMMWORD[one]
+
+ vpxor xmm5,xmm5,XMMWORD[rcx]
+ vaesenc xmm5,xmm5,XMMWORD[16+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[32+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[48+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[64+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[80+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[96+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[112+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[128+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[144+rcx]
+ vaesenclast xmm5,xmm5,XMMWORD[160+rcx]
+
+
+ vpxor xmm5,xmm5,XMMWORD[rdi]
+ vmovdqu XMMWORD[rsi],xmm5
+
+ add rdi,16
+ add rsi,16
+
+ sub r10,1
+ jne NEAR $L$128_enc_msg_x4_loop2
+
+$L$128_enc_msg_x4_out:
+ pop r13
+
+ pop r12
+
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_aes128gcmsiv_enc_msg_x4:
+global aes128gcmsiv_enc_msg_x8
+
+ALIGN 16
+aes128gcmsiv_enc_msg_x8:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes128gcmsiv_enc_msg_x8:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+ test r8,r8
+ jnz NEAR $L$128_enc_msg_x8_start
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$128_enc_msg_x8_start:
+ push r12
+
+ push r13
+
+ push rbp
+
+ mov rbp,rsp
+
+
+
+ sub rsp,128
+ and rsp,-64
+
+ shr r8,4
+ mov r10,r8
+ shl r10,61
+ shr r10,61
+
+
+ vmovdqu xmm1,XMMWORD[rdx]
+ vpor xmm1,xmm1,XMMWORD[OR_MASK]
+
+
+ vpaddd xmm0,xmm1,XMMWORD[seven]
+ vmovdqu XMMWORD[rsp],xmm0
+ vpaddd xmm9,xmm1,XMMWORD[one]
+ vpaddd xmm10,xmm1,XMMWORD[two]
+ vpaddd xmm11,xmm1,XMMWORD[three]
+ vpaddd xmm12,xmm1,XMMWORD[four]
+ vpaddd xmm13,xmm1,XMMWORD[five]
+ vpaddd xmm14,xmm1,XMMWORD[six]
+ vmovdqa xmm0,xmm1
+
+ shr r8,3
+ je NEAR $L$128_enc_msg_x8_check_remainder
+
+ sub rsi,128
+ sub rdi,128
+
+$L$128_enc_msg_x8_loop1:
+ add rsi,128
+ add rdi,128
+
+ vmovdqa xmm1,xmm0
+ vmovdqa xmm2,xmm9
+ vmovdqa xmm3,xmm10
+ vmovdqa xmm4,xmm11
+ vmovdqa xmm5,xmm12
+ vmovdqa xmm6,xmm13
+ vmovdqa xmm7,xmm14
+
+ vmovdqu xmm8,XMMWORD[rsp]
+
+ vpxor xmm1,xmm1,XMMWORD[rcx]
+ vpxor xmm2,xmm2,XMMWORD[rcx]
+ vpxor xmm3,xmm3,XMMWORD[rcx]
+ vpxor xmm4,xmm4,XMMWORD[rcx]
+ vpxor xmm5,xmm5,XMMWORD[rcx]
+ vpxor xmm6,xmm6,XMMWORD[rcx]
+ vpxor xmm7,xmm7,XMMWORD[rcx]
+ vpxor xmm8,xmm8,XMMWORD[rcx]
+
+ vmovdqu xmm15,XMMWORD[16+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vmovdqu xmm14,XMMWORD[rsp]
+ vpaddd xmm14,xmm14,XMMWORD[eight]
+ vmovdqu XMMWORD[rsp],xmm14
+ vmovdqu xmm15,XMMWORD[32+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpsubd xmm14,xmm14,XMMWORD[one]
+ vmovdqu xmm15,XMMWORD[48+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm0,xmm0,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[64+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm9,xmm9,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[80+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm10,xmm10,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[96+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm11,xmm11,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[112+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm12,xmm12,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[128+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm13,xmm13,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[144+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vmovdqu xmm15,XMMWORD[160+rcx]
+ vaesenclast xmm1,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm15
+ vaesenclast xmm3,xmm3,xmm15
+ vaesenclast xmm4,xmm4,xmm15
+ vaesenclast xmm5,xmm5,xmm15
+ vaesenclast xmm6,xmm6,xmm15
+ vaesenclast xmm7,xmm7,xmm15
+ vaesenclast xmm8,xmm8,xmm15
+
+
+
+ vpxor xmm1,xmm1,XMMWORD[rdi]
+ vpxor xmm2,xmm2,XMMWORD[16+rdi]
+ vpxor xmm3,xmm3,XMMWORD[32+rdi]
+ vpxor xmm4,xmm4,XMMWORD[48+rdi]
+ vpxor xmm5,xmm5,XMMWORD[64+rdi]
+ vpxor xmm6,xmm6,XMMWORD[80+rdi]
+ vpxor xmm7,xmm7,XMMWORD[96+rdi]
+ vpxor xmm8,xmm8,XMMWORD[112+rdi]
+
+ dec r8
+
+ vmovdqu XMMWORD[rsi],xmm1
+ vmovdqu XMMWORD[16+rsi],xmm2
+ vmovdqu XMMWORD[32+rsi],xmm3
+ vmovdqu XMMWORD[48+rsi],xmm4
+ vmovdqu XMMWORD[64+rsi],xmm5
+ vmovdqu XMMWORD[80+rsi],xmm6
+ vmovdqu XMMWORD[96+rsi],xmm7
+ vmovdqu XMMWORD[112+rsi],xmm8
+
+ jne NEAR $L$128_enc_msg_x8_loop1
+
+ add rsi,128
+ add rdi,128
+
+$L$128_enc_msg_x8_check_remainder:
+ cmp r10,0
+ je NEAR $L$128_enc_msg_x8_out
+
+$L$128_enc_msg_x8_loop2:
+
+
+ vmovdqa xmm1,xmm0
+ vpaddd xmm0,xmm0,XMMWORD[one]
+
+ vpxor xmm1,xmm1,XMMWORD[rcx]
+ vaesenc xmm1,xmm1,XMMWORD[16+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[32+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[48+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[64+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[80+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[96+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[112+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[128+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[144+rcx]
+ vaesenclast xmm1,xmm1,XMMWORD[160+rcx]
+
+
+ vpxor xmm1,xmm1,XMMWORD[rdi]
+
+ vmovdqu XMMWORD[rsi],xmm1
+
+ add rdi,16
+ add rsi,16
+
+ dec r10
+ jne NEAR $L$128_enc_msg_x8_loop2
+
+$L$128_enc_msg_x8_out:
+ mov rsp,rbp
+
+ pop rbp
+
+ pop r13
+
+ pop r12
+
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_aes128gcmsiv_enc_msg_x8:
+global aes128gcmsiv_dec
+
+ALIGN 16
+aes128gcmsiv_dec:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes128gcmsiv_dec:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+ mov r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+ test r9,~15
+ jnz NEAR $L$128_dec_start
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$128_dec_start:
+ vzeroupper
+ vmovdqa xmm0,XMMWORD[rdx]
+
+
+ vmovdqu xmm15,XMMWORD[16+rdx]
+ vpor xmm15,xmm15,XMMWORD[OR_MASK]
+ mov rax,rdx
+
+ lea rax,[32+rax]
+ lea rcx,[32+rcx]
+
+ and r9,~15
+
+
+ cmp r9,96
+ jb NEAR $L$128_dec_loop2
+
+
+ sub r9,96
+ vmovdqa xmm7,xmm15
+ vpaddd xmm8,xmm7,XMMWORD[one]
+ vpaddd xmm9,xmm7,XMMWORD[two]
+ vpaddd xmm10,xmm9,XMMWORD[one]
+ vpaddd xmm11,xmm9,XMMWORD[two]
+ vpaddd xmm12,xmm11,XMMWORD[one]
+ vpaddd xmm15,xmm11,XMMWORD[two]
+
+ vpxor xmm7,xmm7,XMMWORD[r8]
+ vpxor xmm8,xmm8,XMMWORD[r8]
+ vpxor xmm9,xmm9,XMMWORD[r8]
+ vpxor xmm10,xmm10,XMMWORD[r8]
+ vpxor xmm11,xmm11,XMMWORD[r8]
+ vpxor xmm12,xmm12,XMMWORD[r8]
+
+ vmovdqu xmm4,XMMWORD[16+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[32+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[48+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[64+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[80+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[96+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[112+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[128+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[144+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[160+r8]
+ vaesenclast xmm7,xmm7,xmm4
+ vaesenclast xmm8,xmm8,xmm4
+ vaesenclast xmm9,xmm9,xmm4
+ vaesenclast xmm10,xmm10,xmm4
+ vaesenclast xmm11,xmm11,xmm4
+ vaesenclast xmm12,xmm12,xmm4
+
+
+ vpxor xmm7,xmm7,XMMWORD[rdi]
+ vpxor xmm8,xmm8,XMMWORD[16+rdi]
+ vpxor xmm9,xmm9,XMMWORD[32+rdi]
+ vpxor xmm10,xmm10,XMMWORD[48+rdi]
+ vpxor xmm11,xmm11,XMMWORD[64+rdi]
+ vpxor xmm12,xmm12,XMMWORD[80+rdi]
+
+ vmovdqu XMMWORD[rsi],xmm7
+ vmovdqu XMMWORD[16+rsi],xmm8
+ vmovdqu XMMWORD[32+rsi],xmm9
+ vmovdqu XMMWORD[48+rsi],xmm10
+ vmovdqu XMMWORD[64+rsi],xmm11
+ vmovdqu XMMWORD[80+rsi],xmm12
+
+ add rdi,96
+ add rsi,96
+ jmp NEAR $L$128_dec_loop1
+
+
+ALIGN 64
+$L$128_dec_loop1:
+ cmp r9,96
+ jb NEAR $L$128_dec_finish_96
+ sub r9,96
+
+ vmovdqa xmm6,xmm12
+ vmovdqa XMMWORD[(16-32)+rax],xmm11
+ vmovdqa XMMWORD[(32-32)+rax],xmm10
+ vmovdqa XMMWORD[(48-32)+rax],xmm9
+ vmovdqa XMMWORD[(64-32)+rax],xmm8
+ vmovdqa XMMWORD[(80-32)+rax],xmm7
+
+ vmovdqa xmm7,xmm15
+ vpaddd xmm8,xmm7,XMMWORD[one]
+ vpaddd xmm9,xmm7,XMMWORD[two]
+ vpaddd xmm10,xmm9,XMMWORD[one]
+ vpaddd xmm11,xmm9,XMMWORD[two]
+ vpaddd xmm12,xmm11,XMMWORD[one]
+ vpaddd xmm15,xmm11,XMMWORD[two]
+
+ vmovdqa xmm4,XMMWORD[r8]
+ vpxor xmm7,xmm7,xmm4
+ vpxor xmm8,xmm8,xmm4
+ vpxor xmm9,xmm9,xmm4
+ vpxor xmm10,xmm10,xmm4
+ vpxor xmm11,xmm11,xmm4
+ vpxor xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[((0-32))+rcx]
+ vpclmulqdq xmm2,xmm6,xmm4,0x11
+ vpclmulqdq xmm3,xmm6,xmm4,0x00
+ vpclmulqdq xmm1,xmm6,xmm4,0x01
+ vpclmulqdq xmm4,xmm6,xmm4,0x10
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm4,XMMWORD[16+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm6,XMMWORD[((-16))+rax]
+ vmovdqu xmm13,XMMWORD[((-16))+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+
+ vmovdqu xmm4,XMMWORD[32+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm6,XMMWORD[rax]
+ vmovdqu xmm13,XMMWORD[rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+
+ vmovdqu xmm4,XMMWORD[48+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm6,XMMWORD[16+rax]
+ vmovdqu xmm13,XMMWORD[16+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+
+ vmovdqu xmm4,XMMWORD[64+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm6,XMMWORD[32+rax]
+ vmovdqu xmm13,XMMWORD[32+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+
+ vmovdqu xmm4,XMMWORD[80+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[96+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[112+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+
+ vmovdqa xmm6,XMMWORD[((80-32))+rax]
+ vpxor xmm6,xmm6,xmm0
+ vmovdqu xmm5,XMMWORD[((80-32))+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm5,0x01
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x10
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm4,XMMWORD[128+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+
+ vpsrldq xmm4,xmm1,8
+ vpxor xmm5,xmm2,xmm4
+ vpslldq xmm4,xmm1,8
+ vpxor xmm0,xmm3,xmm4
+
+ vmovdqa xmm3,XMMWORD[poly]
+
+ vmovdqu xmm4,XMMWORD[144+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm6,XMMWORD[160+r8]
+ vpalignr xmm2,xmm0,xmm0,8
+ vpclmulqdq xmm0,xmm0,xmm3,0x10
+ vpxor xmm0,xmm2,xmm0
+
+ vpxor xmm4,xmm6,XMMWORD[rdi]
+ vaesenclast xmm7,xmm7,xmm4
+ vpxor xmm4,xmm6,XMMWORD[16+rdi]
+ vaesenclast xmm8,xmm8,xmm4
+ vpxor xmm4,xmm6,XMMWORD[32+rdi]
+ vaesenclast xmm9,xmm9,xmm4
+ vpxor xmm4,xmm6,XMMWORD[48+rdi]
+ vaesenclast xmm10,xmm10,xmm4
+ vpxor xmm4,xmm6,XMMWORD[64+rdi]
+ vaesenclast xmm11,xmm11,xmm4
+ vpxor xmm4,xmm6,XMMWORD[80+rdi]
+ vaesenclast xmm12,xmm12,xmm4
+
+ vpalignr xmm2,xmm0,xmm0,8
+ vpclmulqdq xmm0,xmm0,xmm3,0x10
+ vpxor xmm0,xmm2,xmm0
+
+ vmovdqu XMMWORD[rsi],xmm7
+ vmovdqu XMMWORD[16+rsi],xmm8
+ vmovdqu XMMWORD[32+rsi],xmm9
+ vmovdqu XMMWORD[48+rsi],xmm10
+ vmovdqu XMMWORD[64+rsi],xmm11
+ vmovdqu XMMWORD[80+rsi],xmm12
+
+ vpxor xmm0,xmm0,xmm5
+
+ lea rdi,[96+rdi]
+ lea rsi,[96+rsi]
+ jmp NEAR $L$128_dec_loop1
+
+$L$128_dec_finish_96:
+ vmovdqa xmm6,xmm12
+ vmovdqa XMMWORD[(16-32)+rax],xmm11
+ vmovdqa XMMWORD[(32-32)+rax],xmm10
+ vmovdqa XMMWORD[(48-32)+rax],xmm9
+ vmovdqa XMMWORD[(64-32)+rax],xmm8
+ vmovdqa XMMWORD[(80-32)+rax],xmm7
+
+ vmovdqu xmm4,XMMWORD[((0-32))+rcx]
+ vpclmulqdq xmm1,xmm6,xmm4,0x10
+ vpclmulqdq xmm2,xmm6,xmm4,0x11
+ vpclmulqdq xmm3,xmm6,xmm4,0x00
+ vpclmulqdq xmm4,xmm6,xmm4,0x01
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm6,XMMWORD[((-16))+rax]
+ vmovdqu xmm13,XMMWORD[((-16))+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm6,XMMWORD[rax]
+ vmovdqu xmm13,XMMWORD[rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm6,XMMWORD[16+rax]
+ vmovdqu xmm13,XMMWORD[16+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm6,XMMWORD[32+rax]
+ vmovdqu xmm13,XMMWORD[32+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+
+ vmovdqu xmm6,XMMWORD[((80-32))+rax]
+ vpxor xmm6,xmm6,xmm0
+ vmovdqu xmm5,XMMWORD[((80-32))+rcx]
+ vpclmulqdq xmm4,xmm6,xmm5,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x01
+ vpxor xmm1,xmm1,xmm4
+
+ vpsrldq xmm4,xmm1,8
+ vpxor xmm5,xmm2,xmm4
+ vpslldq xmm4,xmm1,8
+ vpxor xmm0,xmm3,xmm4
+
+ vmovdqa xmm3,XMMWORD[poly]
+
+ vpalignr xmm2,xmm0,xmm0,8
+ vpclmulqdq xmm0,xmm0,xmm3,0x10
+ vpxor xmm0,xmm2,xmm0
+
+ vpalignr xmm2,xmm0,xmm0,8
+ vpclmulqdq xmm0,xmm0,xmm3,0x10
+ vpxor xmm0,xmm2,xmm0
+
+ vpxor xmm0,xmm0,xmm5
+
+$L$128_dec_loop2:
+
+
+
+ cmp r9,16
+ jb NEAR $L$128_dec_out
+ sub r9,16
+
+ vmovdqa xmm2,xmm15
+ vpaddd xmm15,xmm15,XMMWORD[one]
+
+ vpxor xmm2,xmm2,XMMWORD[r8]
+ vaesenc xmm2,xmm2,XMMWORD[16+r8]
+ vaesenc xmm2,xmm2,XMMWORD[32+r8]
+ vaesenc xmm2,xmm2,XMMWORD[48+r8]
+ vaesenc xmm2,xmm2,XMMWORD[64+r8]
+ vaesenc xmm2,xmm2,XMMWORD[80+r8]
+ vaesenc xmm2,xmm2,XMMWORD[96+r8]
+ vaesenc xmm2,xmm2,XMMWORD[112+r8]
+ vaesenc xmm2,xmm2,XMMWORD[128+r8]
+ vaesenc xmm2,xmm2,XMMWORD[144+r8]
+ vaesenclast xmm2,xmm2,XMMWORD[160+r8]
+ vpxor xmm2,xmm2,XMMWORD[rdi]
+ vmovdqu XMMWORD[rsi],xmm2
+ add rdi,16
+ add rsi,16
+
+ vpxor xmm0,xmm0,xmm2
+ vmovdqa xmm1,XMMWORD[((-32))+rcx]
+ call GFMUL
+
+ jmp NEAR $L$128_dec_loop2
+
+$L$128_dec_out:
+ vmovdqu XMMWORD[rdx],xmm0
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_aes128gcmsiv_dec:
+global aes128gcmsiv_ecb_enc_block
+
+ALIGN 16
+aes128gcmsiv_ecb_enc_block:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes128gcmsiv_ecb_enc_block:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ vmovdqa xmm1,XMMWORD[rdi]
+
+ vpxor xmm1,xmm1,XMMWORD[rdx]
+ vaesenc xmm1,xmm1,XMMWORD[16+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[32+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[48+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[64+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[80+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[96+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[112+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[128+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[144+rdx]
+ vaesenclast xmm1,xmm1,XMMWORD[160+rdx]
+
+ vmovdqa XMMWORD[rsi],xmm1
+
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_aes128gcmsiv_ecb_enc_block:
+global aes256gcmsiv_aes_ks_enc_x1
+
+ALIGN 16
+aes256gcmsiv_aes_ks_enc_x1:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes256gcmsiv_aes_ks_enc_x1:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+
+
+
+_CET_ENDBR
+ vmovdqa xmm0,XMMWORD[con1]
+ vmovdqa xmm15,XMMWORD[mask]
+ vmovdqa xmm8,XMMWORD[rdi]
+ vmovdqa xmm1,XMMWORD[rcx]
+ vmovdqa xmm3,XMMWORD[16+rcx]
+ vpxor xmm8,xmm8,xmm1
+ vaesenc xmm8,xmm8,xmm3
+ vmovdqu XMMWORD[rdx],xmm1
+ vmovdqu XMMWORD[16+rdx],xmm3
+ vpxor xmm14,xmm14,xmm14
+
+ vpshufb xmm2,xmm3,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpslldq xmm4,xmm1,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpxor xmm1,xmm1,xmm2
+ vaesenc xmm8,xmm8,xmm1
+ vmovdqu XMMWORD[32+rdx],xmm1
+
+ vpshufd xmm2,xmm1,0xff
+ vaesenclast xmm2,xmm2,xmm14
+ vpslldq xmm4,xmm3,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpxor xmm3,xmm3,xmm2
+ vaesenc xmm8,xmm8,xmm3
+ vmovdqu XMMWORD[48+rdx],xmm3
+
+ vpshufb xmm2,xmm3,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpslldq xmm4,xmm1,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpxor xmm1,xmm1,xmm2
+ vaesenc xmm8,xmm8,xmm1
+ vmovdqu XMMWORD[64+rdx],xmm1
+
+ vpshufd xmm2,xmm1,0xff
+ vaesenclast xmm2,xmm2,xmm14
+ vpslldq xmm4,xmm3,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpxor xmm3,xmm3,xmm2
+ vaesenc xmm8,xmm8,xmm3
+ vmovdqu XMMWORD[80+rdx],xmm3
+
+ vpshufb xmm2,xmm3,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpslldq xmm4,xmm1,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpxor xmm1,xmm1,xmm2
+ vaesenc xmm8,xmm8,xmm1
+ vmovdqu XMMWORD[96+rdx],xmm1
+
+ vpshufd xmm2,xmm1,0xff
+ vaesenclast xmm2,xmm2,xmm14
+ vpslldq xmm4,xmm3,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpxor xmm3,xmm3,xmm2
+ vaesenc xmm8,xmm8,xmm3
+ vmovdqu XMMWORD[112+rdx],xmm3
+
+ vpshufb xmm2,xmm3,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpslldq xmm4,xmm1,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpxor xmm1,xmm1,xmm2
+ vaesenc xmm8,xmm8,xmm1
+ vmovdqu XMMWORD[128+rdx],xmm1
+
+ vpshufd xmm2,xmm1,0xff
+ vaesenclast xmm2,xmm2,xmm14
+ vpslldq xmm4,xmm3,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpxor xmm3,xmm3,xmm2
+ vaesenc xmm8,xmm8,xmm3
+ vmovdqu XMMWORD[144+rdx],xmm3
+
+ vpshufb xmm2,xmm3,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpslldq xmm4,xmm1,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpxor xmm1,xmm1,xmm2
+ vaesenc xmm8,xmm8,xmm1
+ vmovdqu XMMWORD[160+rdx],xmm1
+
+ vpshufd xmm2,xmm1,0xff
+ vaesenclast xmm2,xmm2,xmm14
+ vpslldq xmm4,xmm3,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpxor xmm3,xmm3,xmm2
+ vaesenc xmm8,xmm8,xmm3
+ vmovdqu XMMWORD[176+rdx],xmm3
+
+ vpshufb xmm2,xmm3,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslld xmm0,xmm0,1
+ vpslldq xmm4,xmm1,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpxor xmm1,xmm1,xmm2
+ vaesenc xmm8,xmm8,xmm1
+ vmovdqu XMMWORD[192+rdx],xmm1
+
+ vpshufd xmm2,xmm1,0xff
+ vaesenclast xmm2,xmm2,xmm14
+ vpslldq xmm4,xmm3,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm3,xmm3,xmm4
+ vpxor xmm3,xmm3,xmm2
+ vaesenc xmm8,xmm8,xmm3
+ vmovdqu XMMWORD[208+rdx],xmm3
+
+ vpshufb xmm2,xmm3,xmm15
+ vaesenclast xmm2,xmm2,xmm0
+ vpslldq xmm4,xmm1,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpslldq xmm4,xmm4,4
+ vpxor xmm1,xmm1,xmm4
+ vpxor xmm1,xmm1,xmm2
+ vaesenclast xmm8,xmm8,xmm1
+ vmovdqu XMMWORD[224+rdx],xmm1
+
+ vmovdqa XMMWORD[rsi],xmm8
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_aes256gcmsiv_aes_ks_enc_x1:
+global aes256gcmsiv_ecb_enc_block
+
+ALIGN 16
+aes256gcmsiv_ecb_enc_block:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes256gcmsiv_ecb_enc_block:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+ vmovdqa xmm1,XMMWORD[rdi]
+ vpxor xmm1,xmm1,XMMWORD[rdx]
+ vaesenc xmm1,xmm1,XMMWORD[16+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[32+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[48+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[64+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[80+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[96+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[112+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[128+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[144+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[160+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[176+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[192+rdx]
+ vaesenc xmm1,xmm1,XMMWORD[208+rdx]
+ vaesenclast xmm1,xmm1,XMMWORD[224+rdx]
+ vmovdqa XMMWORD[rsi],xmm1
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_aes256gcmsiv_ecb_enc_block:
+global aes256gcmsiv_enc_msg_x4
+
+ALIGN 16
+aes256gcmsiv_enc_msg_x4:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes256gcmsiv_enc_msg_x4:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+ test r8,r8
+ jnz NEAR $L$256_enc_msg_x4_start
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$256_enc_msg_x4_start:
+ mov r10,r8
+ shr r8,4
+ shl r10,60
+ jz NEAR $L$256_enc_msg_x4_start2
+ add r8,1
+
+$L$256_enc_msg_x4_start2:
+ mov r10,r8
+ shl r10,62
+ shr r10,62
+
+
+ vmovdqa xmm15,XMMWORD[rdx]
+ vpor xmm15,xmm15,XMMWORD[OR_MASK]
+
+ vmovdqa xmm4,XMMWORD[four]
+ vmovdqa xmm0,xmm15
+ vpaddd xmm1,xmm15,XMMWORD[one]
+ vpaddd xmm2,xmm15,XMMWORD[two]
+ vpaddd xmm3,xmm15,XMMWORD[three]
+
+ shr r8,2
+ je NEAR $L$256_enc_msg_x4_check_remainder
+
+ sub rsi,64
+ sub rdi,64
+
+$L$256_enc_msg_x4_loop1:
+ add rsi,64
+ add rdi,64
+
+ vmovdqa xmm5,xmm0
+ vmovdqa xmm6,xmm1
+ vmovdqa xmm7,xmm2
+ vmovdqa xmm8,xmm3
+
+ vpxor xmm5,xmm5,XMMWORD[rcx]
+ vpxor xmm6,xmm6,XMMWORD[rcx]
+ vpxor xmm7,xmm7,XMMWORD[rcx]
+ vpxor xmm8,xmm8,XMMWORD[rcx]
+
+ vmovdqu xmm12,XMMWORD[16+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vpaddd xmm0,xmm0,xmm4
+ vmovdqu xmm12,XMMWORD[32+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vpaddd xmm1,xmm1,xmm4
+ vmovdqu xmm12,XMMWORD[48+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vpaddd xmm2,xmm2,xmm4
+ vmovdqu xmm12,XMMWORD[64+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vpaddd xmm3,xmm3,xmm4
+
+ vmovdqu xmm12,XMMWORD[80+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[96+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[112+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[128+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[144+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[160+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[176+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[192+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[208+rcx]
+ vaesenc xmm5,xmm5,xmm12
+ vaesenc xmm6,xmm6,xmm12
+ vaesenc xmm7,xmm7,xmm12
+ vaesenc xmm8,xmm8,xmm12
+
+ vmovdqu xmm12,XMMWORD[224+rcx]
+ vaesenclast xmm5,xmm5,xmm12
+ vaesenclast xmm6,xmm6,xmm12
+ vaesenclast xmm7,xmm7,xmm12
+ vaesenclast xmm8,xmm8,xmm12
+
+
+
+ vpxor xmm5,xmm5,XMMWORD[rdi]
+ vpxor xmm6,xmm6,XMMWORD[16+rdi]
+ vpxor xmm7,xmm7,XMMWORD[32+rdi]
+ vpxor xmm8,xmm8,XMMWORD[48+rdi]
+
+ sub r8,1
+
+ vmovdqu XMMWORD[rsi],xmm5
+ vmovdqu XMMWORD[16+rsi],xmm6
+ vmovdqu XMMWORD[32+rsi],xmm7
+ vmovdqu XMMWORD[48+rsi],xmm8
+
+ jne NEAR $L$256_enc_msg_x4_loop1
+
+ add rsi,64
+ add rdi,64
+
+$L$256_enc_msg_x4_check_remainder:
+ cmp r10,0
+ je NEAR $L$256_enc_msg_x4_out
+
+$L$256_enc_msg_x4_loop2:
+
+
+
+ vmovdqa xmm5,xmm0
+ vpaddd xmm0,xmm0,XMMWORD[one]
+ vpxor xmm5,xmm5,XMMWORD[rcx]
+ vaesenc xmm5,xmm5,XMMWORD[16+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[32+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[48+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[64+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[80+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[96+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[112+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[128+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[144+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[160+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[176+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[192+rcx]
+ vaesenc xmm5,xmm5,XMMWORD[208+rcx]
+ vaesenclast xmm5,xmm5,XMMWORD[224+rcx]
+
+
+ vpxor xmm5,xmm5,XMMWORD[rdi]
+
+ vmovdqu XMMWORD[rsi],xmm5
+
+ add rdi,16
+ add rsi,16
+
+ sub r10,1
+ jne NEAR $L$256_enc_msg_x4_loop2
+
+$L$256_enc_msg_x4_out:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_aes256gcmsiv_enc_msg_x4:
+global aes256gcmsiv_enc_msg_x8
+
+ALIGN 16
+aes256gcmsiv_enc_msg_x8:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes256gcmsiv_enc_msg_x8:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+ test r8,r8
+ jnz NEAR $L$256_enc_msg_x8_start
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$256_enc_msg_x8_start:
+
+ mov r11,rsp
+ sub r11,16
+ and r11,-64
+
+ mov r10,r8
+ shr r8,4
+ shl r10,60
+ jz NEAR $L$256_enc_msg_x8_start2
+ add r8,1
+
+$L$256_enc_msg_x8_start2:
+ mov r10,r8
+ shl r10,61
+ shr r10,61
+
+
+ vmovdqa xmm1,XMMWORD[rdx]
+ vpor xmm1,xmm1,XMMWORD[OR_MASK]
+
+
+ vpaddd xmm0,xmm1,XMMWORD[seven]
+ vmovdqa XMMWORD[r11],xmm0
+ vpaddd xmm9,xmm1,XMMWORD[one]
+ vpaddd xmm10,xmm1,XMMWORD[two]
+ vpaddd xmm11,xmm1,XMMWORD[three]
+ vpaddd xmm12,xmm1,XMMWORD[four]
+ vpaddd xmm13,xmm1,XMMWORD[five]
+ vpaddd xmm14,xmm1,XMMWORD[six]
+ vmovdqa xmm0,xmm1
+
+ shr r8,3
+ jz NEAR $L$256_enc_msg_x8_check_remainder
+
+ sub rsi,128
+ sub rdi,128
+
+$L$256_enc_msg_x8_loop1:
+ add rsi,128
+ add rdi,128
+
+ vmovdqa xmm1,xmm0
+ vmovdqa xmm2,xmm9
+ vmovdqa xmm3,xmm10
+ vmovdqa xmm4,xmm11
+ vmovdqa xmm5,xmm12
+ vmovdqa xmm6,xmm13
+ vmovdqa xmm7,xmm14
+
+ vmovdqa xmm8,XMMWORD[r11]
+
+ vpxor xmm1,xmm1,XMMWORD[rcx]
+ vpxor xmm2,xmm2,XMMWORD[rcx]
+ vpxor xmm3,xmm3,XMMWORD[rcx]
+ vpxor xmm4,xmm4,XMMWORD[rcx]
+ vpxor xmm5,xmm5,XMMWORD[rcx]
+ vpxor xmm6,xmm6,XMMWORD[rcx]
+ vpxor xmm7,xmm7,XMMWORD[rcx]
+ vpxor xmm8,xmm8,XMMWORD[rcx]
+
+ vmovdqu xmm15,XMMWORD[16+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vmovdqa xmm14,XMMWORD[r11]
+ vpaddd xmm14,xmm14,XMMWORD[eight]
+ vmovdqa XMMWORD[r11],xmm14
+ vmovdqu xmm15,XMMWORD[32+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpsubd xmm14,xmm14,XMMWORD[one]
+ vmovdqu xmm15,XMMWORD[48+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm0,xmm0,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[64+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm9,xmm9,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[80+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm10,xmm10,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[96+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm11,xmm11,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[112+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm12,xmm12,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[128+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vpaddd xmm13,xmm13,XMMWORD[eight]
+ vmovdqu xmm15,XMMWORD[144+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vmovdqu xmm15,XMMWORD[160+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vmovdqu xmm15,XMMWORD[176+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vmovdqu xmm15,XMMWORD[192+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vmovdqu xmm15,XMMWORD[208+rcx]
+ vaesenc xmm1,xmm1,xmm15
+ vaesenc xmm2,xmm2,xmm15
+ vaesenc xmm3,xmm3,xmm15
+ vaesenc xmm4,xmm4,xmm15
+ vaesenc xmm5,xmm5,xmm15
+ vaesenc xmm6,xmm6,xmm15
+ vaesenc xmm7,xmm7,xmm15
+ vaesenc xmm8,xmm8,xmm15
+
+ vmovdqu xmm15,XMMWORD[224+rcx]
+ vaesenclast xmm1,xmm1,xmm15
+ vaesenclast xmm2,xmm2,xmm15
+ vaesenclast xmm3,xmm3,xmm15
+ vaesenclast xmm4,xmm4,xmm15
+ vaesenclast xmm5,xmm5,xmm15
+ vaesenclast xmm6,xmm6,xmm15
+ vaesenclast xmm7,xmm7,xmm15
+ vaesenclast xmm8,xmm8,xmm15
+
+
+
+ vpxor xmm1,xmm1,XMMWORD[rdi]
+ vpxor xmm2,xmm2,XMMWORD[16+rdi]
+ vpxor xmm3,xmm3,XMMWORD[32+rdi]
+ vpxor xmm4,xmm4,XMMWORD[48+rdi]
+ vpxor xmm5,xmm5,XMMWORD[64+rdi]
+ vpxor xmm6,xmm6,XMMWORD[80+rdi]
+ vpxor xmm7,xmm7,XMMWORD[96+rdi]
+ vpxor xmm8,xmm8,XMMWORD[112+rdi]
+
+ sub r8,1
+
+ vmovdqu XMMWORD[rsi],xmm1
+ vmovdqu XMMWORD[16+rsi],xmm2
+ vmovdqu XMMWORD[32+rsi],xmm3
+ vmovdqu XMMWORD[48+rsi],xmm4
+ vmovdqu XMMWORD[64+rsi],xmm5
+ vmovdqu XMMWORD[80+rsi],xmm6
+ vmovdqu XMMWORD[96+rsi],xmm7
+ vmovdqu XMMWORD[112+rsi],xmm8
+
+ jne NEAR $L$256_enc_msg_x8_loop1
+
+ add rsi,128
+ add rdi,128
+
+$L$256_enc_msg_x8_check_remainder:
+ cmp r10,0
+ je NEAR $L$256_enc_msg_x8_out
+
+$L$256_enc_msg_x8_loop2:
+
+
+ vmovdqa xmm1,xmm0
+ vpaddd xmm0,xmm0,XMMWORD[one]
+
+ vpxor xmm1,xmm1,XMMWORD[rcx]
+ vaesenc xmm1,xmm1,XMMWORD[16+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[32+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[48+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[64+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[80+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[96+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[112+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[128+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[144+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[160+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[176+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[192+rcx]
+ vaesenc xmm1,xmm1,XMMWORD[208+rcx]
+ vaesenclast xmm1,xmm1,XMMWORD[224+rcx]
+
+
+ vpxor xmm1,xmm1,XMMWORD[rdi]
+
+ vmovdqu XMMWORD[rsi],xmm1
+
+ add rdi,16
+ add rsi,16
+ sub r10,1
+ jnz NEAR $L$256_enc_msg_x8_loop2
+
+$L$256_enc_msg_x8_out:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+
+$L$SEH_end_aes256gcmsiv_enc_msg_x8:
+global aes256gcmsiv_dec
+
+ALIGN 16
+aes256gcmsiv_dec:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes256gcmsiv_dec:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+ mov r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+ test r9,~15
+ jnz NEAR $L$256_dec_start
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$256_dec_start:
+ vzeroupper
+ vmovdqa xmm0,XMMWORD[rdx]
+
+
+ vmovdqu xmm15,XMMWORD[16+rdx]
+ vpor xmm15,xmm15,XMMWORD[OR_MASK]
+ mov rax,rdx
+
+ lea rax,[32+rax]
+ lea rcx,[32+rcx]
+
+ and r9,~15
+
+
+ cmp r9,96
+ jb NEAR $L$256_dec_loop2
+
+
+ sub r9,96
+ vmovdqa xmm7,xmm15
+ vpaddd xmm8,xmm7,XMMWORD[one]
+ vpaddd xmm9,xmm7,XMMWORD[two]
+ vpaddd xmm10,xmm9,XMMWORD[one]
+ vpaddd xmm11,xmm9,XMMWORD[two]
+ vpaddd xmm12,xmm11,XMMWORD[one]
+ vpaddd xmm15,xmm11,XMMWORD[two]
+
+ vpxor xmm7,xmm7,XMMWORD[r8]
+ vpxor xmm8,xmm8,XMMWORD[r8]
+ vpxor xmm9,xmm9,XMMWORD[r8]
+ vpxor xmm10,xmm10,XMMWORD[r8]
+ vpxor xmm11,xmm11,XMMWORD[r8]
+ vpxor xmm12,xmm12,XMMWORD[r8]
+
+ vmovdqu xmm4,XMMWORD[16+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[32+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[48+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[64+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[80+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[96+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[112+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[128+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[144+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[160+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[176+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[192+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[208+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[224+r8]
+ vaesenclast xmm7,xmm7,xmm4
+ vaesenclast xmm8,xmm8,xmm4
+ vaesenclast xmm9,xmm9,xmm4
+ vaesenclast xmm10,xmm10,xmm4
+ vaesenclast xmm11,xmm11,xmm4
+ vaesenclast xmm12,xmm12,xmm4
+
+
+ vpxor xmm7,xmm7,XMMWORD[rdi]
+ vpxor xmm8,xmm8,XMMWORD[16+rdi]
+ vpxor xmm9,xmm9,XMMWORD[32+rdi]
+ vpxor xmm10,xmm10,XMMWORD[48+rdi]
+ vpxor xmm11,xmm11,XMMWORD[64+rdi]
+ vpxor xmm12,xmm12,XMMWORD[80+rdi]
+
+ vmovdqu XMMWORD[rsi],xmm7
+ vmovdqu XMMWORD[16+rsi],xmm8
+ vmovdqu XMMWORD[32+rsi],xmm9
+ vmovdqu XMMWORD[48+rsi],xmm10
+ vmovdqu XMMWORD[64+rsi],xmm11
+ vmovdqu XMMWORD[80+rsi],xmm12
+
+ add rdi,96
+ add rsi,96
+ jmp NEAR $L$256_dec_loop1
+
+
+ALIGN 64
+$L$256_dec_loop1:
+ cmp r9,96
+ jb NEAR $L$256_dec_finish_96
+ sub r9,96
+
+ vmovdqa xmm6,xmm12
+ vmovdqa XMMWORD[(16-32)+rax],xmm11
+ vmovdqa XMMWORD[(32-32)+rax],xmm10
+ vmovdqa XMMWORD[(48-32)+rax],xmm9
+ vmovdqa XMMWORD[(64-32)+rax],xmm8
+ vmovdqa XMMWORD[(80-32)+rax],xmm7
+
+ vmovdqa xmm7,xmm15
+ vpaddd xmm8,xmm7,XMMWORD[one]
+ vpaddd xmm9,xmm7,XMMWORD[two]
+ vpaddd xmm10,xmm9,XMMWORD[one]
+ vpaddd xmm11,xmm9,XMMWORD[two]
+ vpaddd xmm12,xmm11,XMMWORD[one]
+ vpaddd xmm15,xmm11,XMMWORD[two]
+
+ vmovdqa xmm4,XMMWORD[r8]
+ vpxor xmm7,xmm7,xmm4
+ vpxor xmm8,xmm8,xmm4
+ vpxor xmm9,xmm9,xmm4
+ vpxor xmm10,xmm10,xmm4
+ vpxor xmm11,xmm11,xmm4
+ vpxor xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[((0-32))+rcx]
+ vpclmulqdq xmm2,xmm6,xmm4,0x11
+ vpclmulqdq xmm3,xmm6,xmm4,0x00
+ vpclmulqdq xmm1,xmm6,xmm4,0x01
+ vpclmulqdq xmm4,xmm6,xmm4,0x10
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm4,XMMWORD[16+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm6,XMMWORD[((-16))+rax]
+ vmovdqu xmm13,XMMWORD[((-16))+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+
+ vmovdqu xmm4,XMMWORD[32+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm6,XMMWORD[rax]
+ vmovdqu xmm13,XMMWORD[rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+
+ vmovdqu xmm4,XMMWORD[48+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm6,XMMWORD[16+rax]
+ vmovdqu xmm13,XMMWORD[16+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+
+ vmovdqu xmm4,XMMWORD[64+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm6,XMMWORD[32+rax]
+ vmovdqu xmm13,XMMWORD[32+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+
+ vmovdqu xmm4,XMMWORD[80+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[96+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[112+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+
+ vmovdqa xmm6,XMMWORD[((80-32))+rax]
+ vpxor xmm6,xmm6,xmm0
+ vmovdqu xmm5,XMMWORD[((80-32))+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm5,0x01
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x10
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm4,XMMWORD[128+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+
+ vpsrldq xmm4,xmm1,8
+ vpxor xmm5,xmm2,xmm4
+ vpslldq xmm4,xmm1,8
+ vpxor xmm0,xmm3,xmm4
+
+ vmovdqa xmm3,XMMWORD[poly]
+
+ vmovdqu xmm4,XMMWORD[144+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[160+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[176+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[192+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm4,XMMWORD[208+r8]
+ vaesenc xmm7,xmm7,xmm4
+ vaesenc xmm8,xmm8,xmm4
+ vaesenc xmm9,xmm9,xmm4
+ vaesenc xmm10,xmm10,xmm4
+ vaesenc xmm11,xmm11,xmm4
+ vaesenc xmm12,xmm12,xmm4
+
+ vmovdqu xmm6,XMMWORD[224+r8]
+ vpalignr xmm2,xmm0,xmm0,8
+ vpclmulqdq xmm0,xmm0,xmm3,0x10
+ vpxor xmm0,xmm2,xmm0
+
+ vpxor xmm4,xmm6,XMMWORD[rdi]
+ vaesenclast xmm7,xmm7,xmm4
+ vpxor xmm4,xmm6,XMMWORD[16+rdi]
+ vaesenclast xmm8,xmm8,xmm4
+ vpxor xmm4,xmm6,XMMWORD[32+rdi]
+ vaesenclast xmm9,xmm9,xmm4
+ vpxor xmm4,xmm6,XMMWORD[48+rdi]
+ vaesenclast xmm10,xmm10,xmm4
+ vpxor xmm4,xmm6,XMMWORD[64+rdi]
+ vaesenclast xmm11,xmm11,xmm4
+ vpxor xmm4,xmm6,XMMWORD[80+rdi]
+ vaesenclast xmm12,xmm12,xmm4
+
+ vpalignr xmm2,xmm0,xmm0,8
+ vpclmulqdq xmm0,xmm0,xmm3,0x10
+ vpxor xmm0,xmm2,xmm0
+
+ vmovdqu XMMWORD[rsi],xmm7
+ vmovdqu XMMWORD[16+rsi],xmm8
+ vmovdqu XMMWORD[32+rsi],xmm9
+ vmovdqu XMMWORD[48+rsi],xmm10
+ vmovdqu XMMWORD[64+rsi],xmm11
+ vmovdqu XMMWORD[80+rsi],xmm12
+
+ vpxor xmm0,xmm0,xmm5
+
+ lea rdi,[96+rdi]
+ lea rsi,[96+rsi]
+ jmp NEAR $L$256_dec_loop1
+
+$L$256_dec_finish_96:
+ vmovdqa xmm6,xmm12
+ vmovdqa XMMWORD[(16-32)+rax],xmm11
+ vmovdqa XMMWORD[(32-32)+rax],xmm10
+ vmovdqa XMMWORD[(48-32)+rax],xmm9
+ vmovdqa XMMWORD[(64-32)+rax],xmm8
+ vmovdqa XMMWORD[(80-32)+rax],xmm7
+
+ vmovdqu xmm4,XMMWORD[((0-32))+rcx]
+ vpclmulqdq xmm1,xmm6,xmm4,0x10
+ vpclmulqdq xmm2,xmm6,xmm4,0x11
+ vpclmulqdq xmm3,xmm6,xmm4,0x00
+ vpclmulqdq xmm4,xmm6,xmm4,0x01
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm6,XMMWORD[((-16))+rax]
+ vmovdqu xmm13,XMMWORD[((-16))+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm6,XMMWORD[rax]
+ vmovdqu xmm13,XMMWORD[rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm6,XMMWORD[16+rax]
+ vmovdqu xmm13,XMMWORD[16+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+ vmovdqu xmm6,XMMWORD[32+rax]
+ vmovdqu xmm13,XMMWORD[32+rcx]
+
+ vpclmulqdq xmm4,xmm6,xmm13,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm13,0x01
+ vpxor xmm1,xmm1,xmm4
+
+
+ vmovdqu xmm6,XMMWORD[((80-32))+rax]
+ vpxor xmm6,xmm6,xmm0
+ vmovdqu xmm5,XMMWORD[((80-32))+rcx]
+ vpclmulqdq xmm4,xmm6,xmm5,0x11
+ vpxor xmm2,xmm2,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x00
+ vpxor xmm3,xmm3,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x10
+ vpxor xmm1,xmm1,xmm4
+ vpclmulqdq xmm4,xmm6,xmm5,0x01
+ vpxor xmm1,xmm1,xmm4
+
+ vpsrldq xmm4,xmm1,8
+ vpxor xmm5,xmm2,xmm4
+ vpslldq xmm4,xmm1,8
+ vpxor xmm0,xmm3,xmm4
+
+ vmovdqa xmm3,XMMWORD[poly]
+
+ vpalignr xmm2,xmm0,xmm0,8
+ vpclmulqdq xmm0,xmm0,xmm3,0x10
+ vpxor xmm0,xmm2,xmm0
+
+ vpalignr xmm2,xmm0,xmm0,8
+ vpclmulqdq xmm0,xmm0,xmm3,0x10
+ vpxor xmm0,xmm2,xmm0
+
+ vpxor xmm0,xmm0,xmm5
+
+$L$256_dec_loop2:
+
+
+
+ cmp r9,16
+ jb NEAR $L$256_dec_out
+ sub r9,16
+
+ vmovdqa xmm2,xmm15
+ vpaddd xmm15,xmm15,XMMWORD[one]
+
+ vpxor xmm2,xmm2,XMMWORD[r8]
+ vaesenc xmm2,xmm2,XMMWORD[16+r8]
+ vaesenc xmm2,xmm2,XMMWORD[32+r8]
+ vaesenc xmm2,xmm2,XMMWORD[48+r8]
+ vaesenc xmm2,xmm2,XMMWORD[64+r8]
+ vaesenc xmm2,xmm2,XMMWORD[80+r8]
+ vaesenc xmm2,xmm2,XMMWORD[96+r8]
+ vaesenc xmm2,xmm2,XMMWORD[112+r8]
+ vaesenc xmm2,xmm2,XMMWORD[128+r8]
+ vaesenc xmm2,xmm2,XMMWORD[144+r8]
+ vaesenc xmm2,xmm2,XMMWORD[160+r8]
+ vaesenc xmm2,xmm2,XMMWORD[176+r8]
+ vaesenc xmm2,xmm2,XMMWORD[192+r8]
+ vaesenc xmm2,xmm2,XMMWORD[208+r8]
+ vaesenclast xmm2,xmm2,XMMWORD[224+r8]
+ vpxor xmm2,xmm2,XMMWORD[rdi]
+ vmovdqu XMMWORD[rsi],xmm2
+ add rdi,16
+ add rsi,16
+
+ vpxor xmm0,xmm0,xmm2
+ vmovdqa xmm1,XMMWORD[((-32))+rcx]
+ call GFMUL
+
+ jmp NEAR $L$256_dec_loop2
+
+$L$256_dec_out:
+ vmovdqu XMMWORD[rdx],xmm0
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_aes256gcmsiv_dec:
+global aes256gcmsiv_kdf
+
+ALIGN 16
+aes256gcmsiv_kdf:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aes256gcmsiv_kdf:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+_CET_ENDBR
+
+
+
+
+ vmovdqa xmm1,XMMWORD[rdx]
+ vmovdqa xmm4,XMMWORD[rdi]
+ vmovdqa xmm11,XMMWORD[and_mask]
+ vmovdqa xmm8,XMMWORD[one]
+ vpshufd xmm4,xmm4,0x90
+ vpand xmm4,xmm4,xmm11
+ vpaddd xmm6,xmm4,xmm8
+ vpaddd xmm7,xmm6,xmm8
+ vpaddd xmm11,xmm7,xmm8
+ vpaddd xmm12,xmm11,xmm8
+ vpaddd xmm13,xmm12,xmm8
+
+ vpxor xmm4,xmm4,xmm1
+ vpxor xmm6,xmm6,xmm1
+ vpxor xmm7,xmm7,xmm1
+ vpxor xmm11,xmm11,xmm1
+ vpxor xmm12,xmm12,xmm1
+ vpxor xmm13,xmm13,xmm1
+
+ vmovdqa xmm1,XMMWORD[16+rdx]
+ vaesenc xmm4,xmm4,xmm1
+ vaesenc xmm6,xmm6,xmm1
+ vaesenc xmm7,xmm7,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+ vaesenc xmm13,xmm13,xmm1
+
+ vmovdqa xmm2,XMMWORD[32+rdx]
+ vaesenc xmm4,xmm4,xmm2
+ vaesenc xmm6,xmm6,xmm2
+ vaesenc xmm7,xmm7,xmm2
+ vaesenc xmm11,xmm11,xmm2
+ vaesenc xmm12,xmm12,xmm2
+ vaesenc xmm13,xmm13,xmm2
+
+ vmovdqa xmm1,XMMWORD[48+rdx]
+ vaesenc xmm4,xmm4,xmm1
+ vaesenc xmm6,xmm6,xmm1
+ vaesenc xmm7,xmm7,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+ vaesenc xmm13,xmm13,xmm1
+
+ vmovdqa xmm2,XMMWORD[64+rdx]
+ vaesenc xmm4,xmm4,xmm2
+ vaesenc xmm6,xmm6,xmm2
+ vaesenc xmm7,xmm7,xmm2
+ vaesenc xmm11,xmm11,xmm2
+ vaesenc xmm12,xmm12,xmm2
+ vaesenc xmm13,xmm13,xmm2
+
+ vmovdqa xmm1,XMMWORD[80+rdx]
+ vaesenc xmm4,xmm4,xmm1
+ vaesenc xmm6,xmm6,xmm1
+ vaesenc xmm7,xmm7,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+ vaesenc xmm13,xmm13,xmm1
+
+ vmovdqa xmm2,XMMWORD[96+rdx]
+ vaesenc xmm4,xmm4,xmm2
+ vaesenc xmm6,xmm6,xmm2
+ vaesenc xmm7,xmm7,xmm2
+ vaesenc xmm11,xmm11,xmm2
+ vaesenc xmm12,xmm12,xmm2
+ vaesenc xmm13,xmm13,xmm2
+
+ vmovdqa xmm1,XMMWORD[112+rdx]
+ vaesenc xmm4,xmm4,xmm1
+ vaesenc xmm6,xmm6,xmm1
+ vaesenc xmm7,xmm7,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+ vaesenc xmm13,xmm13,xmm1
+
+ vmovdqa xmm2,XMMWORD[128+rdx]
+ vaesenc xmm4,xmm4,xmm2
+ vaesenc xmm6,xmm6,xmm2
+ vaesenc xmm7,xmm7,xmm2
+ vaesenc xmm11,xmm11,xmm2
+ vaesenc xmm12,xmm12,xmm2
+ vaesenc xmm13,xmm13,xmm2
+
+ vmovdqa xmm1,XMMWORD[144+rdx]
+ vaesenc xmm4,xmm4,xmm1
+ vaesenc xmm6,xmm6,xmm1
+ vaesenc xmm7,xmm7,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+ vaesenc xmm13,xmm13,xmm1
+
+ vmovdqa xmm2,XMMWORD[160+rdx]
+ vaesenc xmm4,xmm4,xmm2
+ vaesenc xmm6,xmm6,xmm2
+ vaesenc xmm7,xmm7,xmm2
+ vaesenc xmm11,xmm11,xmm2
+ vaesenc xmm12,xmm12,xmm2
+ vaesenc xmm13,xmm13,xmm2
+
+ vmovdqa xmm1,XMMWORD[176+rdx]
+ vaesenc xmm4,xmm4,xmm1
+ vaesenc xmm6,xmm6,xmm1
+ vaesenc xmm7,xmm7,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+ vaesenc xmm13,xmm13,xmm1
+
+ vmovdqa xmm2,XMMWORD[192+rdx]
+ vaesenc xmm4,xmm4,xmm2
+ vaesenc xmm6,xmm6,xmm2
+ vaesenc xmm7,xmm7,xmm2
+ vaesenc xmm11,xmm11,xmm2
+ vaesenc xmm12,xmm12,xmm2
+ vaesenc xmm13,xmm13,xmm2
+
+ vmovdqa xmm1,XMMWORD[208+rdx]
+ vaesenc xmm4,xmm4,xmm1
+ vaesenc xmm6,xmm6,xmm1
+ vaesenc xmm7,xmm7,xmm1
+ vaesenc xmm11,xmm11,xmm1
+ vaesenc xmm12,xmm12,xmm1
+ vaesenc xmm13,xmm13,xmm1
+
+ vmovdqa xmm2,XMMWORD[224+rdx]
+ vaesenclast xmm4,xmm4,xmm2
+ vaesenclast xmm6,xmm6,xmm2
+ vaesenclast xmm7,xmm7,xmm2
+ vaesenclast xmm11,xmm11,xmm2
+ vaesenclast xmm12,xmm12,xmm2
+ vaesenclast xmm13,xmm13,xmm2
+
+
+ vmovdqa XMMWORD[rsi],xmm4
+ vmovdqa XMMWORD[16+rsi],xmm6
+ vmovdqa XMMWORD[32+rsi],xmm7
+ vmovdqa XMMWORD[48+rsi],xmm11
+ vmovdqa XMMWORD[64+rsi],xmm12
+ vmovdqa XMMWORD[80+rsi],xmm13
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_aes256gcmsiv_kdf:
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/crypto/chacha-armv4-linux.S b/gen/crypto/chacha-armv4-linux.S
new file mode 100644
index 0000000..2255dd2
--- /dev/null
+++ b/gen/crypto/chacha-armv4-linux.S
@@ -0,0 +1,1451 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+.arch armv7-a
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax unified
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code 32
+#endif
+
+#if defined(__thumb2__) || defined(__clang__)
+#define ldrhsb ldrbhs
+#endif
+
+.align 5
+.Lsigma:
+.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
+.Lone:
+.long 1,0,0,0
+
+.globl ChaCha20_ctr32_nohw
+.hidden ChaCha20_ctr32_nohw
+.type ChaCha20_ctr32_nohw,%function
+.align 5
+ChaCha20_ctr32_nohw:
+ ldr r12,[sp,#0] @ pull pointer to counter and nonce
+ stmdb sp!,{r0,r1,r2,r4-r11,lr}
+ adr r14,.Lsigma
+ ldmia r12,{r4,r5,r6,r7} @ load counter and nonce
+ sub sp,sp,#4*(16) @ off-load area
+ stmdb sp!,{r4,r5,r6,r7} @ copy counter and nonce
+ ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key
+ ldmia r14,{r0,r1,r2,r3} @ load sigma
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy key
+ stmdb sp!,{r0,r1,r2,r3} @ copy sigma
+ str r10,[sp,#4*(16+10)] @ off-load "rx"
+ str r11,[sp,#4*(16+11)] @ off-load "rx"
+ b .Loop_outer_enter
+
+.align 4
+.Loop_outer:
+ ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material
+ str r11,[sp,#4*(32+2)] @ save len
+ str r12, [sp,#4*(32+1)] @ save inp
+ str r14, [sp,#4*(32+0)] @ save out
+.Loop_outer_enter:
+ ldr r11, [sp,#4*(15)]
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ ldr r10, [sp,#4*(13)]
+ ldr r14,[sp,#4*(14)]
+ str r11, [sp,#4*(16+15)]
+ mov r11,#10
+ b .Loop
+
+.align 4
+.Loop:
+ subs r11,r11,#1
+ add r0,r0,r4
+ mov r12,r12,ror#16
+ add r1,r1,r5
+ mov r10,r10,ror#16
+ eor r12,r12,r0,ror#16
+ eor r10,r10,r1,ror#16
+ add r8,r8,r12
+ mov r4,r4,ror#20
+ add r9,r9,r10
+ mov r5,r5,ror#20
+ eor r4,r4,r8,ror#20
+ eor r5,r5,r9,ror#20
+ add r0,r0,r4
+ mov r12,r12,ror#24
+ add r1,r1,r5
+ mov r10,r10,ror#24
+ eor r12,r12,r0,ror#24
+ eor r10,r10,r1,ror#24
+ add r8,r8,r12
+ mov r4,r4,ror#25
+ add r9,r9,r10
+ mov r5,r5,ror#25
+ str r10,[sp,#4*(16+13)]
+ ldr r10,[sp,#4*(16+15)]
+ eor r4,r4,r8,ror#25
+ eor r5,r5,r9,ror#25
+ str r8,[sp,#4*(16+8)]
+ ldr r8,[sp,#4*(16+10)]
+ add r2,r2,r6
+ mov r14,r14,ror#16
+ str r9,[sp,#4*(16+9)]
+ ldr r9,[sp,#4*(16+11)]
+ add r3,r3,r7
+ mov r10,r10,ror#16
+ eor r14,r14,r2,ror#16
+ eor r10,r10,r3,ror#16
+ add r8,r8,r14
+ mov r6,r6,ror#20
+ add r9,r9,r10
+ mov r7,r7,ror#20
+ eor r6,r6,r8,ror#20
+ eor r7,r7,r9,ror#20
+ add r2,r2,r6
+ mov r14,r14,ror#24
+ add r3,r3,r7
+ mov r10,r10,ror#24
+ eor r14,r14,r2,ror#24
+ eor r10,r10,r3,ror#24
+ add r8,r8,r14
+ mov r6,r6,ror#25
+ add r9,r9,r10
+ mov r7,r7,ror#25
+ eor r6,r6,r8,ror#25
+ eor r7,r7,r9,ror#25
+ add r0,r0,r5
+ mov r10,r10,ror#16
+ add r1,r1,r6
+ mov r12,r12,ror#16
+ eor r10,r10,r0,ror#16
+ eor r12,r12,r1,ror#16
+ add r8,r8,r10
+ mov r5,r5,ror#20
+ add r9,r9,r12
+ mov r6,r6,ror#20
+ eor r5,r5,r8,ror#20
+ eor r6,r6,r9,ror#20
+ add r0,r0,r5
+ mov r10,r10,ror#24
+ add r1,r1,r6
+ mov r12,r12,ror#24
+ eor r10,r10,r0,ror#24
+ eor r12,r12,r1,ror#24
+ add r8,r8,r10
+ mov r5,r5,ror#25
+ str r10,[sp,#4*(16+15)]
+ ldr r10,[sp,#4*(16+13)]
+ add r9,r9,r12
+ mov r6,r6,ror#25
+ eor r5,r5,r8,ror#25
+ eor r6,r6,r9,ror#25
+ str r8,[sp,#4*(16+10)]
+ ldr r8,[sp,#4*(16+8)]
+ add r2,r2,r7
+ mov r10,r10,ror#16
+ str r9,[sp,#4*(16+11)]
+ ldr r9,[sp,#4*(16+9)]
+ add r3,r3,r4
+ mov r14,r14,ror#16
+ eor r10,r10,r2,ror#16
+ eor r14,r14,r3,ror#16
+ add r8,r8,r10
+ mov r7,r7,ror#20
+ add r9,r9,r14
+ mov r4,r4,ror#20
+ eor r7,r7,r8,ror#20
+ eor r4,r4,r9,ror#20
+ add r2,r2,r7
+ mov r10,r10,ror#24
+ add r3,r3,r4
+ mov r14,r14,ror#24
+ eor r10,r10,r2,ror#24
+ eor r14,r14,r3,ror#24
+ add r8,r8,r10
+ mov r7,r7,ror#25
+ add r9,r9,r14
+ mov r4,r4,ror#25
+ eor r7,r7,r8,ror#25
+ eor r4,r4,r9,ror#25
+ bne .Loop
+
+ ldr r11,[sp,#4*(32+2)] @ load len
+
+ str r8, [sp,#4*(16+8)] @ modulo-scheduled store
+ str r9, [sp,#4*(16+9)]
+ str r12,[sp,#4*(16+12)]
+ str r10, [sp,#4*(16+13)]
+ str r14,[sp,#4*(16+14)]
+
+ @ at this point we have first half of 512-bit result in
+ @ rx and second half at sp+4*(16+8)
+
+ cmp r11,#64 @ done yet?
+#ifdef __thumb2__
+ itete lo
+#endif
+ addlo r12,sp,#4*(0) @ shortcut or ...
+ ldrhs r12,[sp,#4*(32+1)] @ ... load inp
+ addlo r14,sp,#4*(0) @ shortcut or ...
+ ldrhs r14,[sp,#4*(32+0)] @ ... load out
+
+ ldr r8,[sp,#4*(0)] @ load key material
+ ldr r9,[sp,#4*(1)]
+
+#if __ARM_ARCH>=6 || !defined(__ARMEB__)
+# if __ARM_ARCH<7
+ orr r10,r12,r14
+ tst r10,#3 @ are input and output aligned?
+ ldr r10,[sp,#4*(2)]
+ bne .Lunaligned
+ cmp r11,#64 @ restore flags
+# else
+ ldr r10,[sp,#4*(2)]
+# endif
+ ldr r11,[sp,#4*(3)]
+
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+
+ add r2,r2,r10
+ add r3,r3,r11
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+# if __ARM_ARCH>=6 && defined(__ARMEB__)
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r0,r0,r8 @ xor with input
+ eorhs r1,r1,r9
+ add r8,sp,#4*(4)
+ str r0,[r14],#16 @ store output
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r2,r2,r10
+ eorhs r3,r3,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ str r1,[r14,#-12]
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ add r5,r5,r9
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+ add r6,r6,r10
+ add r7,r7,r11
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+# if __ARM_ARCH>=6 && defined(__ARMEB__)
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r4,r4,r8
+ eorhs r5,r5,r9
+ add r8,sp,#4*(8)
+ str r4,[r14],#16 @ store output
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r6,r6,r10
+ eorhs r7,r7,r11
+ str r5,[r14,#-12]
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ str r6,[r14,#-8]
+ add r0,sp,#4*(16+8)
+ str r7,[r14,#-4]
+
+ ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+# ifdef __thumb2__
+ itt hi
+# endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
+ strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
+ add r2,r2,r10
+ add r3,r3,r11
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+# if __ARM_ARCH>=6 && defined(__ARMEB__)
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r0,r0,r8
+ eorhs r1,r1,r9
+ add r8,sp,#4*(12)
+ str r0,[r14],#16 @ store output
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r2,r2,r10
+ eorhs r3,r3,r11
+ str r1,[r14,#-12]
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ add r5,r5,r9
+# ifdef __thumb2__
+ itt hi
+# endif
+ addhi r8,r8,#1 @ next counter value
+ strhi r8,[sp,#4*(12)] @ save next counter value
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+ add r6,r6,r10
+ add r7,r7,r11
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+# if __ARM_ARCH>=6 && defined(__ARMEB__)
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r4,r4,r8
+ eorhs r5,r5,r9
+# ifdef __thumb2__
+ it ne
+# endif
+ ldrne r8,[sp,#4*(32+2)] @ re-load len
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r6,r6,r10
+ eorhs r7,r7,r11
+ str r4,[r14],#16 @ store output
+ str r5,[r14,#-12]
+# ifdef __thumb2__
+ it hs
+# endif
+ subhs r11,r8,#64 @ len-=64
+ str r6,[r14,#-8]
+ str r7,[r14,#-4]
+ bhi .Loop_outer
+
+ beq .Ldone
+# if __ARM_ARCH<7
+ b .Ltail
+
+.align 4
+.Lunaligned:@ unaligned endian-neutral path
+ cmp r11,#64 @ restore flags
+# endif
+#endif
+#if __ARM_ARCH<7
+ ldr r11,[sp,#4*(3)]
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+ add r2,r2,r10
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r3,r3,r11
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r0,r8,r0 @ xor with input (or zero)
+ eor r1,r9,r1
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r2,r10,r2
+ strb r0,[r14],#16 @ store output
+ eor r3,r11,r3
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r1,[r14,#-12]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-8]
+ eor r1,r9,r1,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r3,[r14,#-4]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-15]
+ eor r3,r11,r3,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r1,[r14,#-11]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-7]
+ eor r1,r9,r1,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r3,[r14,#-3]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-14]
+ eor r3,r11,r3,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r1,[r14,#-10]
+ strb r2,[r14,#-6]
+ eor r0,r8,r0,lsr#8
+ strb r3,[r14,#-2]
+ eor r1,r9,r1,lsr#8
+ strb r0,[r14,#-13]
+ eor r2,r10,r2,lsr#8
+ strb r1,[r14,#-9]
+ eor r3,r11,r3,lsr#8
+ strb r2,[r14,#-5]
+ strb r3,[r14,#-1]
+ add r8,sp,#4*(4+0)
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ add r0,sp,#4*(16+8)
+ add r4,r4,r8 @ accumulate key material
+ add r5,r5,r9
+ add r6,r6,r10
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r7,r7,r11
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r4,r8,r4 @ xor with input (or zero)
+ eor r5,r9,r5
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r6,r10,r6
+ strb r4,[r14],#16 @ store output
+ eor r7,r11,r7
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r5,[r14,#-12]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-8]
+ eor r5,r9,r5,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r7,[r14,#-4]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-15]
+ eor r7,r11,r7,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r5,[r14,#-11]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-7]
+ eor r5,r9,r5,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r7,[r14,#-3]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-14]
+ eor r7,r11,r7,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r5,[r14,#-10]
+ strb r6,[r14,#-6]
+ eor r4,r8,r4,lsr#8
+ strb r7,[r14,#-2]
+ eor r5,r9,r5,lsr#8
+ strb r4,[r14,#-13]
+ eor r6,r10,r6,lsr#8
+ strb r5,[r14,#-9]
+ eor r7,r11,r7,lsr#8
+ strb r6,[r14,#-5]
+ strb r7,[r14,#-1]
+ add r8,sp,#4*(4+4)
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half
+# ifdef __thumb2__
+ itt hi
+# endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx"
+ strhi r11,[sp,#4*(16+11)] @ copy "rx"
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+ add r2,r2,r10
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r3,r3,r11
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r0,r8,r0 @ xor with input (or zero)
+ eor r1,r9,r1
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r2,r10,r2
+ strb r0,[r14],#16 @ store output
+ eor r3,r11,r3
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r1,[r14,#-12]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-8]
+ eor r1,r9,r1,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r3,[r14,#-4]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-15]
+ eor r3,r11,r3,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r1,[r14,#-11]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-7]
+ eor r1,r9,r1,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r3,[r14,#-3]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-14]
+ eor r3,r11,r3,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r1,[r14,#-10]
+ strb r2,[r14,#-6]
+ eor r0,r8,r0,lsr#8
+ strb r3,[r14,#-2]
+ eor r1,r9,r1,lsr#8
+ strb r0,[r14,#-13]
+ eor r2,r10,r2,lsr#8
+ strb r1,[r14,#-9]
+ eor r3,r11,r3,lsr#8
+ strb r2,[r14,#-5]
+ strb r3,[r14,#-1]
+ add r8,sp,#4*(4+8)
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ add r4,r4,r8 @ accumulate key material
+# ifdef __thumb2__
+ itt hi
+# endif
+ addhi r8,r8,#1 @ next counter value
+ strhi r8,[sp,#4*(12)] @ save next counter value
+ add r5,r5,r9
+ add r6,r6,r10
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r7,r7,r11
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r4,r8,r4 @ xor with input (or zero)
+ eor r5,r9,r5
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r6,r10,r6
+ strb r4,[r14],#16 @ store output
+ eor r7,r11,r7
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r5,[r14,#-12]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-8]
+ eor r5,r9,r5,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r7,[r14,#-4]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-15]
+ eor r7,r11,r7,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r5,[r14,#-11]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-7]
+ eor r5,r9,r5,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r7,[r14,#-3]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-14]
+ eor r7,r11,r7,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r5,[r14,#-10]
+ strb r6,[r14,#-6]
+ eor r4,r8,r4,lsr#8
+ strb r7,[r14,#-2]
+ eor r5,r9,r5,lsr#8
+ strb r4,[r14,#-13]
+ eor r6,r10,r6,lsr#8
+ strb r5,[r14,#-9]
+ eor r7,r11,r7,lsr#8
+ strb r6,[r14,#-5]
+ strb r7,[r14,#-1]
+# ifdef __thumb2__
+ it ne
+# endif
+ ldrne r8,[sp,#4*(32+2)] @ re-load len
+# ifdef __thumb2__
+ it hs
+# endif
+ subhs r11,r8,#64 @ len-=64
+ bhi .Loop_outer
+
+ beq .Ldone
+#endif
+
+.Ltail:
+ ldr r12,[sp,#4*(32+1)] @ load inp
+ add r9,sp,#4*(0)
+ ldr r14,[sp,#4*(32+0)] @ load out
+
+.Loop_tail:
+ ldrb r10,[r9],#1 @ read buffer on stack
+ ldrb r11,[r12],#1 @ read input
+ subs r8,r8,#1
+ eor r11,r11,r10
+ strb r11,[r14],#1 @ store output
+ bne .Loop_tail
+
+.Ldone:
+ add sp,sp,#4*(32+3)
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.globl ChaCha20_ctr32_neon
+.hidden ChaCha20_ctr32_neon
+.type ChaCha20_ctr32_neon,%function
+.align 5
+ChaCha20_ctr32_neon:
+ ldr r12,[sp,#0] @ pull pointer to counter and nonce
+ stmdb sp!,{r0,r1,r2,r4-r11,lr}
+ adr r14,.Lsigma
+ vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI spec says so
+ stmdb sp!,{r0,r1,r2,r3}
+
+ vld1.32 {q1,q2},[r3] @ load key
+ ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key
+
+ sub sp,sp,#4*(16+16)
+ vld1.32 {q3},[r12] @ load counter and nonce
+ add r12,sp,#4*8
+ ldmia r14,{r0,r1,r2,r3} @ load sigma
+ vld1.32 {q0},[r14]! @ load sigma
+ vld1.32 {q12},[r14] @ one
+ vst1.32 {q2,q3},[r12] @ copy 1/2key|counter|nonce
+ vst1.32 {q0,q1},[sp] @ copy sigma|1/2key
+
+ str r10,[sp,#4*(16+10)] @ off-load "rx"
+ str r11,[sp,#4*(16+11)] @ off-load "rx"
+ vshl.i32 d26,d24,#1 @ two
+ vstr d24,[sp,#4*(16+0)]
+ vshl.i32 d28,d24,#2 @ four
+ vstr d26,[sp,#4*(16+2)]
+ vmov q4,q0
+ vstr d28,[sp,#4*(16+4)]
+ vmov q8,q0
+ vmov q5,q1
+ vmov q9,q1
+ b .Loop_neon_enter
+
+.align 4
+.Loop_neon_outer:
+ ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material
+ cmp r11,#64*2 @ if len<=64*2
+ bls .Lbreak_neon @ switch to integer-only
+ vmov q4,q0
+ str r11,[sp,#4*(32+2)] @ save len
+ vmov q8,q0
+ str r12, [sp,#4*(32+1)] @ save inp
+ vmov q5,q1
+ str r14, [sp,#4*(32+0)] @ save out
+ vmov q9,q1
+.Loop_neon_enter:
+ ldr r11, [sp,#4*(15)]
+ vadd.i32 q7,q3,q12 @ counter+1
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ vmov q6,q2
+ ldr r10, [sp,#4*(13)]
+ vmov q10,q2
+ ldr r14,[sp,#4*(14)]
+ vadd.i32 q11,q7,q12 @ counter+2
+ str r11, [sp,#4*(16+15)]
+ mov r11,#10
+ add r12,r12,#3 @ counter+3
+ b .Loop_neon
+
+.align 4
+.Loop_neon:
+ subs r11,r11,#1
+ vadd.i32 q0,q0,q1
+ add r0,r0,r4
+ vadd.i32 q4,q4,q5
+ mov r12,r12,ror#16
+ vadd.i32 q8,q8,q9
+ add r1,r1,r5
+ veor q3,q3,q0
+ mov r10,r10,ror#16
+ veor q7,q7,q4
+ eor r12,r12,r0,ror#16
+ veor q11,q11,q8
+ eor r10,r10,r1,ror#16
+ vrev32.16 q3,q3
+ add r8,r8,r12
+ vrev32.16 q7,q7
+ mov r4,r4,ror#20
+ vrev32.16 q11,q11
+ add r9,r9,r10
+ vadd.i32 q2,q2,q3
+ mov r5,r5,ror#20
+ vadd.i32 q6,q6,q7
+ eor r4,r4,r8,ror#20
+ vadd.i32 q10,q10,q11
+ eor r5,r5,r9,ror#20
+ veor q12,q1,q2
+ add r0,r0,r4
+ veor q13,q5,q6
+ mov r12,r12,ror#24
+ veor q14,q9,q10
+ add r1,r1,r5
+ vshr.u32 q1,q12,#20
+ mov r10,r10,ror#24
+ vshr.u32 q5,q13,#20
+ eor r12,r12,r0,ror#24
+ vshr.u32 q9,q14,#20
+ eor r10,r10,r1,ror#24
+ vsli.32 q1,q12,#12
+ add r8,r8,r12
+ vsli.32 q5,q13,#12
+ mov r4,r4,ror#25
+ vsli.32 q9,q14,#12
+ add r9,r9,r10
+ vadd.i32 q0,q0,q1
+ mov r5,r5,ror#25
+ vadd.i32 q4,q4,q5
+ str r10,[sp,#4*(16+13)]
+ vadd.i32 q8,q8,q9
+ ldr r10,[sp,#4*(16+15)]
+ veor q12,q3,q0
+ eor r4,r4,r8,ror#25
+ veor q13,q7,q4
+ eor r5,r5,r9,ror#25
+ veor q14,q11,q8
+ str r8,[sp,#4*(16+8)]
+ vshr.u32 q3,q12,#24
+ ldr r8,[sp,#4*(16+10)]
+ vshr.u32 q7,q13,#24
+ add r2,r2,r6
+ vshr.u32 q11,q14,#24
+ mov r14,r14,ror#16
+ vsli.32 q3,q12,#8
+ str r9,[sp,#4*(16+9)]
+ vsli.32 q7,q13,#8
+ ldr r9,[sp,#4*(16+11)]
+ vsli.32 q11,q14,#8
+ add r3,r3,r7
+ vadd.i32 q2,q2,q3
+ mov r10,r10,ror#16
+ vadd.i32 q6,q6,q7
+ eor r14,r14,r2,ror#16
+ vadd.i32 q10,q10,q11
+ eor r10,r10,r3,ror#16
+ veor q12,q1,q2
+ add r8,r8,r14
+ veor q13,q5,q6
+ mov r6,r6,ror#20
+ veor q14,q9,q10
+ add r9,r9,r10
+ vshr.u32 q1,q12,#25
+ mov r7,r7,ror#20
+ vshr.u32 q5,q13,#25
+ eor r6,r6,r8,ror#20
+ vshr.u32 q9,q14,#25
+ eor r7,r7,r9,ror#20
+ vsli.32 q1,q12,#7
+ add r2,r2,r6
+ vsli.32 q5,q13,#7
+ mov r14,r14,ror#24
+ vsli.32 q9,q14,#7
+ add r3,r3,r7
+ vext.8 q2,q2,q2,#8
+ mov r10,r10,ror#24
+ vext.8 q6,q6,q6,#8
+ eor r14,r14,r2,ror#24
+ vext.8 q10,q10,q10,#8
+ eor r10,r10,r3,ror#24
+ vext.8 q1,q1,q1,#4
+ add r8,r8,r14
+ vext.8 q5,q5,q5,#4
+ mov r6,r6,ror#25
+ vext.8 q9,q9,q9,#4
+ add r9,r9,r10
+ vext.8 q3,q3,q3,#12
+ mov r7,r7,ror#25
+ vext.8 q7,q7,q7,#12
+ eor r6,r6,r8,ror#25
+ vext.8 q11,q11,q11,#12
+ eor r7,r7,r9,ror#25
+ vadd.i32 q0,q0,q1
+ add r0,r0,r5
+ vadd.i32 q4,q4,q5
+ mov r10,r10,ror#16
+ vadd.i32 q8,q8,q9
+ add r1,r1,r6
+ veor q3,q3,q0
+ mov r12,r12,ror#16
+ veor q7,q7,q4
+ eor r10,r10,r0,ror#16
+ veor q11,q11,q8
+ eor r12,r12,r1,ror#16
+ vrev32.16 q3,q3
+ add r8,r8,r10
+ vrev32.16 q7,q7
+ mov r5,r5,ror#20
+ vrev32.16 q11,q11
+ add r9,r9,r12
+ vadd.i32 q2,q2,q3
+ mov r6,r6,ror#20
+ vadd.i32 q6,q6,q7
+ eor r5,r5,r8,ror#20
+ vadd.i32 q10,q10,q11
+ eor r6,r6,r9,ror#20
+ veor q12,q1,q2
+ add r0,r0,r5
+ veor q13,q5,q6
+ mov r10,r10,ror#24
+ veor q14,q9,q10
+ add r1,r1,r6
+ vshr.u32 q1,q12,#20
+ mov r12,r12,ror#24
+ vshr.u32 q5,q13,#20
+ eor r10,r10,r0,ror#24
+ vshr.u32 q9,q14,#20
+ eor r12,r12,r1,ror#24
+ vsli.32 q1,q12,#12
+ add r8,r8,r10
+ vsli.32 q5,q13,#12
+ mov r5,r5,ror#25
+ vsli.32 q9,q14,#12
+ str r10,[sp,#4*(16+15)]
+ vadd.i32 q0,q0,q1
+ ldr r10,[sp,#4*(16+13)]
+ vadd.i32 q4,q4,q5
+ add r9,r9,r12
+ vadd.i32 q8,q8,q9
+ mov r6,r6,ror#25
+ veor q12,q3,q0
+ eor r5,r5,r8,ror#25
+ veor q13,q7,q4
+ eor r6,r6,r9,ror#25
+ veor q14,q11,q8
+ str r8,[sp,#4*(16+10)]
+ vshr.u32 q3,q12,#24
+ ldr r8,[sp,#4*(16+8)]
+ vshr.u32 q7,q13,#24
+ add r2,r2,r7
+ vshr.u32 q11,q14,#24
+ mov r10,r10,ror#16
+ vsli.32 q3,q12,#8
+ str r9,[sp,#4*(16+11)]
+ vsli.32 q7,q13,#8
+ ldr r9,[sp,#4*(16+9)]
+ vsli.32 q11,q14,#8
+ add r3,r3,r4
+ vadd.i32 q2,q2,q3
+ mov r14,r14,ror#16
+ vadd.i32 q6,q6,q7
+ eor r10,r10,r2,ror#16
+ vadd.i32 q10,q10,q11
+ eor r14,r14,r3,ror#16
+ veor q12,q1,q2
+ add r8,r8,r10
+ veor q13,q5,q6
+ mov r7,r7,ror#20
+ veor q14,q9,q10
+ add r9,r9,r14
+ vshr.u32 q1,q12,#25
+ mov r4,r4,ror#20
+ vshr.u32 q5,q13,#25
+ eor r7,r7,r8,ror#20
+ vshr.u32 q9,q14,#25
+ eor r4,r4,r9,ror#20
+ vsli.32 q1,q12,#7
+ add r2,r2,r7
+ vsli.32 q5,q13,#7
+ mov r10,r10,ror#24
+ vsli.32 q9,q14,#7
+ add r3,r3,r4
+ vext.8 q2,q2,q2,#8
+ mov r14,r14,ror#24
+ vext.8 q6,q6,q6,#8
+ eor r10,r10,r2,ror#24
+ vext.8 q10,q10,q10,#8
+ eor r14,r14,r3,ror#24
+ vext.8 q1,q1,q1,#12
+ add r8,r8,r10
+ vext.8 q5,q5,q5,#12
+ mov r7,r7,ror#25
+ vext.8 q9,q9,q9,#12
+ add r9,r9,r14
+ vext.8 q3,q3,q3,#4
+ mov r4,r4,ror#25
+ vext.8 q7,q7,q7,#4
+ eor r7,r7,r8,ror#25
+ vext.8 q11,q11,q11,#4
+ eor r4,r4,r9,ror#25
+ bne .Loop_neon
+
+ add r11,sp,#32
+ vld1.32 {q12,q13},[sp] @ load key material
+ vld1.32 {q14,q15},[r11]
+
+ ldr r11,[sp,#4*(32+2)] @ load len
+
+ str r8, [sp,#4*(16+8)] @ modulo-scheduled store
+ str r9, [sp,#4*(16+9)]
+ str r12,[sp,#4*(16+12)]
+ str r10, [sp,#4*(16+13)]
+ str r14,[sp,#4*(16+14)]
+
+ @ at this point we have first half of 512-bit result in
+ @ rx and second half at sp+4*(16+8)
+
+ ldr r12,[sp,#4*(32+1)] @ load inp
+ ldr r14,[sp,#4*(32+0)] @ load out
+
+ vadd.i32 q0,q0,q12 @ accumulate key material
+ vadd.i32 q4,q4,q12
+ vadd.i32 q8,q8,q12
+ vldr d24,[sp,#4*(16+0)] @ one
+
+ vadd.i32 q1,q1,q13
+ vadd.i32 q5,q5,q13
+ vadd.i32 q9,q9,q13
+ vldr d26,[sp,#4*(16+2)] @ two
+
+ vadd.i32 q2,q2,q14
+ vadd.i32 q6,q6,q14
+ vadd.i32 q10,q10,q14
+ vadd.i32 d14,d14,d24 @ counter+1
+ vadd.i32 d22,d22,d26 @ counter+2
+
+ vadd.i32 q3,q3,q15
+ vadd.i32 q7,q7,q15
+ vadd.i32 q11,q11,q15
+
+ cmp r11,#64*4
+ blo .Ltail_neon
+
+ vld1.8 {q12,q13},[r12]! @ load input
+ mov r11,sp
+ vld1.8 {q14,q15},[r12]!
+ veor q0,q0,q12 @ xor with input
+ veor q1,q1,q13
+ vld1.8 {q12,q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14,q15},[r12]!
+
+ veor q4,q4,q12
+ vst1.8 {q0,q1},[r14]! @ store output
+ veor q5,q5,q13
+ vld1.8 {q12,q13},[r12]!
+ veor q6,q6,q14
+ vst1.8 {q2,q3},[r14]!
+ veor q7,q7,q15
+ vld1.8 {q14,q15},[r12]!
+
+ veor q8,q8,q12
+ vld1.32 {q0,q1},[r11]! @ load for next iteration
+ veor d25,d25,d25
+ vldr d24,[sp,#4*(16+4)] @ four
+ veor q9,q9,q13
+ vld1.32 {q2,q3},[r11]
+ veor q10,q10,q14
+ vst1.8 {q4,q5},[r14]!
+ veor q11,q11,q15
+ vst1.8 {q6,q7},[r14]!
+
+ vadd.i32 d6,d6,d24 @ next counter value
+ vldr d24,[sp,#4*(16+0)] @ one
+
+ ldmia sp,{r8,r9,r10,r11} @ load key material
+ add r0,r0,r8 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ vst1.8 {q8,q9},[r14]!
+ add r1,r1,r9
+ ldr r9,[r12,#-12]
+ vst1.8 {q10,q11},[r14]!
+ add r2,r2,r10
+ ldr r10,[r12,#-8]
+ add r3,r3,r11
+ ldr r11,[r12,#-4]
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+ eor r0,r0,r8 @ xor with input
+ add r8,sp,#4*(4)
+ eor r1,r1,r9
+ str r0,[r14],#16 @ store output
+ eor r2,r2,r10
+ str r1,[r14,#-12]
+ eor r3,r3,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ add r5,r5,r9
+ ldr r9,[r12,#-12]
+ add r6,r6,r10
+ ldr r10,[r12,#-8]
+ add r7,r7,r11
+ ldr r11,[r12,#-4]
+# ifdef __ARMEB__
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+ eor r4,r4,r8
+ add r8,sp,#4*(8)
+ eor r5,r5,r9
+ str r4,[r14],#16 @ store output
+ eor r6,r6,r10
+ str r5,[r14,#-12]
+ eor r7,r7,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ str r6,[r14,#-8]
+ add r0,sp,#4*(16+8)
+ str r7,[r14,#-4]
+
+ ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ add r1,r1,r9
+ ldr r9,[r12,#-12]
+# ifdef __thumb2__
+ it hi
+# endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
+ add r2,r2,r10
+ ldr r10,[r12,#-8]
+# ifdef __thumb2__
+ it hi
+# endif
+ strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
+ add r3,r3,r11
+ ldr r11,[r12,#-4]
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+ eor r0,r0,r8
+ add r8,sp,#4*(12)
+ eor r1,r1,r9
+ str r0,[r14],#16 @ store output
+ eor r2,r2,r10
+ str r1,[r14,#-12]
+ eor r3,r3,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ add r8,r8,#4 @ next counter value
+ add r5,r5,r9
+ str r8,[sp,#4*(12)] @ save next counter value
+ ldr r8,[r12],#16 @ load input
+ add r6,r6,r10
+ add r4,r4,#3 @ counter+3
+ ldr r9,[r12,#-12]
+ add r7,r7,r11
+ ldr r10,[r12,#-8]
+ ldr r11,[r12,#-4]
+# ifdef __ARMEB__
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+ eor r4,r4,r8
+# ifdef __thumb2__
+ it hi
+# endif
+ ldrhi r8,[sp,#4*(32+2)] @ re-load len
+ eor r5,r5,r9
+ eor r6,r6,r10
+ str r4,[r14],#16 @ store output
+ eor r7,r7,r11
+ str r5,[r14,#-12]
+ sub r11,r8,#64*4 @ len-=64*4
+ str r6,[r14,#-8]
+ str r7,[r14,#-4]
+ bhi .Loop_neon_outer
+
+ b .Ldone_neon
+
+.align 4
+.Lbreak_neon:
+ @ harmonize NEON and integer-only stack frames: load data
+ @ from NEON frame, but save to integer-only one; distance
+ @ between the two is 4*(32+4+16-32)=4*(20).
+
+ str r11, [sp,#4*(20+32+2)] @ save len
+ add r11,sp,#4*(32+4)
+ str r12, [sp,#4*(20+32+1)] @ save inp
+ str r14, [sp,#4*(20+32+0)] @ save out
+
+ ldr r12,[sp,#4*(16+10)]
+ ldr r14,[sp,#4*(16+11)]
+ vldmia r11,{d8,d9,d10,d11,d12,d13,d14,d15} @ fulfill ABI requirement
+ str r12,[sp,#4*(20+16+10)] @ copy "rx"
+ str r14,[sp,#4*(20+16+11)] @ copy "rx"
+
+ ldr r11, [sp,#4*(15)]
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ ldr r10, [sp,#4*(13)]
+ ldr r14,[sp,#4*(14)]
+ str r11, [sp,#4*(20+16+15)]
+ add r11,sp,#4*(20)
+ vst1.32 {q0,q1},[r11]! @ copy key
+ add sp,sp,#4*(20) @ switch frame
+ vst1.32 {q2,q3},[r11]
+ mov r11,#10
+ b .Loop @ go integer-only
+
+.align 4
+.Ltail_neon:
+ cmp r11,#64*3
+ bhs .L192_or_more_neon
+ cmp r11,#64*2
+ bhs .L128_or_more_neon
+ cmp r11,#64*1
+ bhs .L64_or_more_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q0,q1},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q2,q3},[r8]
+ b .Loop_tail_neon
+
+.align 4
+.L64_or_more_neon:
+ vld1.8 {q12,q13},[r12]!
+ vld1.8 {q14,q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vst1.8 {q0,q1},[r14]!
+ vst1.8 {q2,q3},[r14]!
+
+ beq .Ldone_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q4,q5},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q6,q7},[r8]
+ sub r11,r11,#64*1 @ len-=64*1
+ b .Loop_tail_neon
+
+.align 4
+.L128_or_more_neon:
+ vld1.8 {q12,q13},[r12]!
+ vld1.8 {q14,q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ vld1.8 {q12,q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14,q15},[r12]!
+
+ veor q4,q4,q12
+ veor q5,q5,q13
+ vst1.8 {q0,q1},[r14]!
+ veor q6,q6,q14
+ vst1.8 {q2,q3},[r14]!
+ veor q7,q7,q15
+ vst1.8 {q4,q5},[r14]!
+ vst1.8 {q6,q7},[r14]!
+
+ beq .Ldone_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q8,q9},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q10,q11},[r8]
+ sub r11,r11,#64*2 @ len-=64*2
+ b .Loop_tail_neon
+
+.align 4
+.L192_or_more_neon:
+ vld1.8 {q12,q13},[r12]!
+ vld1.8 {q14,q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ vld1.8 {q12,q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14,q15},[r12]!
+
+ veor q4,q4,q12
+ veor q5,q5,q13
+ vld1.8 {q12,q13},[r12]!
+ veor q6,q6,q14
+ vst1.8 {q0,q1},[r14]!
+ veor q7,q7,q15
+ vld1.8 {q14,q15},[r12]!
+
+ veor q8,q8,q12
+ vst1.8 {q2,q3},[r14]!
+ veor q9,q9,q13
+ vst1.8 {q4,q5},[r14]!
+ veor q10,q10,q14
+ vst1.8 {q6,q7},[r14]!
+ veor q11,q11,q15
+ vst1.8 {q8,q9},[r14]!
+ vst1.8 {q10,q11},[r14]!
+
+ beq .Ldone_neon
+
+ ldmia sp,{r8,r9,r10,r11} @ load key material
+ add r0,r0,r8 @ accumulate key material
+ add r8,sp,#4*(4)
+ add r1,r1,r9
+ add r2,r2,r10
+ add r3,r3,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+
+ add r4,r4,r8 @ accumulate key material
+ add r8,sp,#4*(8)
+ add r5,r5,r9
+ add r6,r6,r10
+ add r7,r7,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+ stmia sp,{r0,r1,r2,r3,r4,r5,r6,r7}
+ add r0,sp,#4*(16+8)
+
+ ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ add r8,sp,#4*(12)
+ add r1,r1,r9
+ add r2,r2,r10
+ add r3,r3,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+
+ add r4,r4,r8 @ accumulate key material
+ add r8,sp,#4*(8)
+ add r5,r5,r9
+ add r4,r4,#3 @ counter+3
+ add r6,r6,r10
+ add r7,r7,r11
+ ldr r11,[sp,#4*(32+2)] @ re-load len
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+ stmia r8,{r0,r1,r2,r3,r4,r5,r6,r7}
+ add r10,sp,#4*(0)
+ sub r11,r11,#64*3 @ len-=64*3
+
+.Loop_tail_neon:
+ ldrb r8,[r10],#1 @ read buffer on stack
+ ldrb r9,[r12],#1 @ read input
+ subs r11,r11,#1
+ eor r8,r8,r9
+ strb r8,[r14],#1 @ store output
+ bne .Loop_tail_neon
+
+.Ldone_neon:
+ add sp,sp,#4*(32+4)
+ vldmia sp,{d8,d9,d10,d11,d12,d13,d14,d15}
+ add sp,sp,#4*(16+3)
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+.size ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon
+#endif
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/crypto/chacha-armv8-apple.S b/gen/crypto/chacha-armv8-apple.S
new file mode 100644
index 0000000..3807631
--- /dev/null
+++ b/gen/crypto/chacha-armv8-apple.S
@@ -0,0 +1,1968 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+
+.section __TEXT,__const
+
+.align 5
+Lsigma:
+.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
+Lone:
+.long 1,0,0,0
+.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+
+.text
+
+.globl _ChaCha20_ctr32_nohw
+.private_extern _ChaCha20_ctr32_nohw
+
+.align 5
+_ChaCha20_ctr32_nohw:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adrp x5,Lsigma@PAGE
+ add x5,x5,Lsigma@PAGEOFF
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#64
+
+ ldp x22,x23,[x5] // load sigma
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ldp x28,x30,[x4] // load counter
+#ifdef __AARCH64EB__
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+
+Loop_outer:
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ mov w7,w23
+ lsr x8,x23,#32
+ mov w9,w24
+ lsr x10,x24,#32
+ mov w11,w25
+ lsr x12,x25,#32
+ mov w13,w26
+ lsr x14,x26,#32
+ mov w15,w27
+ lsr x16,x27,#32
+ mov w17,w28
+ lsr x19,x28,#32
+ mov w20,w30
+ lsr x21,x30,#32
+
+ mov x4,#10
+ subs x2,x2,#64
+Loop:
+ sub x4,x4,#1
+ add w5,w5,w9
+ add w6,w6,w10
+ add w7,w7,w11
+ add w8,w8,w12
+ eor w17,w17,w5
+ eor w19,w19,w6
+ eor w20,w20,w7
+ eor w21,w21,w8
+ ror w17,w17,#16
+ ror w19,w19,#16
+ ror w20,w20,#16
+ ror w21,w21,#16
+ add w13,w13,w17
+ add w14,w14,w19
+ add w15,w15,w20
+ add w16,w16,w21
+ eor w9,w9,w13
+ eor w10,w10,w14
+ eor w11,w11,w15
+ eor w12,w12,w16
+ ror w9,w9,#20
+ ror w10,w10,#20
+ ror w11,w11,#20
+ ror w12,w12,#20
+ add w5,w5,w9
+ add w6,w6,w10
+ add w7,w7,w11
+ add w8,w8,w12
+ eor w17,w17,w5
+ eor w19,w19,w6
+ eor w20,w20,w7
+ eor w21,w21,w8
+ ror w17,w17,#24
+ ror w19,w19,#24
+ ror w20,w20,#24
+ ror w21,w21,#24
+ add w13,w13,w17
+ add w14,w14,w19
+ add w15,w15,w20
+ add w16,w16,w21
+ eor w9,w9,w13
+ eor w10,w10,w14
+ eor w11,w11,w15
+ eor w12,w12,w16
+ ror w9,w9,#25
+ ror w10,w10,#25
+ ror w11,w11,#25
+ ror w12,w12,#25
+ add w5,w5,w10
+ add w6,w6,w11
+ add w7,w7,w12
+ add w8,w8,w9
+ eor w21,w21,w5
+ eor w17,w17,w6
+ eor w19,w19,w7
+ eor w20,w20,w8
+ ror w21,w21,#16
+ ror w17,w17,#16
+ ror w19,w19,#16
+ ror w20,w20,#16
+ add w15,w15,w21
+ add w16,w16,w17
+ add w13,w13,w19
+ add w14,w14,w20
+ eor w10,w10,w15
+ eor w11,w11,w16
+ eor w12,w12,w13
+ eor w9,w9,w14
+ ror w10,w10,#20
+ ror w11,w11,#20
+ ror w12,w12,#20
+ ror w9,w9,#20
+ add w5,w5,w10
+ add w6,w6,w11
+ add w7,w7,w12
+ add w8,w8,w9
+ eor w21,w21,w5
+ eor w17,w17,w6
+ eor w19,w19,w7
+ eor w20,w20,w8
+ ror w21,w21,#24
+ ror w17,w17,#24
+ ror w19,w19,#24
+ ror w20,w20,#24
+ add w15,w15,w21
+ add w16,w16,w17
+ add w13,w13,w19
+ add w14,w14,w20
+ eor w10,w10,w15
+ eor w11,w11,w16
+ eor w12,w12,w13
+ eor w9,w9,w14
+ ror w10,w10,#25
+ ror w11,w11,#25
+ ror w12,w12,#25
+ ror w9,w9,#25
+ cbnz x4,Loop
+
+ add w5,w5,w22 // accumulate key block
+ add x6,x6,x22,lsr#32
+ add w7,w7,w23
+ add x8,x8,x23,lsr#32
+ add w9,w9,w24
+ add x10,x10,x24,lsr#32
+ add w11,w11,w25
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add x21,x21,x30,lsr#32
+
+ b.lo Ltail
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#1 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+
+ b.hi Loop_outer
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.align 4
+Ltail:
+ add x2,x2,#64
+Less_than_64:
+ sub x0,x0,#1
+ add x1,x1,x2
+ add x0,x0,x2
+ add x4,sp,x2
+ neg x2,x2
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ stp x5,x7,[sp,#0]
+ stp x9,x11,[sp,#16]
+ stp x13,x15,[sp,#32]
+ stp x17,x20,[sp,#48]
+
+Loop_tail:
+ ldrb w10,[x1,x2]
+ ldrb w11,[x4,x2]
+ add x2,x2,#1
+ eor w10,w10,w11
+ strb w10,[x0,x2]
+ cbnz x2,Loop_tail
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.globl _ChaCha20_ctr32_neon
+.private_extern _ChaCha20_ctr32_neon
+
+.align 5
+_ChaCha20_ctr32_neon:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adrp x5,Lsigma@PAGE
+ add x5,x5,Lsigma@PAGEOFF
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ cmp x2,#512
+ b.hs L512_or_more_neon
+
+ sub sp,sp,#64
+
+ ldp x22,x23,[x5] // load sigma
+ ld1 {v24.4s},[x5],#16
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ld1 {v25.4s,v26.4s},[x3]
+ ldp x28,x30,[x4] // load counter
+ ld1 {v27.4s},[x4]
+ ld1 {v31.4s},[x5]
+#ifdef __AARCH64EB__
+ rev64 v24.4s,v24.4s
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+ add v27.4s,v27.4s,v31.4s // += 1
+ add v28.4s,v27.4s,v31.4s
+ add v29.4s,v28.4s,v31.4s
+ shl v31.4s,v31.4s,#2 // 1 -> 4
+
+Loop_outer_neon:
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ mov v0.16b,v24.16b
+ mov w7,w23
+ lsr x8,x23,#32
+ mov v4.16b,v24.16b
+ mov w9,w24
+ lsr x10,x24,#32
+ mov v16.16b,v24.16b
+ mov w11,w25
+ mov v1.16b,v25.16b
+ lsr x12,x25,#32
+ mov v5.16b,v25.16b
+ mov w13,w26
+ mov v17.16b,v25.16b
+ lsr x14,x26,#32
+ mov v3.16b,v27.16b
+ mov w15,w27
+ mov v7.16b,v28.16b
+ lsr x16,x27,#32
+ mov v19.16b,v29.16b
+ mov w17,w28
+ mov v2.16b,v26.16b
+ lsr x19,x28,#32
+ mov v6.16b,v26.16b
+ mov w20,w30
+ mov v18.16b,v26.16b
+ lsr x21,x30,#32
+
+ mov x4,#10
+ subs x2,x2,#256
+Loop_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v16.4s,v16.4s,v17.4s
+ add w7,w7,w11
+ eor v3.16b,v3.16b,v0.16b
+ add w8,w8,w12
+ eor v7.16b,v7.16b,v4.16b
+ eor w17,w17,w5
+ eor v19.16b,v19.16b,v16.16b
+ eor w19,w19,w6
+ rev32 v3.8h,v3.8h
+ eor w20,w20,w7
+ rev32 v7.8h,v7.8h
+ eor w21,w21,w8
+ rev32 v19.8h,v19.8h
+ ror w17,w17,#16
+ add v2.4s,v2.4s,v3.4s
+ ror w19,w19,#16
+ add v6.4s,v6.4s,v7.4s
+ ror w20,w20,#16
+ add v18.4s,v18.4s,v19.4s
+ ror w21,w21,#16
+ eor v20.16b,v1.16b,v2.16b
+ add w13,w13,w17
+ eor v21.16b,v5.16b,v6.16b
+ add w14,w14,w19
+ eor v22.16b,v17.16b,v18.16b
+ add w15,w15,w20
+ ushr v1.4s,v20.4s,#20
+ add w16,w16,w21
+ ushr v5.4s,v21.4s,#20
+ eor w9,w9,w13
+ ushr v17.4s,v22.4s,#20
+ eor w10,w10,w14
+ sli v1.4s,v20.4s,#12
+ eor w11,w11,w15
+ sli v5.4s,v21.4s,#12
+ eor w12,w12,w16
+ sli v17.4s,v22.4s,#12
+ ror w9,w9,#20
+ add v0.4s,v0.4s,v1.4s
+ ror w10,w10,#20
+ add v4.4s,v4.4s,v5.4s
+ ror w11,w11,#20
+ add v16.4s,v16.4s,v17.4s
+ ror w12,w12,#20
+ eor v20.16b,v3.16b,v0.16b
+ add w5,w5,w9
+ eor v21.16b,v7.16b,v4.16b
+ add w6,w6,w10
+ eor v22.16b,v19.16b,v16.16b
+ add w7,w7,w11
+ ushr v3.4s,v20.4s,#24
+ add w8,w8,w12
+ ushr v7.4s,v21.4s,#24
+ eor w17,w17,w5
+ ushr v19.4s,v22.4s,#24
+ eor w19,w19,w6
+ sli v3.4s,v20.4s,#8
+ eor w20,w20,w7
+ sli v7.4s,v21.4s,#8
+ eor w21,w21,w8
+ sli v19.4s,v22.4s,#8
+ ror w17,w17,#24
+ add v2.4s,v2.4s,v3.4s
+ ror w19,w19,#24
+ add v6.4s,v6.4s,v7.4s
+ ror w20,w20,#24
+ add v18.4s,v18.4s,v19.4s
+ ror w21,w21,#24
+ eor v20.16b,v1.16b,v2.16b
+ add w13,w13,w17
+ eor v21.16b,v5.16b,v6.16b
+ add w14,w14,w19
+ eor v22.16b,v17.16b,v18.16b
+ add w15,w15,w20
+ ushr v1.4s,v20.4s,#25
+ add w16,w16,w21
+ ushr v5.4s,v21.4s,#25
+ eor w9,w9,w13
+ ushr v17.4s,v22.4s,#25
+ eor w10,w10,w14
+ sli v1.4s,v20.4s,#7
+ eor w11,w11,w15
+ sli v5.4s,v21.4s,#7
+ eor w12,w12,w16
+ sli v17.4s,v22.4s,#7
+ ror w9,w9,#25
+ ext v2.16b,v2.16b,v2.16b,#8
+ ror w10,w10,#25
+ ext v6.16b,v6.16b,v6.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w10
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w11
+ add v16.4s,v16.4s,v17.4s
+ add w7,w7,w12
+ eor v3.16b,v3.16b,v0.16b
+ add w8,w8,w9
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w5
+ eor v19.16b,v19.16b,v16.16b
+ eor w17,w17,w6
+ rev32 v3.8h,v3.8h
+ eor w19,w19,w7
+ rev32 v7.8h,v7.8h
+ eor w20,w20,w8
+ rev32 v19.8h,v19.8h
+ ror w21,w21,#16
+ add v2.4s,v2.4s,v3.4s
+ ror w17,w17,#16
+ add v6.4s,v6.4s,v7.4s
+ ror w19,w19,#16
+ add v18.4s,v18.4s,v19.4s
+ ror w20,w20,#16
+ eor v20.16b,v1.16b,v2.16b
+ add w15,w15,w21
+ eor v21.16b,v5.16b,v6.16b
+ add w16,w16,w17
+ eor v22.16b,v17.16b,v18.16b
+ add w13,w13,w19
+ ushr v1.4s,v20.4s,#20
+ add w14,w14,w20
+ ushr v5.4s,v21.4s,#20
+ eor w10,w10,w15
+ ushr v17.4s,v22.4s,#20
+ eor w11,w11,w16
+ sli v1.4s,v20.4s,#12
+ eor w12,w12,w13
+ sli v5.4s,v21.4s,#12
+ eor w9,w9,w14
+ sli v17.4s,v22.4s,#12
+ ror w10,w10,#20
+ add v0.4s,v0.4s,v1.4s
+ ror w11,w11,#20
+ add v4.4s,v4.4s,v5.4s
+ ror w12,w12,#20
+ add v16.4s,v16.4s,v17.4s
+ ror w9,w9,#20
+ eor v20.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v21.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v22.16b,v19.16b,v16.16b
+ add w7,w7,w12
+ ushr v3.4s,v20.4s,#24
+ add w8,w8,w9
+ ushr v7.4s,v21.4s,#24
+ eor w21,w21,w5
+ ushr v19.4s,v22.4s,#24
+ eor w17,w17,w6
+ sli v3.4s,v20.4s,#8
+ eor w19,w19,w7
+ sli v7.4s,v21.4s,#8
+ eor w20,w20,w8
+ sli v19.4s,v22.4s,#8
+ ror w21,w21,#24
+ add v2.4s,v2.4s,v3.4s
+ ror w17,w17,#24
+ add v6.4s,v6.4s,v7.4s
+ ror w19,w19,#24
+ add v18.4s,v18.4s,v19.4s
+ ror w20,w20,#24
+ eor v20.16b,v1.16b,v2.16b
+ add w15,w15,w21
+ eor v21.16b,v5.16b,v6.16b
+ add w16,w16,w17
+ eor v22.16b,v17.16b,v18.16b
+ add w13,w13,w19
+ ushr v1.4s,v20.4s,#25
+ add w14,w14,w20
+ ushr v5.4s,v21.4s,#25
+ eor w10,w10,w15
+ ushr v17.4s,v22.4s,#25
+ eor w11,w11,w16
+ sli v1.4s,v20.4s,#7
+ eor w12,w12,w13
+ sli v5.4s,v21.4s,#7
+ eor w9,w9,w14
+ sli v17.4s,v22.4s,#7
+ ror w10,w10,#25
+ ext v2.16b,v2.16b,v2.16b,#8
+ ror w11,w11,#25
+ ext v6.16b,v6.16b,v6.16b,#8
+ ror w12,w12,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ cbnz x4,Loop_neon
+
+ add w5,w5,w22 // accumulate key block
+ add v0.4s,v0.4s,v24.4s
+ add x6,x6,x22,lsr#32
+ add v4.4s,v4.4s,v24.4s
+ add w7,w7,w23
+ add v16.4s,v16.4s,v24.4s
+ add x8,x8,x23,lsr#32
+ add v2.4s,v2.4s,v26.4s
+ add w9,w9,w24
+ add v6.4s,v6.4s,v26.4s
+ add x10,x10,x24,lsr#32
+ add v18.4s,v18.4s,v26.4s
+ add w11,w11,w25
+ add v3.4s,v3.4s,v27.4s
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add v7.4s,v7.4s,v28.4s
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add v19.4s,v19.4s,v29.4s
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add v1.4s,v1.4s,v25.4s
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add v5.4s,v5.4s,v25.4s
+ add x21,x21,x30,lsr#32
+ add v17.4s,v17.4s,v25.4s
+
+ b.lo Ltail_neon
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor v0.16b,v0.16b,v20.16b
+ eor x15,x15,x16
+ eor v1.16b,v1.16b,v21.16b
+ eor x17,x17,x19
+ eor v2.16b,v2.16b,v22.16b
+ eor x20,x20,x21
+ eor v3.16b,v3.16b,v23.16b
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#4 // increment counter
+ stp x9,x11,[x0,#16]
+ add v27.4s,v27.4s,v31.4s // += 4
+ stp x13,x15,[x0,#32]
+ add v28.4s,v28.4s,v31.4s
+ stp x17,x20,[x0,#48]
+ add v29.4s,v29.4s,v31.4s
+ add x0,x0,#64
+
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+
+ eor v4.16b,v4.16b,v20.16b
+ eor v5.16b,v5.16b,v21.16b
+ eor v6.16b,v6.16b,v22.16b
+ eor v7.16b,v7.16b,v23.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+ eor v16.16b,v16.16b,v0.16b
+ eor v17.16b,v17.16b,v1.16b
+ eor v18.16b,v18.16b,v2.16b
+ eor v19.16b,v19.16b,v3.16b
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+ b.hi Loop_outer_neon
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+Ltail_neon:
+ add x2,x2,#256
+ cmp x2,#64
+ b.lo Less_than_64
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#4 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ b.eq Ldone_neon
+ sub x2,x2,#64
+ cmp x2,#64
+ b.lo Less_than_128
+
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor v0.16b,v0.16b,v20.16b
+ eor v1.16b,v1.16b,v21.16b
+ eor v2.16b,v2.16b,v22.16b
+ eor v3.16b,v3.16b,v23.16b
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+ b.eq Ldone_neon
+ sub x2,x2,#64
+ cmp x2,#64
+ b.lo Less_than_192
+
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor v4.16b,v4.16b,v20.16b
+ eor v5.16b,v5.16b,v21.16b
+ eor v6.16b,v6.16b,v22.16b
+ eor v7.16b,v7.16b,v23.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+ b.eq Ldone_neon
+ sub x2,x2,#64
+
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
+ b Last_neon
+
+Less_than_128:
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
+ b Last_neon
+Less_than_192:
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
+ b Last_neon
+
+.align 4
+Last_neon:
+ sub x0,x0,#1
+ add x1,x1,x2
+ add x0,x0,x2
+ add x4,sp,x2
+ neg x2,x2
+
+Loop_tail_neon:
+ ldrb w10,[x1,x2]
+ ldrb w11,[x4,x2]
+ add x2,x2,#1
+ eor w10,w10,w11
+ strb w10,[x0,x2]
+ cbnz x2,Loop_tail_neon
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+Ldone_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.align 5
+ChaCha20_512_neon:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adrp x5,Lsigma@PAGE
+ add x5,x5,Lsigma@PAGEOFF
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+L512_or_more_neon:
+ sub sp,sp,#128+64
+
+ ldp x22,x23,[x5] // load sigma
+ ld1 {v24.4s},[x5],#16
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ld1 {v25.4s,v26.4s},[x3]
+ ldp x28,x30,[x4] // load counter
+ ld1 {v27.4s},[x4]
+ ld1 {v31.4s},[x5]
+#ifdef __AARCH64EB__
+ rev64 v24.4s,v24.4s
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+ add v27.4s,v27.4s,v31.4s // += 1
+ stp q24,q25,[sp,#0] // off-load key block, invariant part
+ add v27.4s,v27.4s,v31.4s // not typo
+ str q26,[sp,#32]
+ add v28.4s,v27.4s,v31.4s
+ add v29.4s,v28.4s,v31.4s
+ add v30.4s,v29.4s,v31.4s
+ shl v31.4s,v31.4s,#2 // 1 -> 4
+
+ stp d8,d9,[sp,#128+0] // meet ABI requirements
+ stp d10,d11,[sp,#128+16]
+ stp d12,d13,[sp,#128+32]
+ stp d14,d15,[sp,#128+48]
+
+ sub x2,x2,#512 // not typo
+
+Loop_outer_512_neon:
+ mov v0.16b,v24.16b
+ mov v4.16b,v24.16b
+ mov v8.16b,v24.16b
+ mov v12.16b,v24.16b
+ mov v16.16b,v24.16b
+ mov v20.16b,v24.16b
+ mov v1.16b,v25.16b
+ mov w5,w22 // unpack key block
+ mov v5.16b,v25.16b
+ lsr x6,x22,#32
+ mov v9.16b,v25.16b
+ mov w7,w23
+ mov v13.16b,v25.16b
+ lsr x8,x23,#32
+ mov v17.16b,v25.16b
+ mov w9,w24
+ mov v21.16b,v25.16b
+ lsr x10,x24,#32
+ mov v3.16b,v27.16b
+ mov w11,w25
+ mov v7.16b,v28.16b
+ lsr x12,x25,#32
+ mov v11.16b,v29.16b
+ mov w13,w26
+ mov v15.16b,v30.16b
+ lsr x14,x26,#32
+ mov v2.16b,v26.16b
+ mov w15,w27
+ mov v6.16b,v26.16b
+ lsr x16,x27,#32
+ add v19.4s,v3.4s,v31.4s // +4
+ mov w17,w28
+ add v23.4s,v7.4s,v31.4s // +4
+ lsr x19,x28,#32
+ mov v10.16b,v26.16b
+ mov w20,w30
+ mov v14.16b,v26.16b
+ lsr x21,x30,#32
+ mov v18.16b,v26.16b
+ stp q27,q28,[sp,#48] // off-load key block, variable part
+ mov v22.16b,v26.16b
+ str q29,[sp,#80]
+
+ mov x4,#5
+ subs x2,x2,#512
+Loop_upper_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v11.16b,v11.16b,v11.16b,#12
+ ext v15.16b,v15.16b,v15.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v23.16b,v23.16b,v23.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v9.16b,v9.16b,v9.16b,#4
+ ext v13.16b,v13.16b,v13.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ ext v21.16b,v21.16b,v21.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v11.16b,v11.16b,v11.16b,#4
+ ext v15.16b,v15.16b,v15.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v23.16b,v23.16b,v23.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v9.16b,v9.16b,v9.16b,#12
+ ext v13.16b,v13.16b,v13.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ ext v21.16b,v21.16b,v21.16b,#12
+ cbnz x4,Loop_upper_neon
+
+ add w5,w5,w22 // accumulate key block
+ add x6,x6,x22,lsr#32
+ add w7,w7,w23
+ add x8,x8,x23,lsr#32
+ add w9,w9,w24
+ add x10,x10,x24,lsr#32
+ add w11,w11,w25
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add x21,x21,x30,lsr#32
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#1 // increment counter
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ stp x9,x11,[x0,#16]
+ mov w7,w23
+ lsr x8,x23,#32
+ stp x13,x15,[x0,#32]
+ mov w9,w24
+ lsr x10,x24,#32
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ mov w11,w25
+ lsr x12,x25,#32
+ mov w13,w26
+ lsr x14,x26,#32
+ mov w15,w27
+ lsr x16,x27,#32
+ mov w17,w28
+ lsr x19,x28,#32
+ mov w20,w30
+ lsr x21,x30,#32
+
+ mov x4,#5
+Loop_lower_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v11.16b,v11.16b,v11.16b,#12
+ ext v15.16b,v15.16b,v15.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v23.16b,v23.16b,v23.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v9.16b,v9.16b,v9.16b,#4
+ ext v13.16b,v13.16b,v13.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ ext v21.16b,v21.16b,v21.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v11.16b,v11.16b,v11.16b,#4
+ ext v15.16b,v15.16b,v15.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v23.16b,v23.16b,v23.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v9.16b,v9.16b,v9.16b,#12
+ ext v13.16b,v13.16b,v13.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ ext v21.16b,v21.16b,v21.16b,#12
+ cbnz x4,Loop_lower_neon
+
+ add w5,w5,w22 // accumulate key block
+ ldp q24,q25,[sp,#0]
+ add x6,x6,x22,lsr#32
+ ldp q26,q27,[sp,#32]
+ add w7,w7,w23
+ ldp q28,q29,[sp,#64]
+ add x8,x8,x23,lsr#32
+ add v0.4s,v0.4s,v24.4s
+ add w9,w9,w24
+ add v4.4s,v4.4s,v24.4s
+ add x10,x10,x24,lsr#32
+ add v8.4s,v8.4s,v24.4s
+ add w11,w11,w25
+ add v12.4s,v12.4s,v24.4s
+ add x12,x12,x25,lsr#32
+ add v16.4s,v16.4s,v24.4s
+ add w13,w13,w26
+ add v20.4s,v20.4s,v24.4s
+ add x14,x14,x26,lsr#32
+ add v2.4s,v2.4s,v26.4s
+ add w15,w15,w27
+ add v6.4s,v6.4s,v26.4s
+ add x16,x16,x27,lsr#32
+ add v10.4s,v10.4s,v26.4s
+ add w17,w17,w28
+ add v14.4s,v14.4s,v26.4s
+ add x19,x19,x28,lsr#32
+ add v18.4s,v18.4s,v26.4s
+ add w20,w20,w30
+ add v22.4s,v22.4s,v26.4s
+ add x21,x21,x30,lsr#32
+ add v19.4s,v19.4s,v31.4s // +4
+ add x5,x5,x6,lsl#32 // pack
+ add v23.4s,v23.4s,v31.4s // +4
+ add x7,x7,x8,lsl#32
+ add v3.4s,v3.4s,v27.4s
+ ldp x6,x8,[x1,#0] // load input
+ add v7.4s,v7.4s,v28.4s
+ add x9,x9,x10,lsl#32
+ add v11.4s,v11.4s,v29.4s
+ add x11,x11,x12,lsl#32
+ add v15.4s,v15.4s,v30.4s
+ ldp x10,x12,[x1,#16]
+ add v19.4s,v19.4s,v27.4s
+ add x13,x13,x14,lsl#32
+ add v23.4s,v23.4s,v28.4s
+ add x15,x15,x16,lsl#32
+ add v1.4s,v1.4s,v25.4s
+ ldp x14,x16,[x1,#32]
+ add v5.4s,v5.4s,v25.4s
+ add x17,x17,x19,lsl#32
+ add v9.4s,v9.4s,v25.4s
+ add x20,x20,x21,lsl#32
+ add v13.4s,v13.4s,v25.4s
+ ldp x19,x21,[x1,#48]
+ add v17.4s,v17.4s,v25.4s
+ add x1,x1,#64
+ add v21.4s,v21.4s,v25.4s
+
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor v0.16b,v0.16b,v24.16b
+ eor x15,x15,x16
+ eor v1.16b,v1.16b,v25.16b
+ eor x17,x17,x19
+ eor v2.16b,v2.16b,v26.16b
+ eor x20,x20,x21
+ eor v3.16b,v3.16b,v27.16b
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#7 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+ eor v4.16b,v4.16b,v24.16b
+ eor v5.16b,v5.16b,v25.16b
+ eor v6.16b,v6.16b,v26.16b
+ eor v7.16b,v7.16b,v27.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+ eor v8.16b,v8.16b,v0.16b
+ ldp q24,q25,[sp,#0]
+ eor v9.16b,v9.16b,v1.16b
+ ldp q26,q27,[sp,#32]
+ eor v10.16b,v10.16b,v2.16b
+ eor v11.16b,v11.16b,v3.16b
+ st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
+
+ ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
+ eor v12.16b,v12.16b,v4.16b
+ eor v13.16b,v13.16b,v5.16b
+ eor v14.16b,v14.16b,v6.16b
+ eor v15.16b,v15.16b,v7.16b
+ st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
+
+ ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
+ eor v16.16b,v16.16b,v8.16b
+ eor v17.16b,v17.16b,v9.16b
+ eor v18.16b,v18.16b,v10.16b
+ eor v19.16b,v19.16b,v11.16b
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+ shl v0.4s,v31.4s,#1 // 4 -> 8
+ eor v20.16b,v20.16b,v12.16b
+ eor v21.16b,v21.16b,v13.16b
+ eor v22.16b,v22.16b,v14.16b
+ eor v23.16b,v23.16b,v15.16b
+ st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
+
+ add v27.4s,v27.4s,v0.4s // += 8
+ add v28.4s,v28.4s,v0.4s
+ add v29.4s,v29.4s,v0.4s
+ add v30.4s,v30.4s,v0.4s
+
+ b.hs Loop_outer_512_neon
+
+ adds x2,x2,#512
+ ushr v0.4s,v31.4s,#2 // 4 -> 1
+
+ ldp d8,d9,[sp,#128+0] // meet ABI requirements
+ ldp d10,d11,[sp,#128+16]
+ ldp d12,d13,[sp,#128+32]
+ ldp d14,d15,[sp,#128+48]
+
+ stp q24,q31,[sp,#0] // wipe off-load area
+ stp q24,q31,[sp,#32]
+ stp q24,q31,[sp,#64]
+
+ b.eq Ldone_512_neon
+
+ cmp x2,#192
+ sub v27.4s,v27.4s,v0.4s // -= 1
+ sub v28.4s,v28.4s,v0.4s
+ sub v29.4s,v29.4s,v0.4s
+ add sp,sp,#128
+ b.hs Loop_outer_neon
+
+ eor v25.16b,v25.16b,v25.16b
+ eor v26.16b,v26.16b,v26.16b
+ eor v27.16b,v27.16b,v27.16b
+ eor v28.16b,v28.16b,v28.16b
+ eor v29.16b,v29.16b,v29.16b
+ eor v30.16b,v30.16b,v30.16b
+ b Loop_outer
+
+Ldone_512_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#128+64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/crypto/chacha-armv8-linux.S b/gen/crypto/chacha-armv8-linux.S
new file mode 100644
index 0000000..55fa583
--- /dev/null
+++ b/gen/crypto/chacha-armv8-linux.S
@@ -0,0 +1,1968 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+.section .rodata
+
+.align 5
+.Lsigma:
+.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
+.Lone:
+.long 1,0,0,0
+.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+
+.text
+
+.globl ChaCha20_ctr32_nohw
+.hidden ChaCha20_ctr32_nohw
+.type ChaCha20_ctr32_nohw,%function
+.align 5
+ChaCha20_ctr32_nohw:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adrp x5,.Lsigma
+ add x5,x5,:lo12:.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#64
+
+ ldp x22,x23,[x5] // load sigma
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ldp x28,x30,[x4] // load counter
+#ifdef __AARCH64EB__
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+
+.Loop_outer:
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ mov w7,w23
+ lsr x8,x23,#32
+ mov w9,w24
+ lsr x10,x24,#32
+ mov w11,w25
+ lsr x12,x25,#32
+ mov w13,w26
+ lsr x14,x26,#32
+ mov w15,w27
+ lsr x16,x27,#32
+ mov w17,w28
+ lsr x19,x28,#32
+ mov w20,w30
+ lsr x21,x30,#32
+
+ mov x4,#10
+ subs x2,x2,#64
+.Loop:
+ sub x4,x4,#1
+ add w5,w5,w9
+ add w6,w6,w10
+ add w7,w7,w11
+ add w8,w8,w12
+ eor w17,w17,w5
+ eor w19,w19,w6
+ eor w20,w20,w7
+ eor w21,w21,w8
+ ror w17,w17,#16
+ ror w19,w19,#16
+ ror w20,w20,#16
+ ror w21,w21,#16
+ add w13,w13,w17
+ add w14,w14,w19
+ add w15,w15,w20
+ add w16,w16,w21
+ eor w9,w9,w13
+ eor w10,w10,w14
+ eor w11,w11,w15
+ eor w12,w12,w16
+ ror w9,w9,#20
+ ror w10,w10,#20
+ ror w11,w11,#20
+ ror w12,w12,#20
+ add w5,w5,w9
+ add w6,w6,w10
+ add w7,w7,w11
+ add w8,w8,w12
+ eor w17,w17,w5
+ eor w19,w19,w6
+ eor w20,w20,w7
+ eor w21,w21,w8
+ ror w17,w17,#24
+ ror w19,w19,#24
+ ror w20,w20,#24
+ ror w21,w21,#24
+ add w13,w13,w17
+ add w14,w14,w19
+ add w15,w15,w20
+ add w16,w16,w21
+ eor w9,w9,w13
+ eor w10,w10,w14
+ eor w11,w11,w15
+ eor w12,w12,w16
+ ror w9,w9,#25
+ ror w10,w10,#25
+ ror w11,w11,#25
+ ror w12,w12,#25
+ add w5,w5,w10
+ add w6,w6,w11
+ add w7,w7,w12
+ add w8,w8,w9
+ eor w21,w21,w5
+ eor w17,w17,w6
+ eor w19,w19,w7
+ eor w20,w20,w8
+ ror w21,w21,#16
+ ror w17,w17,#16
+ ror w19,w19,#16
+ ror w20,w20,#16
+ add w15,w15,w21
+ add w16,w16,w17
+ add w13,w13,w19
+ add w14,w14,w20
+ eor w10,w10,w15
+ eor w11,w11,w16
+ eor w12,w12,w13
+ eor w9,w9,w14
+ ror w10,w10,#20
+ ror w11,w11,#20
+ ror w12,w12,#20
+ ror w9,w9,#20
+ add w5,w5,w10
+ add w6,w6,w11
+ add w7,w7,w12
+ add w8,w8,w9
+ eor w21,w21,w5
+ eor w17,w17,w6
+ eor w19,w19,w7
+ eor w20,w20,w8
+ ror w21,w21,#24
+ ror w17,w17,#24
+ ror w19,w19,#24
+ ror w20,w20,#24
+ add w15,w15,w21
+ add w16,w16,w17
+ add w13,w13,w19
+ add w14,w14,w20
+ eor w10,w10,w15
+ eor w11,w11,w16
+ eor w12,w12,w13
+ eor w9,w9,w14
+ ror w10,w10,#25
+ ror w11,w11,#25
+ ror w12,w12,#25
+ ror w9,w9,#25
+ cbnz x4,.Loop
+
+ add w5,w5,w22 // accumulate key block
+ add x6,x6,x22,lsr#32
+ add w7,w7,w23
+ add x8,x8,x23,lsr#32
+ add w9,w9,w24
+ add x10,x10,x24,lsr#32
+ add w11,w11,w25
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add x21,x21,x30,lsr#32
+
+ b.lo .Ltail
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#1 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+
+ b.hi .Loop_outer
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.align 4
+.Ltail:
+ add x2,x2,#64
+.Less_than_64:
+ sub x0,x0,#1
+ add x1,x1,x2
+ add x0,x0,x2
+ add x4,sp,x2
+ neg x2,x2
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ stp x5,x7,[sp,#0]
+ stp x9,x11,[sp,#16]
+ stp x13,x15,[sp,#32]
+ stp x17,x20,[sp,#48]
+
+.Loop_tail:
+ ldrb w10,[x1,x2]
+ ldrb w11,[x4,x2]
+ add x2,x2,#1
+ eor w10,w10,w11
+ strb w10,[x0,x2]
+ cbnz x2,.Loop_tail
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw
+
+.globl ChaCha20_ctr32_neon
+.hidden ChaCha20_ctr32_neon
+.type ChaCha20_ctr32_neon,%function
+.align 5
+ChaCha20_ctr32_neon:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adrp x5,.Lsigma
+ add x5,x5,:lo12:.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ cmp x2,#512
+ b.hs .L512_or_more_neon
+
+ sub sp,sp,#64
+
+ ldp x22,x23,[x5] // load sigma
+ ld1 {v24.4s},[x5],#16
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ld1 {v25.4s,v26.4s},[x3]
+ ldp x28,x30,[x4] // load counter
+ ld1 {v27.4s},[x4]
+ ld1 {v31.4s},[x5]
+#ifdef __AARCH64EB__
+ rev64 v24.4s,v24.4s
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+ add v27.4s,v27.4s,v31.4s // += 1
+ add v28.4s,v27.4s,v31.4s
+ add v29.4s,v28.4s,v31.4s
+ shl v31.4s,v31.4s,#2 // 1 -> 4
+
+.Loop_outer_neon:
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ mov v0.16b,v24.16b
+ mov w7,w23
+ lsr x8,x23,#32
+ mov v4.16b,v24.16b
+ mov w9,w24
+ lsr x10,x24,#32
+ mov v16.16b,v24.16b
+ mov w11,w25
+ mov v1.16b,v25.16b
+ lsr x12,x25,#32
+ mov v5.16b,v25.16b
+ mov w13,w26
+ mov v17.16b,v25.16b
+ lsr x14,x26,#32
+ mov v3.16b,v27.16b
+ mov w15,w27
+ mov v7.16b,v28.16b
+ lsr x16,x27,#32
+ mov v19.16b,v29.16b
+ mov w17,w28
+ mov v2.16b,v26.16b
+ lsr x19,x28,#32
+ mov v6.16b,v26.16b
+ mov w20,w30
+ mov v18.16b,v26.16b
+ lsr x21,x30,#32
+
+ mov x4,#10
+ subs x2,x2,#256
+.Loop_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v16.4s,v16.4s,v17.4s
+ add w7,w7,w11
+ eor v3.16b,v3.16b,v0.16b
+ add w8,w8,w12
+ eor v7.16b,v7.16b,v4.16b
+ eor w17,w17,w5
+ eor v19.16b,v19.16b,v16.16b
+ eor w19,w19,w6
+ rev32 v3.8h,v3.8h
+ eor w20,w20,w7
+ rev32 v7.8h,v7.8h
+ eor w21,w21,w8
+ rev32 v19.8h,v19.8h
+ ror w17,w17,#16
+ add v2.4s,v2.4s,v3.4s
+ ror w19,w19,#16
+ add v6.4s,v6.4s,v7.4s
+ ror w20,w20,#16
+ add v18.4s,v18.4s,v19.4s
+ ror w21,w21,#16
+ eor v20.16b,v1.16b,v2.16b
+ add w13,w13,w17
+ eor v21.16b,v5.16b,v6.16b
+ add w14,w14,w19
+ eor v22.16b,v17.16b,v18.16b
+ add w15,w15,w20
+ ushr v1.4s,v20.4s,#20
+ add w16,w16,w21
+ ushr v5.4s,v21.4s,#20
+ eor w9,w9,w13
+ ushr v17.4s,v22.4s,#20
+ eor w10,w10,w14
+ sli v1.4s,v20.4s,#12
+ eor w11,w11,w15
+ sli v5.4s,v21.4s,#12
+ eor w12,w12,w16
+ sli v17.4s,v22.4s,#12
+ ror w9,w9,#20
+ add v0.4s,v0.4s,v1.4s
+ ror w10,w10,#20
+ add v4.4s,v4.4s,v5.4s
+ ror w11,w11,#20
+ add v16.4s,v16.4s,v17.4s
+ ror w12,w12,#20
+ eor v20.16b,v3.16b,v0.16b
+ add w5,w5,w9
+ eor v21.16b,v7.16b,v4.16b
+ add w6,w6,w10
+ eor v22.16b,v19.16b,v16.16b
+ add w7,w7,w11
+ ushr v3.4s,v20.4s,#24
+ add w8,w8,w12
+ ushr v7.4s,v21.4s,#24
+ eor w17,w17,w5
+ ushr v19.4s,v22.4s,#24
+ eor w19,w19,w6
+ sli v3.4s,v20.4s,#8
+ eor w20,w20,w7
+ sli v7.4s,v21.4s,#8
+ eor w21,w21,w8
+ sli v19.4s,v22.4s,#8
+ ror w17,w17,#24
+ add v2.4s,v2.4s,v3.4s
+ ror w19,w19,#24
+ add v6.4s,v6.4s,v7.4s
+ ror w20,w20,#24
+ add v18.4s,v18.4s,v19.4s
+ ror w21,w21,#24
+ eor v20.16b,v1.16b,v2.16b
+ add w13,w13,w17
+ eor v21.16b,v5.16b,v6.16b
+ add w14,w14,w19
+ eor v22.16b,v17.16b,v18.16b
+ add w15,w15,w20
+ ushr v1.4s,v20.4s,#25
+ add w16,w16,w21
+ ushr v5.4s,v21.4s,#25
+ eor w9,w9,w13
+ ushr v17.4s,v22.4s,#25
+ eor w10,w10,w14
+ sli v1.4s,v20.4s,#7
+ eor w11,w11,w15
+ sli v5.4s,v21.4s,#7
+ eor w12,w12,w16
+ sli v17.4s,v22.4s,#7
+ ror w9,w9,#25
+ ext v2.16b,v2.16b,v2.16b,#8
+ ror w10,w10,#25
+ ext v6.16b,v6.16b,v6.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w10
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w11
+ add v16.4s,v16.4s,v17.4s
+ add w7,w7,w12
+ eor v3.16b,v3.16b,v0.16b
+ add w8,w8,w9
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w5
+ eor v19.16b,v19.16b,v16.16b
+ eor w17,w17,w6
+ rev32 v3.8h,v3.8h
+ eor w19,w19,w7
+ rev32 v7.8h,v7.8h
+ eor w20,w20,w8
+ rev32 v19.8h,v19.8h
+ ror w21,w21,#16
+ add v2.4s,v2.4s,v3.4s
+ ror w17,w17,#16
+ add v6.4s,v6.4s,v7.4s
+ ror w19,w19,#16
+ add v18.4s,v18.4s,v19.4s
+ ror w20,w20,#16
+ eor v20.16b,v1.16b,v2.16b
+ add w15,w15,w21
+ eor v21.16b,v5.16b,v6.16b
+ add w16,w16,w17
+ eor v22.16b,v17.16b,v18.16b
+ add w13,w13,w19
+ ushr v1.4s,v20.4s,#20
+ add w14,w14,w20
+ ushr v5.4s,v21.4s,#20
+ eor w10,w10,w15
+ ushr v17.4s,v22.4s,#20
+ eor w11,w11,w16
+ sli v1.4s,v20.4s,#12
+ eor w12,w12,w13
+ sli v5.4s,v21.4s,#12
+ eor w9,w9,w14
+ sli v17.4s,v22.4s,#12
+ ror w10,w10,#20
+ add v0.4s,v0.4s,v1.4s
+ ror w11,w11,#20
+ add v4.4s,v4.4s,v5.4s
+ ror w12,w12,#20
+ add v16.4s,v16.4s,v17.4s
+ ror w9,w9,#20
+ eor v20.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v21.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v22.16b,v19.16b,v16.16b
+ add w7,w7,w12
+ ushr v3.4s,v20.4s,#24
+ add w8,w8,w9
+ ushr v7.4s,v21.4s,#24
+ eor w21,w21,w5
+ ushr v19.4s,v22.4s,#24
+ eor w17,w17,w6
+ sli v3.4s,v20.4s,#8
+ eor w19,w19,w7
+ sli v7.4s,v21.4s,#8
+ eor w20,w20,w8
+ sli v19.4s,v22.4s,#8
+ ror w21,w21,#24
+ add v2.4s,v2.4s,v3.4s
+ ror w17,w17,#24
+ add v6.4s,v6.4s,v7.4s
+ ror w19,w19,#24
+ add v18.4s,v18.4s,v19.4s
+ ror w20,w20,#24
+ eor v20.16b,v1.16b,v2.16b
+ add w15,w15,w21
+ eor v21.16b,v5.16b,v6.16b
+ add w16,w16,w17
+ eor v22.16b,v17.16b,v18.16b
+ add w13,w13,w19
+ ushr v1.4s,v20.4s,#25
+ add w14,w14,w20
+ ushr v5.4s,v21.4s,#25
+ eor w10,w10,w15
+ ushr v17.4s,v22.4s,#25
+ eor w11,w11,w16
+ sli v1.4s,v20.4s,#7
+ eor w12,w12,w13
+ sli v5.4s,v21.4s,#7
+ eor w9,w9,w14
+ sli v17.4s,v22.4s,#7
+ ror w10,w10,#25
+ ext v2.16b,v2.16b,v2.16b,#8
+ ror w11,w11,#25
+ ext v6.16b,v6.16b,v6.16b,#8
+ ror w12,w12,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ cbnz x4,.Loop_neon
+
+ add w5,w5,w22 // accumulate key block
+ add v0.4s,v0.4s,v24.4s
+ add x6,x6,x22,lsr#32
+ add v4.4s,v4.4s,v24.4s
+ add w7,w7,w23
+ add v16.4s,v16.4s,v24.4s
+ add x8,x8,x23,lsr#32
+ add v2.4s,v2.4s,v26.4s
+ add w9,w9,w24
+ add v6.4s,v6.4s,v26.4s
+ add x10,x10,x24,lsr#32
+ add v18.4s,v18.4s,v26.4s
+ add w11,w11,w25
+ add v3.4s,v3.4s,v27.4s
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add v7.4s,v7.4s,v28.4s
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add v19.4s,v19.4s,v29.4s
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add v1.4s,v1.4s,v25.4s
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add v5.4s,v5.4s,v25.4s
+ add x21,x21,x30,lsr#32
+ add v17.4s,v17.4s,v25.4s
+
+ b.lo .Ltail_neon
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor v0.16b,v0.16b,v20.16b
+ eor x15,x15,x16
+ eor v1.16b,v1.16b,v21.16b
+ eor x17,x17,x19
+ eor v2.16b,v2.16b,v22.16b
+ eor x20,x20,x21
+ eor v3.16b,v3.16b,v23.16b
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#4 // increment counter
+ stp x9,x11,[x0,#16]
+ add v27.4s,v27.4s,v31.4s // += 4
+ stp x13,x15,[x0,#32]
+ add v28.4s,v28.4s,v31.4s
+ stp x17,x20,[x0,#48]
+ add v29.4s,v29.4s,v31.4s
+ add x0,x0,#64
+
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+
+ eor v4.16b,v4.16b,v20.16b
+ eor v5.16b,v5.16b,v21.16b
+ eor v6.16b,v6.16b,v22.16b
+ eor v7.16b,v7.16b,v23.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+ eor v16.16b,v16.16b,v0.16b
+ eor v17.16b,v17.16b,v1.16b
+ eor v18.16b,v18.16b,v2.16b
+ eor v19.16b,v19.16b,v3.16b
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+ b.hi .Loop_outer_neon
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.Ltail_neon:
+ add x2,x2,#256
+ cmp x2,#64
+ b.lo .Less_than_64
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#4 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ b.eq .Ldone_neon
+ sub x2,x2,#64
+ cmp x2,#64
+ b.lo .Less_than_128
+
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor v0.16b,v0.16b,v20.16b
+ eor v1.16b,v1.16b,v21.16b
+ eor v2.16b,v2.16b,v22.16b
+ eor v3.16b,v3.16b,v23.16b
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+ b.eq .Ldone_neon
+ sub x2,x2,#64
+ cmp x2,#64
+ b.lo .Less_than_192
+
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor v4.16b,v4.16b,v20.16b
+ eor v5.16b,v5.16b,v21.16b
+ eor v6.16b,v6.16b,v22.16b
+ eor v7.16b,v7.16b,v23.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+ b.eq .Ldone_neon
+ sub x2,x2,#64
+
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
+ b .Last_neon
+
+.Less_than_128:
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
+ b .Last_neon
+.Less_than_192:
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
+ b .Last_neon
+
+.align 4
+.Last_neon:
+ sub x0,x0,#1
+ add x1,x1,x2
+ add x0,x0,x2
+ add x4,sp,x2
+ neg x2,x2
+
+.Loop_tail_neon:
+ ldrb w10,[x1,x2]
+ ldrb w11,[x4,x2]
+ add x2,x2,#1
+ eor w10,w10,w11
+ strb w10,[x0,x2]
+ cbnz x2,.Loop_tail_neon
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+.Ldone_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon
+.type ChaCha20_512_neon,%function
+.align 5
+ChaCha20_512_neon:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adrp x5,.Lsigma
+ add x5,x5,:lo12:.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+.L512_or_more_neon:
+ sub sp,sp,#128+64
+
+ ldp x22,x23,[x5] // load sigma
+ ld1 {v24.4s},[x5],#16
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ld1 {v25.4s,v26.4s},[x3]
+ ldp x28,x30,[x4] // load counter
+ ld1 {v27.4s},[x4]
+ ld1 {v31.4s},[x5]
+#ifdef __AARCH64EB__
+ rev64 v24.4s,v24.4s
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+ add v27.4s,v27.4s,v31.4s // += 1
+ stp q24,q25,[sp,#0] // off-load key block, invariant part
+ add v27.4s,v27.4s,v31.4s // not typo
+ str q26,[sp,#32]
+ add v28.4s,v27.4s,v31.4s
+ add v29.4s,v28.4s,v31.4s
+ add v30.4s,v29.4s,v31.4s
+ shl v31.4s,v31.4s,#2 // 1 -> 4
+
+ stp d8,d9,[sp,#128+0] // meet ABI requirements
+ stp d10,d11,[sp,#128+16]
+ stp d12,d13,[sp,#128+32]
+ stp d14,d15,[sp,#128+48]
+
+ sub x2,x2,#512 // not typo
+
+.Loop_outer_512_neon:
+ mov v0.16b,v24.16b
+ mov v4.16b,v24.16b
+ mov v8.16b,v24.16b
+ mov v12.16b,v24.16b
+ mov v16.16b,v24.16b
+ mov v20.16b,v24.16b
+ mov v1.16b,v25.16b
+ mov w5,w22 // unpack key block
+ mov v5.16b,v25.16b
+ lsr x6,x22,#32
+ mov v9.16b,v25.16b
+ mov w7,w23
+ mov v13.16b,v25.16b
+ lsr x8,x23,#32
+ mov v17.16b,v25.16b
+ mov w9,w24
+ mov v21.16b,v25.16b
+ lsr x10,x24,#32
+ mov v3.16b,v27.16b
+ mov w11,w25
+ mov v7.16b,v28.16b
+ lsr x12,x25,#32
+ mov v11.16b,v29.16b
+ mov w13,w26
+ mov v15.16b,v30.16b
+ lsr x14,x26,#32
+ mov v2.16b,v26.16b
+ mov w15,w27
+ mov v6.16b,v26.16b
+ lsr x16,x27,#32
+ add v19.4s,v3.4s,v31.4s // +4
+ mov w17,w28
+ add v23.4s,v7.4s,v31.4s // +4
+ lsr x19,x28,#32
+ mov v10.16b,v26.16b
+ mov w20,w30
+ mov v14.16b,v26.16b
+ lsr x21,x30,#32
+ mov v18.16b,v26.16b
+ stp q27,q28,[sp,#48] // off-load key block, variable part
+ mov v22.16b,v26.16b
+ str q29,[sp,#80]
+
+ mov x4,#5
+ subs x2,x2,#512
+.Loop_upper_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v11.16b,v11.16b,v11.16b,#12
+ ext v15.16b,v15.16b,v15.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v23.16b,v23.16b,v23.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v9.16b,v9.16b,v9.16b,#4
+ ext v13.16b,v13.16b,v13.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ ext v21.16b,v21.16b,v21.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v11.16b,v11.16b,v11.16b,#4
+ ext v15.16b,v15.16b,v15.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v23.16b,v23.16b,v23.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v9.16b,v9.16b,v9.16b,#12
+ ext v13.16b,v13.16b,v13.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ ext v21.16b,v21.16b,v21.16b,#12
+ cbnz x4,.Loop_upper_neon
+
+ add w5,w5,w22 // accumulate key block
+ add x6,x6,x22,lsr#32
+ add w7,w7,w23
+ add x8,x8,x23,lsr#32
+ add w9,w9,w24
+ add x10,x10,x24,lsr#32
+ add w11,w11,w25
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add x21,x21,x30,lsr#32
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#1 // increment counter
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ stp x9,x11,[x0,#16]
+ mov w7,w23
+ lsr x8,x23,#32
+ stp x13,x15,[x0,#32]
+ mov w9,w24
+ lsr x10,x24,#32
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ mov w11,w25
+ lsr x12,x25,#32
+ mov w13,w26
+ lsr x14,x26,#32
+ mov w15,w27
+ lsr x16,x27,#32
+ mov w17,w28
+ lsr x19,x28,#32
+ mov w20,w30
+ lsr x21,x30,#32
+
+ mov x4,#5
+.Loop_lower_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v11.16b,v11.16b,v11.16b,#12
+ ext v15.16b,v15.16b,v15.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v23.16b,v23.16b,v23.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v9.16b,v9.16b,v9.16b,#4
+ ext v13.16b,v13.16b,v13.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ ext v21.16b,v21.16b,v21.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v11.16b,v11.16b,v11.16b,#4
+ ext v15.16b,v15.16b,v15.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v23.16b,v23.16b,v23.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v9.16b,v9.16b,v9.16b,#12
+ ext v13.16b,v13.16b,v13.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ ext v21.16b,v21.16b,v21.16b,#12
+ cbnz x4,.Loop_lower_neon
+
+ add w5,w5,w22 // accumulate key block
+ ldp q24,q25,[sp,#0]
+ add x6,x6,x22,lsr#32
+ ldp q26,q27,[sp,#32]
+ add w7,w7,w23
+ ldp q28,q29,[sp,#64]
+ add x8,x8,x23,lsr#32
+ add v0.4s,v0.4s,v24.4s
+ add w9,w9,w24
+ add v4.4s,v4.4s,v24.4s
+ add x10,x10,x24,lsr#32
+ add v8.4s,v8.4s,v24.4s
+ add w11,w11,w25
+ add v12.4s,v12.4s,v24.4s
+ add x12,x12,x25,lsr#32
+ add v16.4s,v16.4s,v24.4s
+ add w13,w13,w26
+ add v20.4s,v20.4s,v24.4s
+ add x14,x14,x26,lsr#32
+ add v2.4s,v2.4s,v26.4s
+ add w15,w15,w27
+ add v6.4s,v6.4s,v26.4s
+ add x16,x16,x27,lsr#32
+ add v10.4s,v10.4s,v26.4s
+ add w17,w17,w28
+ add v14.4s,v14.4s,v26.4s
+ add x19,x19,x28,lsr#32
+ add v18.4s,v18.4s,v26.4s
+ add w20,w20,w30
+ add v22.4s,v22.4s,v26.4s
+ add x21,x21,x30,lsr#32
+ add v19.4s,v19.4s,v31.4s // +4
+ add x5,x5,x6,lsl#32 // pack
+ add v23.4s,v23.4s,v31.4s // +4
+ add x7,x7,x8,lsl#32
+ add v3.4s,v3.4s,v27.4s
+ ldp x6,x8,[x1,#0] // load input
+ add v7.4s,v7.4s,v28.4s
+ add x9,x9,x10,lsl#32
+ add v11.4s,v11.4s,v29.4s
+ add x11,x11,x12,lsl#32
+ add v15.4s,v15.4s,v30.4s
+ ldp x10,x12,[x1,#16]
+ add v19.4s,v19.4s,v27.4s
+ add x13,x13,x14,lsl#32
+ add v23.4s,v23.4s,v28.4s
+ add x15,x15,x16,lsl#32
+ add v1.4s,v1.4s,v25.4s
+ ldp x14,x16,[x1,#32]
+ add v5.4s,v5.4s,v25.4s
+ add x17,x17,x19,lsl#32
+ add v9.4s,v9.4s,v25.4s
+ add x20,x20,x21,lsl#32
+ add v13.4s,v13.4s,v25.4s
+ ldp x19,x21,[x1,#48]
+ add v17.4s,v17.4s,v25.4s
+ add x1,x1,#64
+ add v21.4s,v21.4s,v25.4s
+
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor v0.16b,v0.16b,v24.16b
+ eor x15,x15,x16
+ eor v1.16b,v1.16b,v25.16b
+ eor x17,x17,x19
+ eor v2.16b,v2.16b,v26.16b
+ eor x20,x20,x21
+ eor v3.16b,v3.16b,v27.16b
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#7 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+ eor v4.16b,v4.16b,v24.16b
+ eor v5.16b,v5.16b,v25.16b
+ eor v6.16b,v6.16b,v26.16b
+ eor v7.16b,v7.16b,v27.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+ eor v8.16b,v8.16b,v0.16b
+ ldp q24,q25,[sp,#0]
+ eor v9.16b,v9.16b,v1.16b
+ ldp q26,q27,[sp,#32]
+ eor v10.16b,v10.16b,v2.16b
+ eor v11.16b,v11.16b,v3.16b
+ st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
+
+ ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
+ eor v12.16b,v12.16b,v4.16b
+ eor v13.16b,v13.16b,v5.16b
+ eor v14.16b,v14.16b,v6.16b
+ eor v15.16b,v15.16b,v7.16b
+ st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
+
+ ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
+ eor v16.16b,v16.16b,v8.16b
+ eor v17.16b,v17.16b,v9.16b
+ eor v18.16b,v18.16b,v10.16b
+ eor v19.16b,v19.16b,v11.16b
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+ shl v0.4s,v31.4s,#1 // 4 -> 8
+ eor v20.16b,v20.16b,v12.16b
+ eor v21.16b,v21.16b,v13.16b
+ eor v22.16b,v22.16b,v14.16b
+ eor v23.16b,v23.16b,v15.16b
+ st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
+
+ add v27.4s,v27.4s,v0.4s // += 8
+ add v28.4s,v28.4s,v0.4s
+ add v29.4s,v29.4s,v0.4s
+ add v30.4s,v30.4s,v0.4s
+
+ b.hs .Loop_outer_512_neon
+
+ adds x2,x2,#512
+ ushr v0.4s,v31.4s,#2 // 4 -> 1
+
+ ldp d8,d9,[sp,#128+0] // meet ABI requirements
+ ldp d10,d11,[sp,#128+16]
+ ldp d12,d13,[sp,#128+32]
+ ldp d14,d15,[sp,#128+48]
+
+ stp q24,q31,[sp,#0] // wipe off-load area
+ stp q24,q31,[sp,#32]
+ stp q24,q31,[sp,#64]
+
+ b.eq .Ldone_512_neon
+
+ cmp x2,#192
+ sub v27.4s,v27.4s,v0.4s // -= 1
+ sub v28.4s,v28.4s,v0.4s
+ sub v29.4s,v29.4s,v0.4s
+ add sp,sp,#128
+ b.hs .Loop_outer_neon
+
+ eor v25.16b,v25.16b,v25.16b
+ eor v26.16b,v26.16b,v26.16b
+ eor v27.16b,v27.16b,v27.16b
+ eor v28.16b,v28.16b,v28.16b
+ eor v29.16b,v29.16b,v29.16b
+ eor v30.16b,v30.16b,v30.16b
+ b .Loop_outer
+
+.Ldone_512_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#128+64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ChaCha20_512_neon,.-ChaCha20_512_neon
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/crypto/chacha-armv8-win.S b/gen/crypto/chacha-armv8-win.S
new file mode 100644
index 0000000..851ef4d
--- /dev/null
+++ b/gen/crypto/chacha-armv8-win.S
@@ -0,0 +1,1974 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+
+.section .rodata
+
+.align 5
+Lsigma:
+.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
+Lone:
+.long 1,0,0,0
+.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+
+.text
+
+.globl ChaCha20_ctr32_nohw
+
+.def ChaCha20_ctr32_nohw
+ .type 32
+.endef
+.align 5
+ChaCha20_ctr32_nohw:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adrp x5,Lsigma
+ add x5,x5,:lo12:Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#64
+
+ ldp x22,x23,[x5] // load sigma
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ldp x28,x30,[x4] // load counter
+#ifdef __AARCH64EB__
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+
+Loop_outer:
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ mov w7,w23
+ lsr x8,x23,#32
+ mov w9,w24
+ lsr x10,x24,#32
+ mov w11,w25
+ lsr x12,x25,#32
+ mov w13,w26
+ lsr x14,x26,#32
+ mov w15,w27
+ lsr x16,x27,#32
+ mov w17,w28
+ lsr x19,x28,#32
+ mov w20,w30
+ lsr x21,x30,#32
+
+ mov x4,#10
+ subs x2,x2,#64
+Loop:
+ sub x4,x4,#1
+ add w5,w5,w9
+ add w6,w6,w10
+ add w7,w7,w11
+ add w8,w8,w12
+ eor w17,w17,w5
+ eor w19,w19,w6
+ eor w20,w20,w7
+ eor w21,w21,w8
+ ror w17,w17,#16
+ ror w19,w19,#16
+ ror w20,w20,#16
+ ror w21,w21,#16
+ add w13,w13,w17
+ add w14,w14,w19
+ add w15,w15,w20
+ add w16,w16,w21
+ eor w9,w9,w13
+ eor w10,w10,w14
+ eor w11,w11,w15
+ eor w12,w12,w16
+ ror w9,w9,#20
+ ror w10,w10,#20
+ ror w11,w11,#20
+ ror w12,w12,#20
+ add w5,w5,w9
+ add w6,w6,w10
+ add w7,w7,w11
+ add w8,w8,w12
+ eor w17,w17,w5
+ eor w19,w19,w6
+ eor w20,w20,w7
+ eor w21,w21,w8
+ ror w17,w17,#24
+ ror w19,w19,#24
+ ror w20,w20,#24
+ ror w21,w21,#24
+ add w13,w13,w17
+ add w14,w14,w19
+ add w15,w15,w20
+ add w16,w16,w21
+ eor w9,w9,w13
+ eor w10,w10,w14
+ eor w11,w11,w15
+ eor w12,w12,w16
+ ror w9,w9,#25
+ ror w10,w10,#25
+ ror w11,w11,#25
+ ror w12,w12,#25
+ add w5,w5,w10
+ add w6,w6,w11
+ add w7,w7,w12
+ add w8,w8,w9
+ eor w21,w21,w5
+ eor w17,w17,w6
+ eor w19,w19,w7
+ eor w20,w20,w8
+ ror w21,w21,#16
+ ror w17,w17,#16
+ ror w19,w19,#16
+ ror w20,w20,#16
+ add w15,w15,w21
+ add w16,w16,w17
+ add w13,w13,w19
+ add w14,w14,w20
+ eor w10,w10,w15
+ eor w11,w11,w16
+ eor w12,w12,w13
+ eor w9,w9,w14
+ ror w10,w10,#20
+ ror w11,w11,#20
+ ror w12,w12,#20
+ ror w9,w9,#20
+ add w5,w5,w10
+ add w6,w6,w11
+ add w7,w7,w12
+ add w8,w8,w9
+ eor w21,w21,w5
+ eor w17,w17,w6
+ eor w19,w19,w7
+ eor w20,w20,w8
+ ror w21,w21,#24
+ ror w17,w17,#24
+ ror w19,w19,#24
+ ror w20,w20,#24
+ add w15,w15,w21
+ add w16,w16,w17
+ add w13,w13,w19
+ add w14,w14,w20
+ eor w10,w10,w15
+ eor w11,w11,w16
+ eor w12,w12,w13
+ eor w9,w9,w14
+ ror w10,w10,#25
+ ror w11,w11,#25
+ ror w12,w12,#25
+ ror w9,w9,#25
+ cbnz x4,Loop
+
+ add w5,w5,w22 // accumulate key block
+ add x6,x6,x22,lsr#32
+ add w7,w7,w23
+ add x8,x8,x23,lsr#32
+ add w9,w9,w24
+ add x10,x10,x24,lsr#32
+ add w11,w11,w25
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add x21,x21,x30,lsr#32
+
+ b.lo Ltail
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#1 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+
+ b.hi Loop_outer
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.align 4
+Ltail:
+ add x2,x2,#64
+Less_than_64:
+ sub x0,x0,#1
+ add x1,x1,x2
+ add x0,x0,x2
+ add x4,sp,x2
+ neg x2,x2
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ stp x5,x7,[sp,#0]
+ stp x9,x11,[sp,#16]
+ stp x13,x15,[sp,#32]
+ stp x17,x20,[sp,#48]
+
+Loop_tail:
+ ldrb w10,[x1,x2]
+ ldrb w11,[x4,x2]
+ add x2,x2,#1
+ eor w10,w10,w11
+ strb w10,[x0,x2]
+ cbnz x2,Loop_tail
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.globl ChaCha20_ctr32_neon
+
+.def ChaCha20_ctr32_neon
+ .type 32
+.endef
+.align 5
+ChaCha20_ctr32_neon:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adrp x5,Lsigma
+ add x5,x5,:lo12:Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ cmp x2,#512
+ b.hs L512_or_more_neon
+
+ sub sp,sp,#64
+
+ ldp x22,x23,[x5] // load sigma
+ ld1 {v24.4s},[x5],#16
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ld1 {v25.4s,v26.4s},[x3]
+ ldp x28,x30,[x4] // load counter
+ ld1 {v27.4s},[x4]
+ ld1 {v31.4s},[x5]
+#ifdef __AARCH64EB__
+ rev64 v24.4s,v24.4s
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+ add v27.4s,v27.4s,v31.4s // += 1
+ add v28.4s,v27.4s,v31.4s
+ add v29.4s,v28.4s,v31.4s
+ shl v31.4s,v31.4s,#2 // 1 -> 4
+
+Loop_outer_neon:
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ mov v0.16b,v24.16b
+ mov w7,w23
+ lsr x8,x23,#32
+ mov v4.16b,v24.16b
+ mov w9,w24
+ lsr x10,x24,#32
+ mov v16.16b,v24.16b
+ mov w11,w25
+ mov v1.16b,v25.16b
+ lsr x12,x25,#32
+ mov v5.16b,v25.16b
+ mov w13,w26
+ mov v17.16b,v25.16b
+ lsr x14,x26,#32
+ mov v3.16b,v27.16b
+ mov w15,w27
+ mov v7.16b,v28.16b
+ lsr x16,x27,#32
+ mov v19.16b,v29.16b
+ mov w17,w28
+ mov v2.16b,v26.16b
+ lsr x19,x28,#32
+ mov v6.16b,v26.16b
+ mov w20,w30
+ mov v18.16b,v26.16b
+ lsr x21,x30,#32
+
+ mov x4,#10
+ subs x2,x2,#256
+Loop_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v16.4s,v16.4s,v17.4s
+ add w7,w7,w11
+ eor v3.16b,v3.16b,v0.16b
+ add w8,w8,w12
+ eor v7.16b,v7.16b,v4.16b
+ eor w17,w17,w5
+ eor v19.16b,v19.16b,v16.16b
+ eor w19,w19,w6
+ rev32 v3.8h,v3.8h
+ eor w20,w20,w7
+ rev32 v7.8h,v7.8h
+ eor w21,w21,w8
+ rev32 v19.8h,v19.8h
+ ror w17,w17,#16
+ add v2.4s,v2.4s,v3.4s
+ ror w19,w19,#16
+ add v6.4s,v6.4s,v7.4s
+ ror w20,w20,#16
+ add v18.4s,v18.4s,v19.4s
+ ror w21,w21,#16
+ eor v20.16b,v1.16b,v2.16b
+ add w13,w13,w17
+ eor v21.16b,v5.16b,v6.16b
+ add w14,w14,w19
+ eor v22.16b,v17.16b,v18.16b
+ add w15,w15,w20
+ ushr v1.4s,v20.4s,#20
+ add w16,w16,w21
+ ushr v5.4s,v21.4s,#20
+ eor w9,w9,w13
+ ushr v17.4s,v22.4s,#20
+ eor w10,w10,w14
+ sli v1.4s,v20.4s,#12
+ eor w11,w11,w15
+ sli v5.4s,v21.4s,#12
+ eor w12,w12,w16
+ sli v17.4s,v22.4s,#12
+ ror w9,w9,#20
+ add v0.4s,v0.4s,v1.4s
+ ror w10,w10,#20
+ add v4.4s,v4.4s,v5.4s
+ ror w11,w11,#20
+ add v16.4s,v16.4s,v17.4s
+ ror w12,w12,#20
+ eor v20.16b,v3.16b,v0.16b
+ add w5,w5,w9
+ eor v21.16b,v7.16b,v4.16b
+ add w6,w6,w10
+ eor v22.16b,v19.16b,v16.16b
+ add w7,w7,w11
+ ushr v3.4s,v20.4s,#24
+ add w8,w8,w12
+ ushr v7.4s,v21.4s,#24
+ eor w17,w17,w5
+ ushr v19.4s,v22.4s,#24
+ eor w19,w19,w6
+ sli v3.4s,v20.4s,#8
+ eor w20,w20,w7
+ sli v7.4s,v21.4s,#8
+ eor w21,w21,w8
+ sli v19.4s,v22.4s,#8
+ ror w17,w17,#24
+ add v2.4s,v2.4s,v3.4s
+ ror w19,w19,#24
+ add v6.4s,v6.4s,v7.4s
+ ror w20,w20,#24
+ add v18.4s,v18.4s,v19.4s
+ ror w21,w21,#24
+ eor v20.16b,v1.16b,v2.16b
+ add w13,w13,w17
+ eor v21.16b,v5.16b,v6.16b
+ add w14,w14,w19
+ eor v22.16b,v17.16b,v18.16b
+ add w15,w15,w20
+ ushr v1.4s,v20.4s,#25
+ add w16,w16,w21
+ ushr v5.4s,v21.4s,#25
+ eor w9,w9,w13
+ ushr v17.4s,v22.4s,#25
+ eor w10,w10,w14
+ sli v1.4s,v20.4s,#7
+ eor w11,w11,w15
+ sli v5.4s,v21.4s,#7
+ eor w12,w12,w16
+ sli v17.4s,v22.4s,#7
+ ror w9,w9,#25
+ ext v2.16b,v2.16b,v2.16b,#8
+ ror w10,w10,#25
+ ext v6.16b,v6.16b,v6.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w10
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w11
+ add v16.4s,v16.4s,v17.4s
+ add w7,w7,w12
+ eor v3.16b,v3.16b,v0.16b
+ add w8,w8,w9
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w5
+ eor v19.16b,v19.16b,v16.16b
+ eor w17,w17,w6
+ rev32 v3.8h,v3.8h
+ eor w19,w19,w7
+ rev32 v7.8h,v7.8h
+ eor w20,w20,w8
+ rev32 v19.8h,v19.8h
+ ror w21,w21,#16
+ add v2.4s,v2.4s,v3.4s
+ ror w17,w17,#16
+ add v6.4s,v6.4s,v7.4s
+ ror w19,w19,#16
+ add v18.4s,v18.4s,v19.4s
+ ror w20,w20,#16
+ eor v20.16b,v1.16b,v2.16b
+ add w15,w15,w21
+ eor v21.16b,v5.16b,v6.16b
+ add w16,w16,w17
+ eor v22.16b,v17.16b,v18.16b
+ add w13,w13,w19
+ ushr v1.4s,v20.4s,#20
+ add w14,w14,w20
+ ushr v5.4s,v21.4s,#20
+ eor w10,w10,w15
+ ushr v17.4s,v22.4s,#20
+ eor w11,w11,w16
+ sli v1.4s,v20.4s,#12
+ eor w12,w12,w13
+ sli v5.4s,v21.4s,#12
+ eor w9,w9,w14
+ sli v17.4s,v22.4s,#12
+ ror w10,w10,#20
+ add v0.4s,v0.4s,v1.4s
+ ror w11,w11,#20
+ add v4.4s,v4.4s,v5.4s
+ ror w12,w12,#20
+ add v16.4s,v16.4s,v17.4s
+ ror w9,w9,#20
+ eor v20.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v21.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v22.16b,v19.16b,v16.16b
+ add w7,w7,w12
+ ushr v3.4s,v20.4s,#24
+ add w8,w8,w9
+ ushr v7.4s,v21.4s,#24
+ eor w21,w21,w5
+ ushr v19.4s,v22.4s,#24
+ eor w17,w17,w6
+ sli v3.4s,v20.4s,#8
+ eor w19,w19,w7
+ sli v7.4s,v21.4s,#8
+ eor w20,w20,w8
+ sli v19.4s,v22.4s,#8
+ ror w21,w21,#24
+ add v2.4s,v2.4s,v3.4s
+ ror w17,w17,#24
+ add v6.4s,v6.4s,v7.4s
+ ror w19,w19,#24
+ add v18.4s,v18.4s,v19.4s
+ ror w20,w20,#24
+ eor v20.16b,v1.16b,v2.16b
+ add w15,w15,w21
+ eor v21.16b,v5.16b,v6.16b
+ add w16,w16,w17
+ eor v22.16b,v17.16b,v18.16b
+ add w13,w13,w19
+ ushr v1.4s,v20.4s,#25
+ add w14,w14,w20
+ ushr v5.4s,v21.4s,#25
+ eor w10,w10,w15
+ ushr v17.4s,v22.4s,#25
+ eor w11,w11,w16
+ sli v1.4s,v20.4s,#7
+ eor w12,w12,w13
+ sli v5.4s,v21.4s,#7
+ eor w9,w9,w14
+ sli v17.4s,v22.4s,#7
+ ror w10,w10,#25
+ ext v2.16b,v2.16b,v2.16b,#8
+ ror w11,w11,#25
+ ext v6.16b,v6.16b,v6.16b,#8
+ ror w12,w12,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ cbnz x4,Loop_neon
+
+ add w5,w5,w22 // accumulate key block
+ add v0.4s,v0.4s,v24.4s
+ add x6,x6,x22,lsr#32
+ add v4.4s,v4.4s,v24.4s
+ add w7,w7,w23
+ add v16.4s,v16.4s,v24.4s
+ add x8,x8,x23,lsr#32
+ add v2.4s,v2.4s,v26.4s
+ add w9,w9,w24
+ add v6.4s,v6.4s,v26.4s
+ add x10,x10,x24,lsr#32
+ add v18.4s,v18.4s,v26.4s
+ add w11,w11,w25
+ add v3.4s,v3.4s,v27.4s
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add v7.4s,v7.4s,v28.4s
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add v19.4s,v19.4s,v29.4s
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add v1.4s,v1.4s,v25.4s
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add v5.4s,v5.4s,v25.4s
+ add x21,x21,x30,lsr#32
+ add v17.4s,v17.4s,v25.4s
+
+ b.lo Ltail_neon
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor v0.16b,v0.16b,v20.16b
+ eor x15,x15,x16
+ eor v1.16b,v1.16b,v21.16b
+ eor x17,x17,x19
+ eor v2.16b,v2.16b,v22.16b
+ eor x20,x20,x21
+ eor v3.16b,v3.16b,v23.16b
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#4 // increment counter
+ stp x9,x11,[x0,#16]
+ add v27.4s,v27.4s,v31.4s // += 4
+ stp x13,x15,[x0,#32]
+ add v28.4s,v28.4s,v31.4s
+ stp x17,x20,[x0,#48]
+ add v29.4s,v29.4s,v31.4s
+ add x0,x0,#64
+
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+
+ eor v4.16b,v4.16b,v20.16b
+ eor v5.16b,v5.16b,v21.16b
+ eor v6.16b,v6.16b,v22.16b
+ eor v7.16b,v7.16b,v23.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+ eor v16.16b,v16.16b,v0.16b
+ eor v17.16b,v17.16b,v1.16b
+ eor v18.16b,v18.16b,v2.16b
+ eor v19.16b,v19.16b,v3.16b
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+ b.hi Loop_outer_neon
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+Ltail_neon:
+ add x2,x2,#256
+ cmp x2,#64
+ b.lo Less_than_64
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#4 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ b.eq Ldone_neon
+ sub x2,x2,#64
+ cmp x2,#64
+ b.lo Less_than_128
+
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor v0.16b,v0.16b,v20.16b
+ eor v1.16b,v1.16b,v21.16b
+ eor v2.16b,v2.16b,v22.16b
+ eor v3.16b,v3.16b,v23.16b
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+ b.eq Ldone_neon
+ sub x2,x2,#64
+ cmp x2,#64
+ b.lo Less_than_192
+
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor v4.16b,v4.16b,v20.16b
+ eor v5.16b,v5.16b,v21.16b
+ eor v6.16b,v6.16b,v22.16b
+ eor v7.16b,v7.16b,v23.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+ b.eq Ldone_neon
+ sub x2,x2,#64
+
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
+ b Last_neon
+
+Less_than_128:
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
+ b Last_neon
+Less_than_192:
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
+ b Last_neon
+
+.align 4
+Last_neon:
+ sub x0,x0,#1
+ add x1,x1,x2
+ add x0,x0,x2
+ add x4,sp,x2
+ neg x2,x2
+
+Loop_tail_neon:
+ ldrb w10,[x1,x2]
+ ldrb w11,[x4,x2]
+ add x2,x2,#1
+ eor w10,w10,w11
+ strb w10,[x0,x2]
+ cbnz x2,Loop_tail_neon
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+Ldone_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.def ChaCha20_512_neon
+ .type 32
+.endef
+.align 5
+ChaCha20_512_neon:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adrp x5,Lsigma
+ add x5,x5,:lo12:Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+L512_or_more_neon:
+ sub sp,sp,#128+64
+
+ ldp x22,x23,[x5] // load sigma
+ ld1 {v24.4s},[x5],#16
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ld1 {v25.4s,v26.4s},[x3]
+ ldp x28,x30,[x4] // load counter
+ ld1 {v27.4s},[x4]
+ ld1 {v31.4s},[x5]
+#ifdef __AARCH64EB__
+ rev64 v24.4s,v24.4s
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+ add v27.4s,v27.4s,v31.4s // += 1
+ stp q24,q25,[sp,#0] // off-load key block, invariant part
+ add v27.4s,v27.4s,v31.4s // not typo
+ str q26,[sp,#32]
+ add v28.4s,v27.4s,v31.4s
+ add v29.4s,v28.4s,v31.4s
+ add v30.4s,v29.4s,v31.4s
+ shl v31.4s,v31.4s,#2 // 1 -> 4
+
+ stp d8,d9,[sp,#128+0] // meet ABI requirements
+ stp d10,d11,[sp,#128+16]
+ stp d12,d13,[sp,#128+32]
+ stp d14,d15,[sp,#128+48]
+
+ sub x2,x2,#512 // not typo
+
+Loop_outer_512_neon:
+ mov v0.16b,v24.16b
+ mov v4.16b,v24.16b
+ mov v8.16b,v24.16b
+ mov v12.16b,v24.16b
+ mov v16.16b,v24.16b
+ mov v20.16b,v24.16b
+ mov v1.16b,v25.16b
+ mov w5,w22 // unpack key block
+ mov v5.16b,v25.16b
+ lsr x6,x22,#32
+ mov v9.16b,v25.16b
+ mov w7,w23
+ mov v13.16b,v25.16b
+ lsr x8,x23,#32
+ mov v17.16b,v25.16b
+ mov w9,w24
+ mov v21.16b,v25.16b
+ lsr x10,x24,#32
+ mov v3.16b,v27.16b
+ mov w11,w25
+ mov v7.16b,v28.16b
+ lsr x12,x25,#32
+ mov v11.16b,v29.16b
+ mov w13,w26
+ mov v15.16b,v30.16b
+ lsr x14,x26,#32
+ mov v2.16b,v26.16b
+ mov w15,w27
+ mov v6.16b,v26.16b
+ lsr x16,x27,#32
+ add v19.4s,v3.4s,v31.4s // +4
+ mov w17,w28
+ add v23.4s,v7.4s,v31.4s // +4
+ lsr x19,x28,#32
+ mov v10.16b,v26.16b
+ mov w20,w30
+ mov v14.16b,v26.16b
+ lsr x21,x30,#32
+ mov v18.16b,v26.16b
+ stp q27,q28,[sp,#48] // off-load key block, variable part
+ mov v22.16b,v26.16b
+ str q29,[sp,#80]
+
+ mov x4,#5
+ subs x2,x2,#512
+Loop_upper_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v11.16b,v11.16b,v11.16b,#12
+ ext v15.16b,v15.16b,v15.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v23.16b,v23.16b,v23.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v9.16b,v9.16b,v9.16b,#4
+ ext v13.16b,v13.16b,v13.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ ext v21.16b,v21.16b,v21.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v11.16b,v11.16b,v11.16b,#4
+ ext v15.16b,v15.16b,v15.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v23.16b,v23.16b,v23.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v9.16b,v9.16b,v9.16b,#12
+ ext v13.16b,v13.16b,v13.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ ext v21.16b,v21.16b,v21.16b,#12
+ cbnz x4,Loop_upper_neon
+
+ add w5,w5,w22 // accumulate key block
+ add x6,x6,x22,lsr#32
+ add w7,w7,w23
+ add x8,x8,x23,lsr#32
+ add w9,w9,w24
+ add x10,x10,x24,lsr#32
+ add w11,w11,w25
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add x21,x21,x30,lsr#32
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#1 // increment counter
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ stp x9,x11,[x0,#16]
+ mov w7,w23
+ lsr x8,x23,#32
+ stp x13,x15,[x0,#32]
+ mov w9,w24
+ lsr x10,x24,#32
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ mov w11,w25
+ lsr x12,x25,#32
+ mov w13,w26
+ lsr x14,x26,#32
+ mov w15,w27
+ lsr x16,x27,#32
+ mov w17,w28
+ lsr x19,x28,#32
+ mov w20,w30
+ lsr x21,x30,#32
+
+ mov x4,#5
+Loop_lower_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v11.16b,v11.16b,v11.16b,#12
+ ext v15.16b,v15.16b,v15.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v23.16b,v23.16b,v23.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v9.16b,v9.16b,v9.16b,#4
+ ext v13.16b,v13.16b,v13.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ ext v21.16b,v21.16b,v21.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v11.16b,v11.16b,v11.16b,#4
+ ext v15.16b,v15.16b,v15.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v23.16b,v23.16b,v23.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v9.16b,v9.16b,v9.16b,#12
+ ext v13.16b,v13.16b,v13.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ ext v21.16b,v21.16b,v21.16b,#12
+ cbnz x4,Loop_lower_neon
+
+ add w5,w5,w22 // accumulate key block
+ ldp q24,q25,[sp,#0]
+ add x6,x6,x22,lsr#32
+ ldp q26,q27,[sp,#32]
+ add w7,w7,w23
+ ldp q28,q29,[sp,#64]
+ add x8,x8,x23,lsr#32
+ add v0.4s,v0.4s,v24.4s
+ add w9,w9,w24
+ add v4.4s,v4.4s,v24.4s
+ add x10,x10,x24,lsr#32
+ add v8.4s,v8.4s,v24.4s
+ add w11,w11,w25
+ add v12.4s,v12.4s,v24.4s
+ add x12,x12,x25,lsr#32
+ add v16.4s,v16.4s,v24.4s
+ add w13,w13,w26
+ add v20.4s,v20.4s,v24.4s
+ add x14,x14,x26,lsr#32
+ add v2.4s,v2.4s,v26.4s
+ add w15,w15,w27
+ add v6.4s,v6.4s,v26.4s
+ add x16,x16,x27,lsr#32
+ add v10.4s,v10.4s,v26.4s
+ add w17,w17,w28
+ add v14.4s,v14.4s,v26.4s
+ add x19,x19,x28,lsr#32
+ add v18.4s,v18.4s,v26.4s
+ add w20,w20,w30
+ add v22.4s,v22.4s,v26.4s
+ add x21,x21,x30,lsr#32
+ add v19.4s,v19.4s,v31.4s // +4
+ add x5,x5,x6,lsl#32 // pack
+ add v23.4s,v23.4s,v31.4s // +4
+ add x7,x7,x8,lsl#32
+ add v3.4s,v3.4s,v27.4s
+ ldp x6,x8,[x1,#0] // load input
+ add v7.4s,v7.4s,v28.4s
+ add x9,x9,x10,lsl#32
+ add v11.4s,v11.4s,v29.4s
+ add x11,x11,x12,lsl#32
+ add v15.4s,v15.4s,v30.4s
+ ldp x10,x12,[x1,#16]
+ add v19.4s,v19.4s,v27.4s
+ add x13,x13,x14,lsl#32
+ add v23.4s,v23.4s,v28.4s
+ add x15,x15,x16,lsl#32
+ add v1.4s,v1.4s,v25.4s
+ ldp x14,x16,[x1,#32]
+ add v5.4s,v5.4s,v25.4s
+ add x17,x17,x19,lsl#32
+ add v9.4s,v9.4s,v25.4s
+ add x20,x20,x21,lsl#32
+ add v13.4s,v13.4s,v25.4s
+ ldp x19,x21,[x1,#48]
+ add v17.4s,v17.4s,v25.4s
+ add x1,x1,#64
+ add v21.4s,v21.4s,v25.4s
+
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor v0.16b,v0.16b,v24.16b
+ eor x15,x15,x16
+ eor v1.16b,v1.16b,v25.16b
+ eor x17,x17,x19
+ eor v2.16b,v2.16b,v26.16b
+ eor x20,x20,x21
+ eor v3.16b,v3.16b,v27.16b
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#7 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+ eor v4.16b,v4.16b,v24.16b
+ eor v5.16b,v5.16b,v25.16b
+ eor v6.16b,v6.16b,v26.16b
+ eor v7.16b,v7.16b,v27.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+ eor v8.16b,v8.16b,v0.16b
+ ldp q24,q25,[sp,#0]
+ eor v9.16b,v9.16b,v1.16b
+ ldp q26,q27,[sp,#32]
+ eor v10.16b,v10.16b,v2.16b
+ eor v11.16b,v11.16b,v3.16b
+ st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
+
+ ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
+ eor v12.16b,v12.16b,v4.16b
+ eor v13.16b,v13.16b,v5.16b
+ eor v14.16b,v14.16b,v6.16b
+ eor v15.16b,v15.16b,v7.16b
+ st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
+
+ ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
+ eor v16.16b,v16.16b,v8.16b
+ eor v17.16b,v17.16b,v9.16b
+ eor v18.16b,v18.16b,v10.16b
+ eor v19.16b,v19.16b,v11.16b
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+ shl v0.4s,v31.4s,#1 // 4 -> 8
+ eor v20.16b,v20.16b,v12.16b
+ eor v21.16b,v21.16b,v13.16b
+ eor v22.16b,v22.16b,v14.16b
+ eor v23.16b,v23.16b,v15.16b
+ st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
+
+ add v27.4s,v27.4s,v0.4s // += 8
+ add v28.4s,v28.4s,v0.4s
+ add v29.4s,v29.4s,v0.4s
+ add v30.4s,v30.4s,v0.4s
+
+ b.hs Loop_outer_512_neon
+
+ adds x2,x2,#512
+ ushr v0.4s,v31.4s,#2 // 4 -> 1
+
+ ldp d8,d9,[sp,#128+0] // meet ABI requirements
+ ldp d10,d11,[sp,#128+16]
+ ldp d12,d13,[sp,#128+32]
+ ldp d14,d15,[sp,#128+48]
+
+ stp q24,q31,[sp,#0] // wipe off-load area
+ stp q24,q31,[sp,#32]
+ stp q24,q31,[sp,#64]
+
+ b.eq Ldone_512_neon
+
+ cmp x2,#192
+ sub v27.4s,v27.4s,v0.4s // -= 1
+ sub v28.4s,v28.4s,v0.4s
+ sub v29.4s,v29.4s,v0.4s
+ add sp,sp,#128
+ b.hs Loop_outer_neon
+
+ eor v25.16b,v25.16b,v25.16b
+ eor v26.16b,v26.16b,v26.16b
+ eor v27.16b,v27.16b,v27.16b
+ eor v28.16b,v28.16b,v28.16b
+ eor v29.16b,v29.16b,v29.16b
+ eor v30.16b,v30.16b,v30.16b
+ b Loop_outer
+
+Ldone_512_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#128+64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/crypto/chacha-x86-apple.S b/gen/crypto/chacha-x86-apple.S
new file mode 100644
index 0000000..48293da
--- /dev/null
+++ b/gen/crypto/chacha-x86-apple.S
@@ -0,0 +1,957 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl _ChaCha20_ctr32_nohw
+.private_extern _ChaCha20_ctr32_nohw
+.align 4
+_ChaCha20_ctr32_nohw:
+L_ChaCha20_ctr32_nohw_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 32(%esp),%esi
+ movl 36(%esp),%edi
+ subl $132,%esp
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,80(%esp)
+ movl %ebx,84(%esp)
+ movl %ecx,88(%esp)
+ movl %edx,92(%esp)
+ movl 16(%esi),%eax
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edx
+ movl %eax,96(%esp)
+ movl %ebx,100(%esp)
+ movl %ecx,104(%esp)
+ movl %edx,108(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ subl $1,%eax
+ movl %eax,112(%esp)
+ movl %ebx,116(%esp)
+ movl %ecx,120(%esp)
+ movl %edx,124(%esp)
+ jmp L000entry
+.align 4,0x90
+L001outer_loop:
+ movl %ebx,156(%esp)
+ movl %eax,152(%esp)
+ movl %ecx,160(%esp)
+L000entry:
+ movl $1634760805,%eax
+ movl $857760878,4(%esp)
+ movl $2036477234,8(%esp)
+ movl $1797285236,12(%esp)
+ movl 84(%esp),%ebx
+ movl 88(%esp),%ebp
+ movl 104(%esp),%ecx
+ movl 108(%esp),%esi
+ movl 116(%esp),%edx
+ movl 120(%esp),%edi
+ movl %ebx,20(%esp)
+ movl %ebp,24(%esp)
+ movl %ecx,40(%esp)
+ movl %esi,44(%esp)
+ movl %edx,52(%esp)
+ movl %edi,56(%esp)
+ movl 92(%esp),%ebx
+ movl 124(%esp),%edi
+ movl 112(%esp),%edx
+ movl 80(%esp),%ebp
+ movl 96(%esp),%ecx
+ movl 100(%esp),%esi
+ addl $1,%edx
+ movl %ebx,28(%esp)
+ movl %edi,60(%esp)
+ movl %edx,112(%esp)
+ movl $10,%ebx
+ jmp L002loop
+.align 4,0x90
+L002loop:
+ addl %ebp,%eax
+ movl %ebx,128(%esp)
+ movl %ebp,%ebx
+ xorl %eax,%edx
+ roll $16,%edx
+ addl %edx,%ecx
+ xorl %ecx,%ebx
+ movl 52(%esp),%edi
+ roll $12,%ebx
+ movl 20(%esp),%ebp
+ addl %ebx,%eax
+ xorl %eax,%edx
+ movl %eax,(%esp)
+ roll $8,%edx
+ movl 4(%esp),%eax
+ addl %edx,%ecx
+ movl %edx,48(%esp)
+ xorl %ecx,%ebx
+ addl %ebp,%eax
+ roll $7,%ebx
+ xorl %eax,%edi
+ movl %ecx,32(%esp)
+ roll $16,%edi
+ movl %ebx,16(%esp)
+ addl %edi,%esi
+ movl 40(%esp),%ecx
+ xorl %esi,%ebp
+ movl 56(%esp),%edx
+ roll $12,%ebp
+ movl 24(%esp),%ebx
+ addl %ebp,%eax
+ xorl %eax,%edi
+ movl %eax,4(%esp)
+ roll $8,%edi
+ movl 8(%esp),%eax
+ addl %edi,%esi
+ movl %edi,52(%esp)
+ xorl %esi,%ebp
+ addl %ebx,%eax
+ roll $7,%ebp
+ xorl %eax,%edx
+ movl %esi,36(%esp)
+ roll $16,%edx
+ movl %ebp,20(%esp)
+ addl %edx,%ecx
+ movl 44(%esp),%esi
+ xorl %ecx,%ebx
+ movl 60(%esp),%edi
+ roll $12,%ebx
+ movl 28(%esp),%ebp
+ addl %ebx,%eax
+ xorl %eax,%edx
+ movl %eax,8(%esp)
+ roll $8,%edx
+ movl 12(%esp),%eax
+ addl %edx,%ecx
+ movl %edx,56(%esp)
+ xorl %ecx,%ebx
+ addl %ebp,%eax
+ roll $7,%ebx
+ xorl %eax,%edi
+ roll $16,%edi
+ movl %ebx,24(%esp)
+ addl %edi,%esi
+ xorl %esi,%ebp
+ roll $12,%ebp
+ movl 20(%esp),%ebx
+ addl %ebp,%eax
+ xorl %eax,%edi
+ movl %eax,12(%esp)
+ roll $8,%edi
+ movl (%esp),%eax
+ addl %edi,%esi
+ movl %edi,%edx
+ xorl %esi,%ebp
+ addl %ebx,%eax
+ roll $7,%ebp
+ xorl %eax,%edx
+ roll $16,%edx
+ movl %ebp,28(%esp)
+ addl %edx,%ecx
+ xorl %ecx,%ebx
+ movl 48(%esp),%edi
+ roll $12,%ebx
+ movl 24(%esp),%ebp
+ addl %ebx,%eax
+ xorl %eax,%edx
+ movl %eax,(%esp)
+ roll $8,%edx
+ movl 4(%esp),%eax
+ addl %edx,%ecx
+ movl %edx,60(%esp)
+ xorl %ecx,%ebx
+ addl %ebp,%eax
+ roll $7,%ebx
+ xorl %eax,%edi
+ movl %ecx,40(%esp)
+ roll $16,%edi
+ movl %ebx,20(%esp)
+ addl %edi,%esi
+ movl 32(%esp),%ecx
+ xorl %esi,%ebp
+ movl 52(%esp),%edx
+ roll $12,%ebp
+ movl 28(%esp),%ebx
+ addl %ebp,%eax
+ xorl %eax,%edi
+ movl %eax,4(%esp)
+ roll $8,%edi
+ movl 8(%esp),%eax
+ addl %edi,%esi
+ movl %edi,48(%esp)
+ xorl %esi,%ebp
+ addl %ebx,%eax
+ roll $7,%ebp
+ xorl %eax,%edx
+ movl %esi,44(%esp)
+ roll $16,%edx
+ movl %ebp,24(%esp)
+ addl %edx,%ecx
+ movl 36(%esp),%esi
+ xorl %ecx,%ebx
+ movl 56(%esp),%edi
+ roll $12,%ebx
+ movl 16(%esp),%ebp
+ addl %ebx,%eax
+ xorl %eax,%edx
+ movl %eax,8(%esp)
+ roll $8,%edx
+ movl 12(%esp),%eax
+ addl %edx,%ecx
+ movl %edx,52(%esp)
+ xorl %ecx,%ebx
+ addl %ebp,%eax
+ roll $7,%ebx
+ xorl %eax,%edi
+ roll $16,%edi
+ movl %ebx,28(%esp)
+ addl %edi,%esi
+ xorl %esi,%ebp
+ movl 48(%esp),%edx
+ roll $12,%ebp
+ movl 128(%esp),%ebx
+ addl %ebp,%eax
+ xorl %eax,%edi
+ movl %eax,12(%esp)
+ roll $8,%edi
+ movl (%esp),%eax
+ addl %edi,%esi
+ movl %edi,56(%esp)
+ xorl %esi,%ebp
+ roll $7,%ebp
+ decl %ebx
+ jnz L002loop
+ movl 160(%esp),%ebx
+ addl $1634760805,%eax
+ addl 80(%esp),%ebp
+ addl 96(%esp),%ecx
+ addl 100(%esp),%esi
+ cmpl $64,%ebx
+ jb L003tail
+ movl 156(%esp),%ebx
+ addl 112(%esp),%edx
+ addl 120(%esp),%edi
+ xorl (%ebx),%eax
+ xorl 16(%ebx),%ebp
+ movl %eax,(%esp)
+ movl 152(%esp),%eax
+ xorl 32(%ebx),%ecx
+ xorl 36(%ebx),%esi
+ xorl 48(%ebx),%edx
+ xorl 56(%ebx),%edi
+ movl %ebp,16(%eax)
+ movl %ecx,32(%eax)
+ movl %esi,36(%eax)
+ movl %edx,48(%eax)
+ movl %edi,56(%eax)
+ movl 4(%esp),%ebp
+ movl 8(%esp),%ecx
+ movl 12(%esp),%esi
+ movl 20(%esp),%edx
+ movl 24(%esp),%edi
+ addl $857760878,%ebp
+ addl $2036477234,%ecx
+ addl $1797285236,%esi
+ addl 84(%esp),%edx
+ addl 88(%esp),%edi
+ xorl 4(%ebx),%ebp
+ xorl 8(%ebx),%ecx
+ xorl 12(%ebx),%esi
+ xorl 20(%ebx),%edx
+ xorl 24(%ebx),%edi
+ movl %ebp,4(%eax)
+ movl %ecx,8(%eax)
+ movl %esi,12(%eax)
+ movl %edx,20(%eax)
+ movl %edi,24(%eax)
+ movl 28(%esp),%ebp
+ movl 40(%esp),%ecx
+ movl 44(%esp),%esi
+ movl 52(%esp),%edx
+ movl 60(%esp),%edi
+ addl 92(%esp),%ebp
+ addl 104(%esp),%ecx
+ addl 108(%esp),%esi
+ addl 116(%esp),%edx
+ addl 124(%esp),%edi
+ xorl 28(%ebx),%ebp
+ xorl 40(%ebx),%ecx
+ xorl 44(%ebx),%esi
+ xorl 52(%ebx),%edx
+ xorl 60(%ebx),%edi
+ leal 64(%ebx),%ebx
+ movl %ebp,28(%eax)
+ movl (%esp),%ebp
+ movl %ecx,40(%eax)
+ movl 160(%esp),%ecx
+ movl %esi,44(%eax)
+ movl %edx,52(%eax)
+ movl %edi,60(%eax)
+ movl %ebp,(%eax)
+ leal 64(%eax),%eax
+ subl $64,%ecx
+ jnz L001outer_loop
+ jmp L004done
+L003tail:
+ addl 112(%esp),%edx
+ addl 120(%esp),%edi
+ movl %eax,(%esp)
+ movl %ebp,16(%esp)
+ movl %ecx,32(%esp)
+ movl %esi,36(%esp)
+ movl %edx,48(%esp)
+ movl %edi,56(%esp)
+ movl 4(%esp),%ebp
+ movl 8(%esp),%ecx
+ movl 12(%esp),%esi
+ movl 20(%esp),%edx
+ movl 24(%esp),%edi
+ addl $857760878,%ebp
+ addl $2036477234,%ecx
+ addl $1797285236,%esi
+ addl 84(%esp),%edx
+ addl 88(%esp),%edi
+ movl %ebp,4(%esp)
+ movl %ecx,8(%esp)
+ movl %esi,12(%esp)
+ movl %edx,20(%esp)
+ movl %edi,24(%esp)
+ movl 28(%esp),%ebp
+ movl 40(%esp),%ecx
+ movl 44(%esp),%esi
+ movl 52(%esp),%edx
+ movl 60(%esp),%edi
+ addl 92(%esp),%ebp
+ addl 104(%esp),%ecx
+ addl 108(%esp),%esi
+ addl 116(%esp),%edx
+ addl 124(%esp),%edi
+ movl %ebp,28(%esp)
+ movl 156(%esp),%ebp
+ movl %ecx,40(%esp)
+ movl 152(%esp),%ecx
+ movl %esi,44(%esp)
+ xorl %esi,%esi
+ movl %edx,52(%esp)
+ movl %edi,60(%esp)
+ xorl %eax,%eax
+ xorl %edx,%edx
+L005tail_loop:
+ movb (%esi,%ebp,1),%al
+ movb (%esp,%esi,1),%dl
+ leal 1(%esi),%esi
+ xorb %dl,%al
+ movb %al,-1(%ecx,%esi,1)
+ decl %ebx
+ jnz L005tail_loop
+L004done:
+ addl $132,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _ChaCha20_ctr32_ssse3
+.private_extern _ChaCha20_ctr32_ssse3
+.align 4
+_ChaCha20_ctr32_ssse3:
+L_ChaCha20_ctr32_ssse3_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call Lpic_point
+Lpic_point:
+ popl %eax
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl %esp,%ebp
+ subl $524,%esp
+ andl $-64,%esp
+ movl %ebp,512(%esp)
+ leal Lssse3_data-Lpic_point(%eax),%eax
+ movdqu (%ebx),%xmm3
+ cmpl $256,%ecx
+ jb L0061x
+ movl %edx,516(%esp)
+ movl %ebx,520(%esp)
+ subl $256,%ecx
+ leal 384(%esp),%ebp
+ movdqu (%edx),%xmm7
+ pshufd $0,%xmm3,%xmm0
+ pshufd $85,%xmm3,%xmm1
+ pshufd $170,%xmm3,%xmm2
+ pshufd $255,%xmm3,%xmm3
+ paddd 48(%eax),%xmm0
+ pshufd $0,%xmm7,%xmm4
+ pshufd $85,%xmm7,%xmm5
+ psubd 64(%eax),%xmm0
+ pshufd $170,%xmm7,%xmm6
+ pshufd $255,%xmm7,%xmm7
+ movdqa %xmm0,64(%ebp)
+ movdqa %xmm1,80(%ebp)
+ movdqa %xmm2,96(%ebp)
+ movdqa %xmm3,112(%ebp)
+ movdqu 16(%edx),%xmm3
+ movdqa %xmm4,-64(%ebp)
+ movdqa %xmm5,-48(%ebp)
+ movdqa %xmm6,-32(%ebp)
+ movdqa %xmm7,-16(%ebp)
+ movdqa 32(%eax),%xmm7
+ leal 128(%esp),%ebx
+ pshufd $0,%xmm3,%xmm0
+ pshufd $85,%xmm3,%xmm1
+ pshufd $170,%xmm3,%xmm2
+ pshufd $255,%xmm3,%xmm3
+ pshufd $0,%xmm7,%xmm4
+ pshufd $85,%xmm7,%xmm5
+ pshufd $170,%xmm7,%xmm6
+ pshufd $255,%xmm7,%xmm7
+ movdqa %xmm0,(%ebp)
+ movdqa %xmm1,16(%ebp)
+ movdqa %xmm2,32(%ebp)
+ movdqa %xmm3,48(%ebp)
+ movdqa %xmm4,-128(%ebp)
+ movdqa %xmm5,-112(%ebp)
+ movdqa %xmm6,-96(%ebp)
+ movdqa %xmm7,-80(%ebp)
+ leal 128(%esi),%esi
+ leal 128(%edi),%edi
+ jmp L007outer_loop
+.align 4,0x90
+L007outer_loop:
+ movdqa -112(%ebp),%xmm1
+ movdqa -96(%ebp),%xmm2
+ movdqa -80(%ebp),%xmm3
+ movdqa -48(%ebp),%xmm5
+ movdqa -32(%ebp),%xmm6
+ movdqa -16(%ebp),%xmm7
+ movdqa %xmm1,-112(%ebx)
+ movdqa %xmm2,-96(%ebx)
+ movdqa %xmm3,-80(%ebx)
+ movdqa %xmm5,-48(%ebx)
+ movdqa %xmm6,-32(%ebx)
+ movdqa %xmm7,-16(%ebx)
+ movdqa 32(%ebp),%xmm2
+ movdqa 48(%ebp),%xmm3
+ movdqa 64(%ebp),%xmm4
+ movdqa 80(%ebp),%xmm5
+ movdqa 96(%ebp),%xmm6
+ movdqa 112(%ebp),%xmm7
+ paddd 64(%eax),%xmm4
+ movdqa %xmm2,32(%ebx)
+ movdqa %xmm3,48(%ebx)
+ movdqa %xmm4,64(%ebx)
+ movdqa %xmm5,80(%ebx)
+ movdqa %xmm6,96(%ebx)
+ movdqa %xmm7,112(%ebx)
+ movdqa %xmm4,64(%ebp)
+ movdqa -128(%ebp),%xmm0
+ movdqa %xmm4,%xmm6
+ movdqa -64(%ebp),%xmm3
+ movdqa (%ebp),%xmm4
+ movdqa 16(%ebp),%xmm5
+ movl $10,%edx
+ nop
+.align 4,0x90
+L008loop:
+ paddd %xmm3,%xmm0
+ movdqa %xmm3,%xmm2
+ pxor %xmm0,%xmm6
+ pshufb (%eax),%xmm6
+ paddd %xmm6,%xmm4
+ pxor %xmm4,%xmm2
+ movdqa -48(%ebx),%xmm3
+ movdqa %xmm2,%xmm1
+ pslld $12,%xmm2
+ psrld $20,%xmm1
+ por %xmm1,%xmm2
+ movdqa -112(%ebx),%xmm1
+ paddd %xmm2,%xmm0
+ movdqa 80(%ebx),%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm0,-128(%ebx)
+ pshufb 16(%eax),%xmm6
+ paddd %xmm6,%xmm4
+ movdqa %xmm6,64(%ebx)
+ pxor %xmm4,%xmm2
+ paddd %xmm3,%xmm1
+ movdqa %xmm2,%xmm0
+ pslld $7,%xmm2
+ psrld $25,%xmm0
+ pxor %xmm1,%xmm7
+ por %xmm0,%xmm2
+ movdqa %xmm4,(%ebx)
+ pshufb (%eax),%xmm7
+ movdqa %xmm2,-64(%ebx)
+ paddd %xmm7,%xmm5
+ movdqa 32(%ebx),%xmm4
+ pxor %xmm5,%xmm3
+ movdqa -32(%ebx),%xmm2
+ movdqa %xmm3,%xmm0
+ pslld $12,%xmm3
+ psrld $20,%xmm0
+ por %xmm0,%xmm3
+ movdqa -96(%ebx),%xmm0
+ paddd %xmm3,%xmm1
+ movdqa 96(%ebx),%xmm6
+ pxor %xmm1,%xmm7
+ movdqa %xmm1,-112(%ebx)
+ pshufb 16(%eax),%xmm7
+ paddd %xmm7,%xmm5
+ movdqa %xmm7,80(%ebx)
+ pxor %xmm5,%xmm3
+ paddd %xmm2,%xmm0
+ movdqa %xmm3,%xmm1
+ pslld $7,%xmm3
+ psrld $25,%xmm1
+ pxor %xmm0,%xmm6
+ por %xmm1,%xmm3
+ movdqa %xmm5,16(%ebx)
+ pshufb (%eax),%xmm6
+ movdqa %xmm3,-48(%ebx)
+ paddd %xmm6,%xmm4
+ movdqa 48(%ebx),%xmm5
+ pxor %xmm4,%xmm2
+ movdqa -16(%ebx),%xmm3
+ movdqa %xmm2,%xmm1
+ pslld $12,%xmm2
+ psrld $20,%xmm1
+ por %xmm1,%xmm2
+ movdqa -80(%ebx),%xmm1
+ paddd %xmm2,%xmm0
+ movdqa 112(%ebx),%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm0,-96(%ebx)
+ pshufb 16(%eax),%xmm6
+ paddd %xmm6,%xmm4
+ movdqa %xmm6,96(%ebx)
+ pxor %xmm4,%xmm2
+ paddd %xmm3,%xmm1
+ movdqa %xmm2,%xmm0
+ pslld $7,%xmm2
+ psrld $25,%xmm0
+ pxor %xmm1,%xmm7
+ por %xmm0,%xmm2
+ pshufb (%eax),%xmm7
+ movdqa %xmm2,-32(%ebx)
+ paddd %xmm7,%xmm5
+ pxor %xmm5,%xmm3
+ movdqa -48(%ebx),%xmm2
+ movdqa %xmm3,%xmm0
+ pslld $12,%xmm3
+ psrld $20,%xmm0
+ por %xmm0,%xmm3
+ movdqa -128(%ebx),%xmm0
+ paddd %xmm3,%xmm1
+ pxor %xmm1,%xmm7
+ movdqa %xmm1,-80(%ebx)
+ pshufb 16(%eax),%xmm7
+ paddd %xmm7,%xmm5
+ movdqa %xmm7,%xmm6
+ pxor %xmm5,%xmm3
+ paddd %xmm2,%xmm0
+ movdqa %xmm3,%xmm1
+ pslld $7,%xmm3
+ psrld $25,%xmm1
+ pxor %xmm0,%xmm6
+ por %xmm1,%xmm3
+ pshufb (%eax),%xmm6
+ movdqa %xmm3,-16(%ebx)
+ paddd %xmm6,%xmm4
+ pxor %xmm4,%xmm2
+ movdqa -32(%ebx),%xmm3
+ movdqa %xmm2,%xmm1
+ pslld $12,%xmm2
+ psrld $20,%xmm1
+ por %xmm1,%xmm2
+ movdqa -112(%ebx),%xmm1
+ paddd %xmm2,%xmm0
+ movdqa 64(%ebx),%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm0,-128(%ebx)
+ pshufb 16(%eax),%xmm6
+ paddd %xmm6,%xmm4
+ movdqa %xmm6,112(%ebx)
+ pxor %xmm4,%xmm2
+ paddd %xmm3,%xmm1
+ movdqa %xmm2,%xmm0
+ pslld $7,%xmm2
+ psrld $25,%xmm0
+ pxor %xmm1,%xmm7
+ por %xmm0,%xmm2
+ movdqa %xmm4,32(%ebx)
+ pshufb (%eax),%xmm7
+ movdqa %xmm2,-48(%ebx)
+ paddd %xmm7,%xmm5
+ movdqa (%ebx),%xmm4
+ pxor %xmm5,%xmm3
+ movdqa -16(%ebx),%xmm2
+ movdqa %xmm3,%xmm0
+ pslld $12,%xmm3
+ psrld $20,%xmm0
+ por %xmm0,%xmm3
+ movdqa -96(%ebx),%xmm0
+ paddd %xmm3,%xmm1
+ movdqa 80(%ebx),%xmm6
+ pxor %xmm1,%xmm7
+ movdqa %xmm1,-112(%ebx)
+ pshufb 16(%eax),%xmm7
+ paddd %xmm7,%xmm5
+ movdqa %xmm7,64(%ebx)
+ pxor %xmm5,%xmm3
+ paddd %xmm2,%xmm0
+ movdqa %xmm3,%xmm1
+ pslld $7,%xmm3
+ psrld $25,%xmm1
+ pxor %xmm0,%xmm6
+ por %xmm1,%xmm3
+ movdqa %xmm5,48(%ebx)
+ pshufb (%eax),%xmm6
+ movdqa %xmm3,-32(%ebx)
+ paddd %xmm6,%xmm4
+ movdqa 16(%ebx),%xmm5
+ pxor %xmm4,%xmm2
+ movdqa -64(%ebx),%xmm3
+ movdqa %xmm2,%xmm1
+ pslld $12,%xmm2
+ psrld $20,%xmm1
+ por %xmm1,%xmm2
+ movdqa -80(%ebx),%xmm1
+ paddd %xmm2,%xmm0
+ movdqa 96(%ebx),%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm0,-96(%ebx)
+ pshufb 16(%eax),%xmm6
+ paddd %xmm6,%xmm4
+ movdqa %xmm6,80(%ebx)
+ pxor %xmm4,%xmm2
+ paddd %xmm3,%xmm1
+ movdqa %xmm2,%xmm0
+ pslld $7,%xmm2
+ psrld $25,%xmm0
+ pxor %xmm1,%xmm7
+ por %xmm0,%xmm2
+ pshufb (%eax),%xmm7
+ movdqa %xmm2,-16(%ebx)
+ paddd %xmm7,%xmm5
+ pxor %xmm5,%xmm3
+ movdqa %xmm3,%xmm0
+ pslld $12,%xmm3
+ psrld $20,%xmm0
+ por %xmm0,%xmm3
+ movdqa -128(%ebx),%xmm0
+ paddd %xmm3,%xmm1
+ movdqa 64(%ebx),%xmm6
+ pxor %xmm1,%xmm7
+ movdqa %xmm1,-80(%ebx)
+ pshufb 16(%eax),%xmm7
+ paddd %xmm7,%xmm5
+ movdqa %xmm7,96(%ebx)
+ pxor %xmm5,%xmm3
+ movdqa %xmm3,%xmm1
+ pslld $7,%xmm3
+ psrld $25,%xmm1
+ por %xmm1,%xmm3
+ decl %edx
+ jnz L008loop
+ movdqa %xmm3,-64(%ebx)
+ movdqa %xmm4,(%ebx)
+ movdqa %xmm5,16(%ebx)
+ movdqa %xmm6,64(%ebx)
+ movdqa %xmm7,96(%ebx)
+ movdqa -112(%ebx),%xmm1
+ movdqa -96(%ebx),%xmm2
+ movdqa -80(%ebx),%xmm3
+ paddd -128(%ebp),%xmm0
+ paddd -112(%ebp),%xmm1
+ paddd -96(%ebp),%xmm2
+ paddd -80(%ebp),%xmm3
+ movdqa %xmm0,%xmm6
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm6
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm6,%xmm3
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ movdqu -128(%esi),%xmm4
+ movdqu -64(%esi),%xmm5
+ movdqu (%esi),%xmm2
+ movdqu 64(%esi),%xmm7
+ leal 16(%esi),%esi
+ pxor %xmm0,%xmm4
+ movdqa -64(%ebx),%xmm0
+ pxor %xmm1,%xmm5
+ movdqa -48(%ebx),%xmm1
+ pxor %xmm2,%xmm6
+ movdqa -32(%ebx),%xmm2
+ pxor %xmm3,%xmm7
+ movdqa -16(%ebx),%xmm3
+ movdqu %xmm4,-128(%edi)
+ movdqu %xmm5,-64(%edi)
+ movdqu %xmm6,(%edi)
+ movdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ paddd -64(%ebp),%xmm0
+ paddd -48(%ebp),%xmm1
+ paddd -32(%ebp),%xmm2
+ paddd -16(%ebp),%xmm3
+ movdqa %xmm0,%xmm6
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm6
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm6,%xmm3
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ movdqu -128(%esi),%xmm4
+ movdqu -64(%esi),%xmm5
+ movdqu (%esi),%xmm2
+ movdqu 64(%esi),%xmm7
+ leal 16(%esi),%esi
+ pxor %xmm0,%xmm4
+ movdqa (%ebx),%xmm0
+ pxor %xmm1,%xmm5
+ movdqa 16(%ebx),%xmm1
+ pxor %xmm2,%xmm6
+ movdqa 32(%ebx),%xmm2
+ pxor %xmm3,%xmm7
+ movdqa 48(%ebx),%xmm3
+ movdqu %xmm4,-128(%edi)
+ movdqu %xmm5,-64(%edi)
+ movdqu %xmm6,(%edi)
+ movdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ paddd (%ebp),%xmm0
+ paddd 16(%ebp),%xmm1
+ paddd 32(%ebp),%xmm2
+ paddd 48(%ebp),%xmm3
+ movdqa %xmm0,%xmm6
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm6
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm6,%xmm3
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ movdqu -128(%esi),%xmm4
+ movdqu -64(%esi),%xmm5
+ movdqu (%esi),%xmm2
+ movdqu 64(%esi),%xmm7
+ leal 16(%esi),%esi
+ pxor %xmm0,%xmm4
+ movdqa 64(%ebx),%xmm0
+ pxor %xmm1,%xmm5
+ movdqa 80(%ebx),%xmm1
+ pxor %xmm2,%xmm6
+ movdqa 96(%ebx),%xmm2
+ pxor %xmm3,%xmm7
+ movdqa 112(%ebx),%xmm3
+ movdqu %xmm4,-128(%edi)
+ movdqu %xmm5,-64(%edi)
+ movdqu %xmm6,(%edi)
+ movdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ paddd 64(%ebp),%xmm0
+ paddd 80(%ebp),%xmm1
+ paddd 96(%ebp),%xmm2
+ paddd 112(%ebp),%xmm3
+ movdqa %xmm0,%xmm6
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm6
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm6,%xmm3
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ movdqu -128(%esi),%xmm4
+ movdqu -64(%esi),%xmm5
+ movdqu (%esi),%xmm2
+ movdqu 64(%esi),%xmm7
+ leal 208(%esi),%esi
+ pxor %xmm0,%xmm4
+ pxor %xmm1,%xmm5
+ pxor %xmm2,%xmm6
+ pxor %xmm3,%xmm7
+ movdqu %xmm4,-128(%edi)
+ movdqu %xmm5,-64(%edi)
+ movdqu %xmm6,(%edi)
+ movdqu %xmm7,64(%edi)
+ leal 208(%edi),%edi
+ subl $256,%ecx
+ jnc L007outer_loop
+ addl $256,%ecx
+ jz L009done
+ movl 520(%esp),%ebx
+ leal -128(%esi),%esi
+ movl 516(%esp),%edx
+ leal -128(%edi),%edi
+ movd 64(%ebp),%xmm2
+ movdqu (%ebx),%xmm3
+ paddd 96(%eax),%xmm2
+ pand 112(%eax),%xmm3
+ por %xmm2,%xmm3
+L0061x:
+ movdqa 32(%eax),%xmm0
+ movdqu (%edx),%xmm1
+ movdqu 16(%edx),%xmm2
+ movdqa (%eax),%xmm6
+ movdqa 16(%eax),%xmm7
+ movl %ebp,48(%esp)
+ movdqa %xmm0,(%esp)
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ movl $10,%edx
+ jmp L010loop1x
+.align 4,0x90
+L011outer1x:
+ movdqa 80(%eax),%xmm3
+ movdqa (%esp),%xmm0
+ movdqa 16(%esp),%xmm1
+ movdqa 32(%esp),%xmm2
+ paddd 48(%esp),%xmm3
+ movl $10,%edx
+ movdqa %xmm3,48(%esp)
+ jmp L010loop1x
+.align 4,0x90
+L010loop1x:
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm1,%xmm1
+ pshufd $147,%xmm3,%xmm3
+ nop
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm1,%xmm1
+ pshufd $57,%xmm3,%xmm3
+ decl %edx
+ jnz L010loop1x
+ paddd (%esp),%xmm0
+ paddd 16(%esp),%xmm1
+ paddd 32(%esp),%xmm2
+ paddd 48(%esp),%xmm3
+ cmpl $64,%ecx
+ jb L012tail
+ movdqu (%esi),%xmm4
+ movdqu 16(%esi),%xmm5
+ pxor %xmm4,%xmm0
+ movdqu 32(%esi),%xmm4
+ pxor %xmm5,%xmm1
+ movdqu 48(%esi),%xmm5
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm3
+ leal 64(%esi),%esi
+ movdqu %xmm0,(%edi)
+ movdqu %xmm1,16(%edi)
+ movdqu %xmm2,32(%edi)
+ movdqu %xmm3,48(%edi)
+ leal 64(%edi),%edi
+ subl $64,%ecx
+ jnz L011outer1x
+ jmp L009done
+L012tail:
+ movdqa %xmm0,(%esp)
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ xorl %eax,%eax
+ xorl %edx,%edx
+ xorl %ebp,%ebp
+L013tail_loop:
+ movb (%esp,%ebp,1),%al
+ movb (%esi,%ebp,1),%dl
+ leal 1(%ebp),%ebp
+ xorb %dl,%al
+ movb %al,-1(%edi,%ebp,1)
+ decl %ecx
+ jnz L013tail_loop
+L009done:
+ movl 512(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 6,0x90
+Lssse3_data:
+.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+.long 1634760805,857760878,2036477234,1797285236
+.long 0,1,2,3
+.long 4,4,4,4
+.long 1,0,0,0
+.long 4,0,0,0
+.long 0,-1,-1,-1
+.align 6,0x90
+.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
+.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
+.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
+.byte 114,103,62,0
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/crypto/chacha-x86-linux.S b/gen/crypto/chacha-x86-linux.S
new file mode 100644
index 0000000..566fbb4
--- /dev/null
+++ b/gen/crypto/chacha-x86-linux.S
@@ -0,0 +1,961 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl ChaCha20_ctr32_nohw
+.hidden ChaCha20_ctr32_nohw
+.type ChaCha20_ctr32_nohw,@function
+.align 16
+ChaCha20_ctr32_nohw:
+.L_ChaCha20_ctr32_nohw_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 32(%esp),%esi
+ movl 36(%esp),%edi
+ subl $132,%esp
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,80(%esp)
+ movl %ebx,84(%esp)
+ movl %ecx,88(%esp)
+ movl %edx,92(%esp)
+ movl 16(%esi),%eax
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edx
+ movl %eax,96(%esp)
+ movl %ebx,100(%esp)
+ movl %ecx,104(%esp)
+ movl %edx,108(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ subl $1,%eax
+ movl %eax,112(%esp)
+ movl %ebx,116(%esp)
+ movl %ecx,120(%esp)
+ movl %edx,124(%esp)
+ jmp .L000entry
+.align 16
+.L001outer_loop:
+ movl %ebx,156(%esp)
+ movl %eax,152(%esp)
+ movl %ecx,160(%esp)
+.L000entry:
+ movl $1634760805,%eax
+ movl $857760878,4(%esp)
+ movl $2036477234,8(%esp)
+ movl $1797285236,12(%esp)
+ movl 84(%esp),%ebx
+ movl 88(%esp),%ebp
+ movl 104(%esp),%ecx
+ movl 108(%esp),%esi
+ movl 116(%esp),%edx
+ movl 120(%esp),%edi
+ movl %ebx,20(%esp)
+ movl %ebp,24(%esp)
+ movl %ecx,40(%esp)
+ movl %esi,44(%esp)
+ movl %edx,52(%esp)
+ movl %edi,56(%esp)
+ movl 92(%esp),%ebx
+ movl 124(%esp),%edi
+ movl 112(%esp),%edx
+ movl 80(%esp),%ebp
+ movl 96(%esp),%ecx
+ movl 100(%esp),%esi
+ addl $1,%edx
+ movl %ebx,28(%esp)
+ movl %edi,60(%esp)
+ movl %edx,112(%esp)
+ movl $10,%ebx
+ jmp .L002loop
+.align 16
+.L002loop:
+ addl %ebp,%eax
+ movl %ebx,128(%esp)
+ movl %ebp,%ebx
+ xorl %eax,%edx
+ roll $16,%edx
+ addl %edx,%ecx
+ xorl %ecx,%ebx
+ movl 52(%esp),%edi
+ roll $12,%ebx
+ movl 20(%esp),%ebp
+ addl %ebx,%eax
+ xorl %eax,%edx
+ movl %eax,(%esp)
+ roll $8,%edx
+ movl 4(%esp),%eax
+ addl %edx,%ecx
+ movl %edx,48(%esp)
+ xorl %ecx,%ebx
+ addl %ebp,%eax
+ roll $7,%ebx
+ xorl %eax,%edi
+ movl %ecx,32(%esp)
+ roll $16,%edi
+ movl %ebx,16(%esp)
+ addl %edi,%esi
+ movl 40(%esp),%ecx
+ xorl %esi,%ebp
+ movl 56(%esp),%edx
+ roll $12,%ebp
+ movl 24(%esp),%ebx
+ addl %ebp,%eax
+ xorl %eax,%edi
+ movl %eax,4(%esp)
+ roll $8,%edi
+ movl 8(%esp),%eax
+ addl %edi,%esi
+ movl %edi,52(%esp)
+ xorl %esi,%ebp
+ addl %ebx,%eax
+ roll $7,%ebp
+ xorl %eax,%edx
+ movl %esi,36(%esp)
+ roll $16,%edx
+ movl %ebp,20(%esp)
+ addl %edx,%ecx
+ movl 44(%esp),%esi
+ xorl %ecx,%ebx
+ movl 60(%esp),%edi
+ roll $12,%ebx
+ movl 28(%esp),%ebp
+ addl %ebx,%eax
+ xorl %eax,%edx
+ movl %eax,8(%esp)
+ roll $8,%edx
+ movl 12(%esp),%eax
+ addl %edx,%ecx
+ movl %edx,56(%esp)
+ xorl %ecx,%ebx
+ addl %ebp,%eax
+ roll $7,%ebx
+ xorl %eax,%edi
+ roll $16,%edi
+ movl %ebx,24(%esp)
+ addl %edi,%esi
+ xorl %esi,%ebp
+ roll $12,%ebp
+ movl 20(%esp),%ebx
+ addl %ebp,%eax
+ xorl %eax,%edi
+ movl %eax,12(%esp)
+ roll $8,%edi
+ movl (%esp),%eax
+ addl %edi,%esi
+ movl %edi,%edx
+ xorl %esi,%ebp
+ addl %ebx,%eax
+ roll $7,%ebp
+ xorl %eax,%edx
+ roll $16,%edx
+ movl %ebp,28(%esp)
+ addl %edx,%ecx
+ xorl %ecx,%ebx
+ movl 48(%esp),%edi
+ roll $12,%ebx
+ movl 24(%esp),%ebp
+ addl %ebx,%eax
+ xorl %eax,%edx
+ movl %eax,(%esp)
+ roll $8,%edx
+ movl 4(%esp),%eax
+ addl %edx,%ecx
+ movl %edx,60(%esp)
+ xorl %ecx,%ebx
+ addl %ebp,%eax
+ roll $7,%ebx
+ xorl %eax,%edi
+ movl %ecx,40(%esp)
+ roll $16,%edi
+ movl %ebx,20(%esp)
+ addl %edi,%esi
+ movl 32(%esp),%ecx
+ xorl %esi,%ebp
+ movl 52(%esp),%edx
+ roll $12,%ebp
+ movl 28(%esp),%ebx
+ addl %ebp,%eax
+ xorl %eax,%edi
+ movl %eax,4(%esp)
+ roll $8,%edi
+ movl 8(%esp),%eax
+ addl %edi,%esi
+ movl %edi,48(%esp)
+ xorl %esi,%ebp
+ addl %ebx,%eax
+ roll $7,%ebp
+ xorl %eax,%edx
+ movl %esi,44(%esp)
+ roll $16,%edx
+ movl %ebp,24(%esp)
+ addl %edx,%ecx
+ movl 36(%esp),%esi
+ xorl %ecx,%ebx
+ movl 56(%esp),%edi
+ roll $12,%ebx
+ movl 16(%esp),%ebp
+ addl %ebx,%eax
+ xorl %eax,%edx
+ movl %eax,8(%esp)
+ roll $8,%edx
+ movl 12(%esp),%eax
+ addl %edx,%ecx
+ movl %edx,52(%esp)
+ xorl %ecx,%ebx
+ addl %ebp,%eax
+ roll $7,%ebx
+ xorl %eax,%edi
+ roll $16,%edi
+ movl %ebx,28(%esp)
+ addl %edi,%esi
+ xorl %esi,%ebp
+ movl 48(%esp),%edx
+ roll $12,%ebp
+ movl 128(%esp),%ebx
+ addl %ebp,%eax
+ xorl %eax,%edi
+ movl %eax,12(%esp)
+ roll $8,%edi
+ movl (%esp),%eax
+ addl %edi,%esi
+ movl %edi,56(%esp)
+ xorl %esi,%ebp
+ roll $7,%ebp
+ decl %ebx
+ jnz .L002loop
+ movl 160(%esp),%ebx
+ addl $1634760805,%eax
+ addl 80(%esp),%ebp
+ addl 96(%esp),%ecx
+ addl 100(%esp),%esi
+ cmpl $64,%ebx
+ jb .L003tail
+ movl 156(%esp),%ebx
+ addl 112(%esp),%edx
+ addl 120(%esp),%edi
+ xorl (%ebx),%eax
+ xorl 16(%ebx),%ebp
+ movl %eax,(%esp)
+ movl 152(%esp),%eax
+ xorl 32(%ebx),%ecx
+ xorl 36(%ebx),%esi
+ xorl 48(%ebx),%edx
+ xorl 56(%ebx),%edi
+ movl %ebp,16(%eax)
+ movl %ecx,32(%eax)
+ movl %esi,36(%eax)
+ movl %edx,48(%eax)
+ movl %edi,56(%eax)
+ movl 4(%esp),%ebp
+ movl 8(%esp),%ecx
+ movl 12(%esp),%esi
+ movl 20(%esp),%edx
+ movl 24(%esp),%edi
+ addl $857760878,%ebp
+ addl $2036477234,%ecx
+ addl $1797285236,%esi
+ addl 84(%esp),%edx
+ addl 88(%esp),%edi
+ xorl 4(%ebx),%ebp
+ xorl 8(%ebx),%ecx
+ xorl 12(%ebx),%esi
+ xorl 20(%ebx),%edx
+ xorl 24(%ebx),%edi
+ movl %ebp,4(%eax)
+ movl %ecx,8(%eax)
+ movl %esi,12(%eax)
+ movl %edx,20(%eax)
+ movl %edi,24(%eax)
+ movl 28(%esp),%ebp
+ movl 40(%esp),%ecx
+ movl 44(%esp),%esi
+ movl 52(%esp),%edx
+ movl 60(%esp),%edi
+ addl 92(%esp),%ebp
+ addl 104(%esp),%ecx
+ addl 108(%esp),%esi
+ addl 116(%esp),%edx
+ addl 124(%esp),%edi
+ xorl 28(%ebx),%ebp
+ xorl 40(%ebx),%ecx
+ xorl 44(%ebx),%esi
+ xorl 52(%ebx),%edx
+ xorl 60(%ebx),%edi
+ leal 64(%ebx),%ebx
+ movl %ebp,28(%eax)
+ movl (%esp),%ebp
+ movl %ecx,40(%eax)
+ movl 160(%esp),%ecx
+ movl %esi,44(%eax)
+ movl %edx,52(%eax)
+ movl %edi,60(%eax)
+ movl %ebp,(%eax)
+ leal 64(%eax),%eax
+ subl $64,%ecx
+ jnz .L001outer_loop
+ jmp .L004done
+.L003tail:
+ addl 112(%esp),%edx
+ addl 120(%esp),%edi
+ movl %eax,(%esp)
+ movl %ebp,16(%esp)
+ movl %ecx,32(%esp)
+ movl %esi,36(%esp)
+ movl %edx,48(%esp)
+ movl %edi,56(%esp)
+ movl 4(%esp),%ebp
+ movl 8(%esp),%ecx
+ movl 12(%esp),%esi
+ movl 20(%esp),%edx
+ movl 24(%esp),%edi
+ addl $857760878,%ebp
+ addl $2036477234,%ecx
+ addl $1797285236,%esi
+ addl 84(%esp),%edx
+ addl 88(%esp),%edi
+ movl %ebp,4(%esp)
+ movl %ecx,8(%esp)
+ movl %esi,12(%esp)
+ movl %edx,20(%esp)
+ movl %edi,24(%esp)
+ movl 28(%esp),%ebp
+ movl 40(%esp),%ecx
+ movl 44(%esp),%esi
+ movl 52(%esp),%edx
+ movl 60(%esp),%edi
+ addl 92(%esp),%ebp
+ addl 104(%esp),%ecx
+ addl 108(%esp),%esi
+ addl 116(%esp),%edx
+ addl 124(%esp),%edi
+ movl %ebp,28(%esp)
+ movl 156(%esp),%ebp
+ movl %ecx,40(%esp)
+ movl 152(%esp),%ecx
+ movl %esi,44(%esp)
+ xorl %esi,%esi
+ movl %edx,52(%esp)
+ movl %edi,60(%esp)
+ xorl %eax,%eax
+ xorl %edx,%edx
+.L005tail_loop:
+ movb (%esi,%ebp,1),%al
+ movb (%esp,%esi,1),%dl
+ leal 1(%esi),%esi
+ xorb %dl,%al
+ movb %al,-1(%ecx,%esi,1)
+ decl %ebx
+ jnz .L005tail_loop
+.L004done:
+ addl $132,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size ChaCha20_ctr32_nohw,.-.L_ChaCha20_ctr32_nohw_begin
+.globl ChaCha20_ctr32_ssse3
+.hidden ChaCha20_ctr32_ssse3
+.type ChaCha20_ctr32_ssse3,@function
+.align 16
+ChaCha20_ctr32_ssse3:
+.L_ChaCha20_ctr32_ssse3_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call .Lpic_point
+.Lpic_point:
+ popl %eax
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl %esp,%ebp
+ subl $524,%esp
+ andl $-64,%esp
+ movl %ebp,512(%esp)
+ leal .Lssse3_data-.Lpic_point(%eax),%eax
+ movdqu (%ebx),%xmm3
+ cmpl $256,%ecx
+ jb .L0061x
+ movl %edx,516(%esp)
+ movl %ebx,520(%esp)
+ subl $256,%ecx
+ leal 384(%esp),%ebp
+ movdqu (%edx),%xmm7
+ pshufd $0,%xmm3,%xmm0
+ pshufd $85,%xmm3,%xmm1
+ pshufd $170,%xmm3,%xmm2
+ pshufd $255,%xmm3,%xmm3
+ paddd 48(%eax),%xmm0
+ pshufd $0,%xmm7,%xmm4
+ pshufd $85,%xmm7,%xmm5
+ psubd 64(%eax),%xmm0
+ pshufd $170,%xmm7,%xmm6
+ pshufd $255,%xmm7,%xmm7
+ movdqa %xmm0,64(%ebp)
+ movdqa %xmm1,80(%ebp)
+ movdqa %xmm2,96(%ebp)
+ movdqa %xmm3,112(%ebp)
+ movdqu 16(%edx),%xmm3
+ movdqa %xmm4,-64(%ebp)
+ movdqa %xmm5,-48(%ebp)
+ movdqa %xmm6,-32(%ebp)
+ movdqa %xmm7,-16(%ebp)
+ movdqa 32(%eax),%xmm7
+ leal 128(%esp),%ebx
+ pshufd $0,%xmm3,%xmm0
+ pshufd $85,%xmm3,%xmm1
+ pshufd $170,%xmm3,%xmm2
+ pshufd $255,%xmm3,%xmm3
+ pshufd $0,%xmm7,%xmm4
+ pshufd $85,%xmm7,%xmm5
+ pshufd $170,%xmm7,%xmm6
+ pshufd $255,%xmm7,%xmm7
+ movdqa %xmm0,(%ebp)
+ movdqa %xmm1,16(%ebp)
+ movdqa %xmm2,32(%ebp)
+ movdqa %xmm3,48(%ebp)
+ movdqa %xmm4,-128(%ebp)
+ movdqa %xmm5,-112(%ebp)
+ movdqa %xmm6,-96(%ebp)
+ movdqa %xmm7,-80(%ebp)
+ leal 128(%esi),%esi
+ leal 128(%edi),%edi
+ jmp .L007outer_loop
+.align 16
+.L007outer_loop:
+ movdqa -112(%ebp),%xmm1
+ movdqa -96(%ebp),%xmm2
+ movdqa -80(%ebp),%xmm3
+ movdqa -48(%ebp),%xmm5
+ movdqa -32(%ebp),%xmm6
+ movdqa -16(%ebp),%xmm7
+ movdqa %xmm1,-112(%ebx)
+ movdqa %xmm2,-96(%ebx)
+ movdqa %xmm3,-80(%ebx)
+ movdqa %xmm5,-48(%ebx)
+ movdqa %xmm6,-32(%ebx)
+ movdqa %xmm7,-16(%ebx)
+ movdqa 32(%ebp),%xmm2
+ movdqa 48(%ebp),%xmm3
+ movdqa 64(%ebp),%xmm4
+ movdqa 80(%ebp),%xmm5
+ movdqa 96(%ebp),%xmm6
+ movdqa 112(%ebp),%xmm7
+ paddd 64(%eax),%xmm4
+ movdqa %xmm2,32(%ebx)
+ movdqa %xmm3,48(%ebx)
+ movdqa %xmm4,64(%ebx)
+ movdqa %xmm5,80(%ebx)
+ movdqa %xmm6,96(%ebx)
+ movdqa %xmm7,112(%ebx)
+ movdqa %xmm4,64(%ebp)
+ movdqa -128(%ebp),%xmm0
+ movdqa %xmm4,%xmm6
+ movdqa -64(%ebp),%xmm3
+ movdqa (%ebp),%xmm4
+ movdqa 16(%ebp),%xmm5
+ movl $10,%edx
+ nop
+.align 16
+.L008loop:
+ paddd %xmm3,%xmm0
+ movdqa %xmm3,%xmm2
+ pxor %xmm0,%xmm6
+ pshufb (%eax),%xmm6
+ paddd %xmm6,%xmm4
+ pxor %xmm4,%xmm2
+ movdqa -48(%ebx),%xmm3
+ movdqa %xmm2,%xmm1
+ pslld $12,%xmm2
+ psrld $20,%xmm1
+ por %xmm1,%xmm2
+ movdqa -112(%ebx),%xmm1
+ paddd %xmm2,%xmm0
+ movdqa 80(%ebx),%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm0,-128(%ebx)
+ pshufb 16(%eax),%xmm6
+ paddd %xmm6,%xmm4
+ movdqa %xmm6,64(%ebx)
+ pxor %xmm4,%xmm2
+ paddd %xmm3,%xmm1
+ movdqa %xmm2,%xmm0
+ pslld $7,%xmm2
+ psrld $25,%xmm0
+ pxor %xmm1,%xmm7
+ por %xmm0,%xmm2
+ movdqa %xmm4,(%ebx)
+ pshufb (%eax),%xmm7
+ movdqa %xmm2,-64(%ebx)
+ paddd %xmm7,%xmm5
+ movdqa 32(%ebx),%xmm4
+ pxor %xmm5,%xmm3
+ movdqa -32(%ebx),%xmm2
+ movdqa %xmm3,%xmm0
+ pslld $12,%xmm3
+ psrld $20,%xmm0
+ por %xmm0,%xmm3
+ movdqa -96(%ebx),%xmm0
+ paddd %xmm3,%xmm1
+ movdqa 96(%ebx),%xmm6
+ pxor %xmm1,%xmm7
+ movdqa %xmm1,-112(%ebx)
+ pshufb 16(%eax),%xmm7
+ paddd %xmm7,%xmm5
+ movdqa %xmm7,80(%ebx)
+ pxor %xmm5,%xmm3
+ paddd %xmm2,%xmm0
+ movdqa %xmm3,%xmm1
+ pslld $7,%xmm3
+ psrld $25,%xmm1
+ pxor %xmm0,%xmm6
+ por %xmm1,%xmm3
+ movdqa %xmm5,16(%ebx)
+ pshufb (%eax),%xmm6
+ movdqa %xmm3,-48(%ebx)
+ paddd %xmm6,%xmm4
+ movdqa 48(%ebx),%xmm5
+ pxor %xmm4,%xmm2
+ movdqa -16(%ebx),%xmm3
+ movdqa %xmm2,%xmm1
+ pslld $12,%xmm2
+ psrld $20,%xmm1
+ por %xmm1,%xmm2
+ movdqa -80(%ebx),%xmm1
+ paddd %xmm2,%xmm0
+ movdqa 112(%ebx),%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm0,-96(%ebx)
+ pshufb 16(%eax),%xmm6
+ paddd %xmm6,%xmm4
+ movdqa %xmm6,96(%ebx)
+ pxor %xmm4,%xmm2
+ paddd %xmm3,%xmm1
+ movdqa %xmm2,%xmm0
+ pslld $7,%xmm2
+ psrld $25,%xmm0
+ pxor %xmm1,%xmm7
+ por %xmm0,%xmm2
+ pshufb (%eax),%xmm7
+ movdqa %xmm2,-32(%ebx)
+ paddd %xmm7,%xmm5
+ pxor %xmm5,%xmm3
+ movdqa -48(%ebx),%xmm2
+ movdqa %xmm3,%xmm0
+ pslld $12,%xmm3
+ psrld $20,%xmm0
+ por %xmm0,%xmm3
+ movdqa -128(%ebx),%xmm0
+ paddd %xmm3,%xmm1
+ pxor %xmm1,%xmm7
+ movdqa %xmm1,-80(%ebx)
+ pshufb 16(%eax),%xmm7
+ paddd %xmm7,%xmm5
+ movdqa %xmm7,%xmm6
+ pxor %xmm5,%xmm3
+ paddd %xmm2,%xmm0
+ movdqa %xmm3,%xmm1
+ pslld $7,%xmm3
+ psrld $25,%xmm1
+ pxor %xmm0,%xmm6
+ por %xmm1,%xmm3
+ pshufb (%eax),%xmm6
+ movdqa %xmm3,-16(%ebx)
+ paddd %xmm6,%xmm4
+ pxor %xmm4,%xmm2
+ movdqa -32(%ebx),%xmm3
+ movdqa %xmm2,%xmm1
+ pslld $12,%xmm2
+ psrld $20,%xmm1
+ por %xmm1,%xmm2
+ movdqa -112(%ebx),%xmm1
+ paddd %xmm2,%xmm0
+ movdqa 64(%ebx),%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm0,-128(%ebx)
+ pshufb 16(%eax),%xmm6
+ paddd %xmm6,%xmm4
+ movdqa %xmm6,112(%ebx)
+ pxor %xmm4,%xmm2
+ paddd %xmm3,%xmm1
+ movdqa %xmm2,%xmm0
+ pslld $7,%xmm2
+ psrld $25,%xmm0
+ pxor %xmm1,%xmm7
+ por %xmm0,%xmm2
+ movdqa %xmm4,32(%ebx)
+ pshufb (%eax),%xmm7
+ movdqa %xmm2,-48(%ebx)
+ paddd %xmm7,%xmm5
+ movdqa (%ebx),%xmm4
+ pxor %xmm5,%xmm3
+ movdqa -16(%ebx),%xmm2
+ movdqa %xmm3,%xmm0
+ pslld $12,%xmm3
+ psrld $20,%xmm0
+ por %xmm0,%xmm3
+ movdqa -96(%ebx),%xmm0
+ paddd %xmm3,%xmm1
+ movdqa 80(%ebx),%xmm6
+ pxor %xmm1,%xmm7
+ movdqa %xmm1,-112(%ebx)
+ pshufb 16(%eax),%xmm7
+ paddd %xmm7,%xmm5
+ movdqa %xmm7,64(%ebx)
+ pxor %xmm5,%xmm3
+ paddd %xmm2,%xmm0
+ movdqa %xmm3,%xmm1
+ pslld $7,%xmm3
+ psrld $25,%xmm1
+ pxor %xmm0,%xmm6
+ por %xmm1,%xmm3
+ movdqa %xmm5,48(%ebx)
+ pshufb (%eax),%xmm6
+ movdqa %xmm3,-32(%ebx)
+ paddd %xmm6,%xmm4
+ movdqa 16(%ebx),%xmm5
+ pxor %xmm4,%xmm2
+ movdqa -64(%ebx),%xmm3
+ movdqa %xmm2,%xmm1
+ pslld $12,%xmm2
+ psrld $20,%xmm1
+ por %xmm1,%xmm2
+ movdqa -80(%ebx),%xmm1
+ paddd %xmm2,%xmm0
+ movdqa 96(%ebx),%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm0,-96(%ebx)
+ pshufb 16(%eax),%xmm6
+ paddd %xmm6,%xmm4
+ movdqa %xmm6,80(%ebx)
+ pxor %xmm4,%xmm2
+ paddd %xmm3,%xmm1
+ movdqa %xmm2,%xmm0
+ pslld $7,%xmm2
+ psrld $25,%xmm0
+ pxor %xmm1,%xmm7
+ por %xmm0,%xmm2
+ pshufb (%eax),%xmm7
+ movdqa %xmm2,-16(%ebx)
+ paddd %xmm7,%xmm5
+ pxor %xmm5,%xmm3
+ movdqa %xmm3,%xmm0
+ pslld $12,%xmm3
+ psrld $20,%xmm0
+ por %xmm0,%xmm3
+ movdqa -128(%ebx),%xmm0
+ paddd %xmm3,%xmm1
+ movdqa 64(%ebx),%xmm6
+ pxor %xmm1,%xmm7
+ movdqa %xmm1,-80(%ebx)
+ pshufb 16(%eax),%xmm7
+ paddd %xmm7,%xmm5
+ movdqa %xmm7,96(%ebx)
+ pxor %xmm5,%xmm3
+ movdqa %xmm3,%xmm1
+ pslld $7,%xmm3
+ psrld $25,%xmm1
+ por %xmm1,%xmm3
+ decl %edx
+ jnz .L008loop
+ movdqa %xmm3,-64(%ebx)
+ movdqa %xmm4,(%ebx)
+ movdqa %xmm5,16(%ebx)
+ movdqa %xmm6,64(%ebx)
+ movdqa %xmm7,96(%ebx)
+ movdqa -112(%ebx),%xmm1
+ movdqa -96(%ebx),%xmm2
+ movdqa -80(%ebx),%xmm3
+ paddd -128(%ebp),%xmm0
+ paddd -112(%ebp),%xmm1
+ paddd -96(%ebp),%xmm2
+ paddd -80(%ebp),%xmm3
+ movdqa %xmm0,%xmm6
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm6
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm6,%xmm3
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ movdqu -128(%esi),%xmm4
+ movdqu -64(%esi),%xmm5
+ movdqu (%esi),%xmm2
+ movdqu 64(%esi),%xmm7
+ leal 16(%esi),%esi
+ pxor %xmm0,%xmm4
+ movdqa -64(%ebx),%xmm0
+ pxor %xmm1,%xmm5
+ movdqa -48(%ebx),%xmm1
+ pxor %xmm2,%xmm6
+ movdqa -32(%ebx),%xmm2
+ pxor %xmm3,%xmm7
+ movdqa -16(%ebx),%xmm3
+ movdqu %xmm4,-128(%edi)
+ movdqu %xmm5,-64(%edi)
+ movdqu %xmm6,(%edi)
+ movdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ paddd -64(%ebp),%xmm0
+ paddd -48(%ebp),%xmm1
+ paddd -32(%ebp),%xmm2
+ paddd -16(%ebp),%xmm3
+ movdqa %xmm0,%xmm6
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm6
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm6,%xmm3
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ movdqu -128(%esi),%xmm4
+ movdqu -64(%esi),%xmm5
+ movdqu (%esi),%xmm2
+ movdqu 64(%esi),%xmm7
+ leal 16(%esi),%esi
+ pxor %xmm0,%xmm4
+ movdqa (%ebx),%xmm0
+ pxor %xmm1,%xmm5
+ movdqa 16(%ebx),%xmm1
+ pxor %xmm2,%xmm6
+ movdqa 32(%ebx),%xmm2
+ pxor %xmm3,%xmm7
+ movdqa 48(%ebx),%xmm3
+ movdqu %xmm4,-128(%edi)
+ movdqu %xmm5,-64(%edi)
+ movdqu %xmm6,(%edi)
+ movdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ paddd (%ebp),%xmm0
+ paddd 16(%ebp),%xmm1
+ paddd 32(%ebp),%xmm2
+ paddd 48(%ebp),%xmm3
+ movdqa %xmm0,%xmm6
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm6
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm6,%xmm3
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ movdqu -128(%esi),%xmm4
+ movdqu -64(%esi),%xmm5
+ movdqu (%esi),%xmm2
+ movdqu 64(%esi),%xmm7
+ leal 16(%esi),%esi
+ pxor %xmm0,%xmm4
+ movdqa 64(%ebx),%xmm0
+ pxor %xmm1,%xmm5
+ movdqa 80(%ebx),%xmm1
+ pxor %xmm2,%xmm6
+ movdqa 96(%ebx),%xmm2
+ pxor %xmm3,%xmm7
+ movdqa 112(%ebx),%xmm3
+ movdqu %xmm4,-128(%edi)
+ movdqu %xmm5,-64(%edi)
+ movdqu %xmm6,(%edi)
+ movdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ paddd 64(%ebp),%xmm0
+ paddd 80(%ebp),%xmm1
+ paddd 96(%ebp),%xmm2
+ paddd 112(%ebp),%xmm3
+ movdqa %xmm0,%xmm6
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm6
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm6,%xmm3
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ movdqu -128(%esi),%xmm4
+ movdqu -64(%esi),%xmm5
+ movdqu (%esi),%xmm2
+ movdqu 64(%esi),%xmm7
+ leal 208(%esi),%esi
+ pxor %xmm0,%xmm4
+ pxor %xmm1,%xmm5
+ pxor %xmm2,%xmm6
+ pxor %xmm3,%xmm7
+ movdqu %xmm4,-128(%edi)
+ movdqu %xmm5,-64(%edi)
+ movdqu %xmm6,(%edi)
+ movdqu %xmm7,64(%edi)
+ leal 208(%edi),%edi
+ subl $256,%ecx
+ jnc .L007outer_loop
+ addl $256,%ecx
+ jz .L009done
+ movl 520(%esp),%ebx
+ leal -128(%esi),%esi
+ movl 516(%esp),%edx
+ leal -128(%edi),%edi
+ movd 64(%ebp),%xmm2
+ movdqu (%ebx),%xmm3
+ paddd 96(%eax),%xmm2
+ pand 112(%eax),%xmm3
+ por %xmm2,%xmm3
+.L0061x:
+ movdqa 32(%eax),%xmm0
+ movdqu (%edx),%xmm1
+ movdqu 16(%edx),%xmm2
+ movdqa (%eax),%xmm6
+ movdqa 16(%eax),%xmm7
+ movl %ebp,48(%esp)
+ movdqa %xmm0,(%esp)
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ movl $10,%edx
+ jmp .L010loop1x
+.align 16
+.L011outer1x:
+ movdqa 80(%eax),%xmm3
+ movdqa (%esp),%xmm0
+ movdqa 16(%esp),%xmm1
+ movdqa 32(%esp),%xmm2
+ paddd 48(%esp),%xmm3
+ movl $10,%edx
+ movdqa %xmm3,48(%esp)
+ jmp .L010loop1x
+.align 16
+.L010loop1x:
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm1,%xmm1
+ pshufd $147,%xmm3,%xmm3
+ nop
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm1,%xmm1
+ pshufd $57,%xmm3,%xmm3
+ decl %edx
+ jnz .L010loop1x
+ paddd (%esp),%xmm0
+ paddd 16(%esp),%xmm1
+ paddd 32(%esp),%xmm2
+ paddd 48(%esp),%xmm3
+ cmpl $64,%ecx
+ jb .L012tail
+ movdqu (%esi),%xmm4
+ movdqu 16(%esi),%xmm5
+ pxor %xmm4,%xmm0
+ movdqu 32(%esi),%xmm4
+ pxor %xmm5,%xmm1
+ movdqu 48(%esi),%xmm5
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm3
+ leal 64(%esi),%esi
+ movdqu %xmm0,(%edi)
+ movdqu %xmm1,16(%edi)
+ movdqu %xmm2,32(%edi)
+ movdqu %xmm3,48(%edi)
+ leal 64(%edi),%edi
+ subl $64,%ecx
+ jnz .L011outer1x
+ jmp .L009done
+.L012tail:
+ movdqa %xmm0,(%esp)
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ xorl %eax,%eax
+ xorl %edx,%edx
+ xorl %ebp,%ebp
+.L013tail_loop:
+ movb (%esp,%ebp,1),%al
+ movb (%esi,%ebp,1),%dl
+ leal 1(%ebp),%ebp
+ xorb %dl,%al
+ movb %al,-1(%edi,%ebp,1)
+ decl %ecx
+ jnz .L013tail_loop
+.L009done:
+ movl 512(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size ChaCha20_ctr32_ssse3,.-.L_ChaCha20_ctr32_ssse3_begin
+.align 64
+.Lssse3_data:
+.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+.long 1634760805,857760878,2036477234,1797285236
+.long 0,1,2,3
+.long 4,4,4,4
+.long 1,0,0,0
+.long 4,0,0,0
+.long 0,-1,-1,-1
+.align 64
+.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
+.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
+.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
+.byte 114,103,62,0
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/crypto/chacha-x86-win.asm b/gen/crypto/chacha-x86-win.asm
new file mode 100644
index 0000000..d709da0
--- /dev/null
+++ b/gen/crypto/chacha-x86-win.asm
@@ -0,0 +1,966 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section code use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section .text code align=64
+%else
+section .text code
+%endif
+global _ChaCha20_ctr32_nohw
+align 16
+_ChaCha20_ctr32_nohw:
+L$_ChaCha20_ctr32_nohw_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov esi,DWORD [32+esp]
+ mov edi,DWORD [36+esp]
+ sub esp,132
+ mov eax,DWORD [esi]
+ mov ebx,DWORD [4+esi]
+ mov ecx,DWORD [8+esi]
+ mov edx,DWORD [12+esi]
+ mov DWORD [80+esp],eax
+ mov DWORD [84+esp],ebx
+ mov DWORD [88+esp],ecx
+ mov DWORD [92+esp],edx
+ mov eax,DWORD [16+esi]
+ mov ebx,DWORD [20+esi]
+ mov ecx,DWORD [24+esi]
+ mov edx,DWORD [28+esi]
+ mov DWORD [96+esp],eax
+ mov DWORD [100+esp],ebx
+ mov DWORD [104+esp],ecx
+ mov DWORD [108+esp],edx
+ mov eax,DWORD [edi]
+ mov ebx,DWORD [4+edi]
+ mov ecx,DWORD [8+edi]
+ mov edx,DWORD [12+edi]
+ sub eax,1
+ mov DWORD [112+esp],eax
+ mov DWORD [116+esp],ebx
+ mov DWORD [120+esp],ecx
+ mov DWORD [124+esp],edx
+ jmp NEAR L$000entry
+align 16
+L$001outer_loop:
+ mov DWORD [156+esp],ebx
+ mov DWORD [152+esp],eax
+ mov DWORD [160+esp],ecx
+L$000entry:
+ mov eax,1634760805
+ mov DWORD [4+esp],857760878
+ mov DWORD [8+esp],2036477234
+ mov DWORD [12+esp],1797285236
+ mov ebx,DWORD [84+esp]
+ mov ebp,DWORD [88+esp]
+ mov ecx,DWORD [104+esp]
+ mov esi,DWORD [108+esp]
+ mov edx,DWORD [116+esp]
+ mov edi,DWORD [120+esp]
+ mov DWORD [20+esp],ebx
+ mov DWORD [24+esp],ebp
+ mov DWORD [40+esp],ecx
+ mov DWORD [44+esp],esi
+ mov DWORD [52+esp],edx
+ mov DWORD [56+esp],edi
+ mov ebx,DWORD [92+esp]
+ mov edi,DWORD [124+esp]
+ mov edx,DWORD [112+esp]
+ mov ebp,DWORD [80+esp]
+ mov ecx,DWORD [96+esp]
+ mov esi,DWORD [100+esp]
+ add edx,1
+ mov DWORD [28+esp],ebx
+ mov DWORD [60+esp],edi
+ mov DWORD [112+esp],edx
+ mov ebx,10
+ jmp NEAR L$002loop
+align 16
+L$002loop:
+ add eax,ebp
+ mov DWORD [128+esp],ebx
+ mov ebx,ebp
+ xor edx,eax
+ rol edx,16
+ add ecx,edx
+ xor ebx,ecx
+ mov edi,DWORD [52+esp]
+ rol ebx,12
+ mov ebp,DWORD [20+esp]
+ add eax,ebx
+ xor edx,eax
+ mov DWORD [esp],eax
+ rol edx,8
+ mov eax,DWORD [4+esp]
+ add ecx,edx
+ mov DWORD [48+esp],edx
+ xor ebx,ecx
+ add eax,ebp
+ rol ebx,7
+ xor edi,eax
+ mov DWORD [32+esp],ecx
+ rol edi,16
+ mov DWORD [16+esp],ebx
+ add esi,edi
+ mov ecx,DWORD [40+esp]
+ xor ebp,esi
+ mov edx,DWORD [56+esp]
+ rol ebp,12
+ mov ebx,DWORD [24+esp]
+ add eax,ebp
+ xor edi,eax
+ mov DWORD [4+esp],eax
+ rol edi,8
+ mov eax,DWORD [8+esp]
+ add esi,edi
+ mov DWORD [52+esp],edi
+ xor ebp,esi
+ add eax,ebx
+ rol ebp,7
+ xor edx,eax
+ mov DWORD [36+esp],esi
+ rol edx,16
+ mov DWORD [20+esp],ebp
+ add ecx,edx
+ mov esi,DWORD [44+esp]
+ xor ebx,ecx
+ mov edi,DWORD [60+esp]
+ rol ebx,12
+ mov ebp,DWORD [28+esp]
+ add eax,ebx
+ xor edx,eax
+ mov DWORD [8+esp],eax
+ rol edx,8
+ mov eax,DWORD [12+esp]
+ add ecx,edx
+ mov DWORD [56+esp],edx
+ xor ebx,ecx
+ add eax,ebp
+ rol ebx,7
+ xor edi,eax
+ rol edi,16
+ mov DWORD [24+esp],ebx
+ add esi,edi
+ xor ebp,esi
+ rol ebp,12
+ mov ebx,DWORD [20+esp]
+ add eax,ebp
+ xor edi,eax
+ mov DWORD [12+esp],eax
+ rol edi,8
+ mov eax,DWORD [esp]
+ add esi,edi
+ mov edx,edi
+ xor ebp,esi
+ add eax,ebx
+ rol ebp,7
+ xor edx,eax
+ rol edx,16
+ mov DWORD [28+esp],ebp
+ add ecx,edx
+ xor ebx,ecx
+ mov edi,DWORD [48+esp]
+ rol ebx,12
+ mov ebp,DWORD [24+esp]
+ add eax,ebx
+ xor edx,eax
+ mov DWORD [esp],eax
+ rol edx,8
+ mov eax,DWORD [4+esp]
+ add ecx,edx
+ mov DWORD [60+esp],edx
+ xor ebx,ecx
+ add eax,ebp
+ rol ebx,7
+ xor edi,eax
+ mov DWORD [40+esp],ecx
+ rol edi,16
+ mov DWORD [20+esp],ebx
+ add esi,edi
+ mov ecx,DWORD [32+esp]
+ xor ebp,esi
+ mov edx,DWORD [52+esp]
+ rol ebp,12
+ mov ebx,DWORD [28+esp]
+ add eax,ebp
+ xor edi,eax
+ mov DWORD [4+esp],eax
+ rol edi,8
+ mov eax,DWORD [8+esp]
+ add esi,edi
+ mov DWORD [48+esp],edi
+ xor ebp,esi
+ add eax,ebx
+ rol ebp,7
+ xor edx,eax
+ mov DWORD [44+esp],esi
+ rol edx,16
+ mov DWORD [24+esp],ebp
+ add ecx,edx
+ mov esi,DWORD [36+esp]
+ xor ebx,ecx
+ mov edi,DWORD [56+esp]
+ rol ebx,12
+ mov ebp,DWORD [16+esp]
+ add eax,ebx
+ xor edx,eax
+ mov DWORD [8+esp],eax
+ rol edx,8
+ mov eax,DWORD [12+esp]
+ add ecx,edx
+ mov DWORD [52+esp],edx
+ xor ebx,ecx
+ add eax,ebp
+ rol ebx,7
+ xor edi,eax
+ rol edi,16
+ mov DWORD [28+esp],ebx
+ add esi,edi
+ xor ebp,esi
+ mov edx,DWORD [48+esp]
+ rol ebp,12
+ mov ebx,DWORD [128+esp]
+ add eax,ebp
+ xor edi,eax
+ mov DWORD [12+esp],eax
+ rol edi,8
+ mov eax,DWORD [esp]
+ add esi,edi
+ mov DWORD [56+esp],edi
+ xor ebp,esi
+ rol ebp,7
+ dec ebx
+ jnz NEAR L$002loop
+ mov ebx,DWORD [160+esp]
+ add eax,1634760805
+ add ebp,DWORD [80+esp]
+ add ecx,DWORD [96+esp]
+ add esi,DWORD [100+esp]
+ cmp ebx,64
+ jb NEAR L$003tail
+ mov ebx,DWORD [156+esp]
+ add edx,DWORD [112+esp]
+ add edi,DWORD [120+esp]
+ xor eax,DWORD [ebx]
+ xor ebp,DWORD [16+ebx]
+ mov DWORD [esp],eax
+ mov eax,DWORD [152+esp]
+ xor ecx,DWORD [32+ebx]
+ xor esi,DWORD [36+ebx]
+ xor edx,DWORD [48+ebx]
+ xor edi,DWORD [56+ebx]
+ mov DWORD [16+eax],ebp
+ mov DWORD [32+eax],ecx
+ mov DWORD [36+eax],esi
+ mov DWORD [48+eax],edx
+ mov DWORD [56+eax],edi
+ mov ebp,DWORD [4+esp]
+ mov ecx,DWORD [8+esp]
+ mov esi,DWORD [12+esp]
+ mov edx,DWORD [20+esp]
+ mov edi,DWORD [24+esp]
+ add ebp,857760878
+ add ecx,2036477234
+ add esi,1797285236
+ add edx,DWORD [84+esp]
+ add edi,DWORD [88+esp]
+ xor ebp,DWORD [4+ebx]
+ xor ecx,DWORD [8+ebx]
+ xor esi,DWORD [12+ebx]
+ xor edx,DWORD [20+ebx]
+ xor edi,DWORD [24+ebx]
+ mov DWORD [4+eax],ebp
+ mov DWORD [8+eax],ecx
+ mov DWORD [12+eax],esi
+ mov DWORD [20+eax],edx
+ mov DWORD [24+eax],edi
+ mov ebp,DWORD [28+esp]
+ mov ecx,DWORD [40+esp]
+ mov esi,DWORD [44+esp]
+ mov edx,DWORD [52+esp]
+ mov edi,DWORD [60+esp]
+ add ebp,DWORD [92+esp]
+ add ecx,DWORD [104+esp]
+ add esi,DWORD [108+esp]
+ add edx,DWORD [116+esp]
+ add edi,DWORD [124+esp]
+ xor ebp,DWORD [28+ebx]
+ xor ecx,DWORD [40+ebx]
+ xor esi,DWORD [44+ebx]
+ xor edx,DWORD [52+ebx]
+ xor edi,DWORD [60+ebx]
+ lea ebx,[64+ebx]
+ mov DWORD [28+eax],ebp
+ mov ebp,DWORD [esp]
+ mov DWORD [40+eax],ecx
+ mov ecx,DWORD [160+esp]
+ mov DWORD [44+eax],esi
+ mov DWORD [52+eax],edx
+ mov DWORD [60+eax],edi
+ mov DWORD [eax],ebp
+ lea eax,[64+eax]
+ sub ecx,64
+ jnz NEAR L$001outer_loop
+ jmp NEAR L$004done
+L$003tail:
+ add edx,DWORD [112+esp]
+ add edi,DWORD [120+esp]
+ mov DWORD [esp],eax
+ mov DWORD [16+esp],ebp
+ mov DWORD [32+esp],ecx
+ mov DWORD [36+esp],esi
+ mov DWORD [48+esp],edx
+ mov DWORD [56+esp],edi
+ mov ebp,DWORD [4+esp]
+ mov ecx,DWORD [8+esp]
+ mov esi,DWORD [12+esp]
+ mov edx,DWORD [20+esp]
+ mov edi,DWORD [24+esp]
+ add ebp,857760878
+ add ecx,2036477234
+ add esi,1797285236
+ add edx,DWORD [84+esp]
+ add edi,DWORD [88+esp]
+ mov DWORD [4+esp],ebp
+ mov DWORD [8+esp],ecx
+ mov DWORD [12+esp],esi
+ mov DWORD [20+esp],edx
+ mov DWORD [24+esp],edi
+ mov ebp,DWORD [28+esp]
+ mov ecx,DWORD [40+esp]
+ mov esi,DWORD [44+esp]
+ mov edx,DWORD [52+esp]
+ mov edi,DWORD [60+esp]
+ add ebp,DWORD [92+esp]
+ add ecx,DWORD [104+esp]
+ add esi,DWORD [108+esp]
+ add edx,DWORD [116+esp]
+ add edi,DWORD [124+esp]
+ mov DWORD [28+esp],ebp
+ mov ebp,DWORD [156+esp]
+ mov DWORD [40+esp],ecx
+ mov ecx,DWORD [152+esp]
+ mov DWORD [44+esp],esi
+ xor esi,esi
+ mov DWORD [52+esp],edx
+ mov DWORD [60+esp],edi
+ xor eax,eax
+ xor edx,edx
+L$005tail_loop:
+ mov al,BYTE [ebp*1+esi]
+ mov dl,BYTE [esi*1+esp]
+ lea esi,[1+esi]
+ xor al,dl
+ mov BYTE [esi*1+ecx-1],al
+ dec ebx
+ jnz NEAR L$005tail_loop
+L$004done:
+ add esp,132
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _ChaCha20_ctr32_ssse3
+align 16
+_ChaCha20_ctr32_ssse3:
+L$_ChaCha20_ctr32_ssse3_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ call L$pic_point
+L$pic_point:
+ pop eax
+ mov edi,DWORD [20+esp]
+ mov esi,DWORD [24+esp]
+ mov ecx,DWORD [28+esp]
+ mov edx,DWORD [32+esp]
+ mov ebx,DWORD [36+esp]
+ mov ebp,esp
+ sub esp,524
+ and esp,-64
+ mov DWORD [512+esp],ebp
+ lea eax,[(L$ssse3_data-L$pic_point)+eax]
+ movdqu xmm3,[ebx]
+ cmp ecx,256
+ jb NEAR L$0061x
+ mov DWORD [516+esp],edx
+ mov DWORD [520+esp],ebx
+ sub ecx,256
+ lea ebp,[384+esp]
+ movdqu xmm7,[edx]
+ pshufd xmm0,xmm3,0
+ pshufd xmm1,xmm3,85
+ pshufd xmm2,xmm3,170
+ pshufd xmm3,xmm3,255
+ paddd xmm0,[48+eax]
+ pshufd xmm4,xmm7,0
+ pshufd xmm5,xmm7,85
+ psubd xmm0,[64+eax]
+ pshufd xmm6,xmm7,170
+ pshufd xmm7,xmm7,255
+ movdqa [64+ebp],xmm0
+ movdqa [80+ebp],xmm1
+ movdqa [96+ebp],xmm2
+ movdqa [112+ebp],xmm3
+ movdqu xmm3,[16+edx]
+ movdqa [ebp-64],xmm4
+ movdqa [ebp-48],xmm5
+ movdqa [ebp-32],xmm6
+ movdqa [ebp-16],xmm7
+ movdqa xmm7,[32+eax]
+ lea ebx,[128+esp]
+ pshufd xmm0,xmm3,0
+ pshufd xmm1,xmm3,85
+ pshufd xmm2,xmm3,170
+ pshufd xmm3,xmm3,255
+ pshufd xmm4,xmm7,0
+ pshufd xmm5,xmm7,85
+ pshufd xmm6,xmm7,170
+ pshufd xmm7,xmm7,255
+ movdqa [ebp],xmm0
+ movdqa [16+ebp],xmm1
+ movdqa [32+ebp],xmm2
+ movdqa [48+ebp],xmm3
+ movdqa [ebp-128],xmm4
+ movdqa [ebp-112],xmm5
+ movdqa [ebp-96],xmm6
+ movdqa [ebp-80],xmm7
+ lea esi,[128+esi]
+ lea edi,[128+edi]
+ jmp NEAR L$007outer_loop
+align 16
+L$007outer_loop:
+ movdqa xmm1,[ebp-112]
+ movdqa xmm2,[ebp-96]
+ movdqa xmm3,[ebp-80]
+ movdqa xmm5,[ebp-48]
+ movdqa xmm6,[ebp-32]
+ movdqa xmm7,[ebp-16]
+ movdqa [ebx-112],xmm1
+ movdqa [ebx-96],xmm2
+ movdqa [ebx-80],xmm3
+ movdqa [ebx-48],xmm5
+ movdqa [ebx-32],xmm6
+ movdqa [ebx-16],xmm7
+ movdqa xmm2,[32+ebp]
+ movdqa xmm3,[48+ebp]
+ movdqa xmm4,[64+ebp]
+ movdqa xmm5,[80+ebp]
+ movdqa xmm6,[96+ebp]
+ movdqa xmm7,[112+ebp]
+ paddd xmm4,[64+eax]
+ movdqa [32+ebx],xmm2
+ movdqa [48+ebx],xmm3
+ movdqa [64+ebx],xmm4
+ movdqa [80+ebx],xmm5
+ movdqa [96+ebx],xmm6
+ movdqa [112+ebx],xmm7
+ movdqa [64+ebp],xmm4
+ movdqa xmm0,[ebp-128]
+ movdqa xmm6,xmm4
+ movdqa xmm3,[ebp-64]
+ movdqa xmm4,[ebp]
+ movdqa xmm5,[16+ebp]
+ mov edx,10
+ nop
+align 16
+L$008loop:
+ paddd xmm0,xmm3
+ movdqa xmm2,xmm3
+ pxor xmm6,xmm0
+ pshufb xmm6,[eax]
+ paddd xmm4,xmm6
+ pxor xmm2,xmm4
+ movdqa xmm3,[ebx-48]
+ movdqa xmm1,xmm2
+ pslld xmm2,12
+ psrld xmm1,20
+ por xmm2,xmm1
+ movdqa xmm1,[ebx-112]
+ paddd xmm0,xmm2
+ movdqa xmm7,[80+ebx]
+ pxor xmm6,xmm0
+ movdqa [ebx-128],xmm0
+ pshufb xmm6,[16+eax]
+ paddd xmm4,xmm6
+ movdqa [64+ebx],xmm6
+ pxor xmm2,xmm4
+ paddd xmm1,xmm3
+ movdqa xmm0,xmm2
+ pslld xmm2,7
+ psrld xmm0,25
+ pxor xmm7,xmm1
+ por xmm2,xmm0
+ movdqa [ebx],xmm4
+ pshufb xmm7,[eax]
+ movdqa [ebx-64],xmm2
+ paddd xmm5,xmm7
+ movdqa xmm4,[32+ebx]
+ pxor xmm3,xmm5
+ movdqa xmm2,[ebx-32]
+ movdqa xmm0,xmm3
+ pslld xmm3,12
+ psrld xmm0,20
+ por xmm3,xmm0
+ movdqa xmm0,[ebx-96]
+ paddd xmm1,xmm3
+ movdqa xmm6,[96+ebx]
+ pxor xmm7,xmm1
+ movdqa [ebx-112],xmm1
+ pshufb xmm7,[16+eax]
+ paddd xmm5,xmm7
+ movdqa [80+ebx],xmm7
+ pxor xmm3,xmm5
+ paddd xmm0,xmm2
+ movdqa xmm1,xmm3
+ pslld xmm3,7
+ psrld xmm1,25
+ pxor xmm6,xmm0
+ por xmm3,xmm1
+ movdqa [16+ebx],xmm5
+ pshufb xmm6,[eax]
+ movdqa [ebx-48],xmm3
+ paddd xmm4,xmm6
+ movdqa xmm5,[48+ebx]
+ pxor xmm2,xmm4
+ movdqa xmm3,[ebx-16]
+ movdqa xmm1,xmm2
+ pslld xmm2,12
+ psrld xmm1,20
+ por xmm2,xmm1
+ movdqa xmm1,[ebx-80]
+ paddd xmm0,xmm2
+ movdqa xmm7,[112+ebx]
+ pxor xmm6,xmm0
+ movdqa [ebx-96],xmm0
+ pshufb xmm6,[16+eax]
+ paddd xmm4,xmm6
+ movdqa [96+ebx],xmm6
+ pxor xmm2,xmm4
+ paddd xmm1,xmm3
+ movdqa xmm0,xmm2
+ pslld xmm2,7
+ psrld xmm0,25
+ pxor xmm7,xmm1
+ por xmm2,xmm0
+ pshufb xmm7,[eax]
+ movdqa [ebx-32],xmm2
+ paddd xmm5,xmm7
+ pxor xmm3,xmm5
+ movdqa xmm2,[ebx-48]
+ movdqa xmm0,xmm3
+ pslld xmm3,12
+ psrld xmm0,20
+ por xmm3,xmm0
+ movdqa xmm0,[ebx-128]
+ paddd xmm1,xmm3
+ pxor xmm7,xmm1
+ movdqa [ebx-80],xmm1
+ pshufb xmm7,[16+eax]
+ paddd xmm5,xmm7
+ movdqa xmm6,xmm7
+ pxor xmm3,xmm5
+ paddd xmm0,xmm2
+ movdqa xmm1,xmm3
+ pslld xmm3,7
+ psrld xmm1,25
+ pxor xmm6,xmm0
+ por xmm3,xmm1
+ pshufb xmm6,[eax]
+ movdqa [ebx-16],xmm3
+ paddd xmm4,xmm6
+ pxor xmm2,xmm4
+ movdqa xmm3,[ebx-32]
+ movdqa xmm1,xmm2
+ pslld xmm2,12
+ psrld xmm1,20
+ por xmm2,xmm1
+ movdqa xmm1,[ebx-112]
+ paddd xmm0,xmm2
+ movdqa xmm7,[64+ebx]
+ pxor xmm6,xmm0
+ movdqa [ebx-128],xmm0
+ pshufb xmm6,[16+eax]
+ paddd xmm4,xmm6
+ movdqa [112+ebx],xmm6
+ pxor xmm2,xmm4
+ paddd xmm1,xmm3
+ movdqa xmm0,xmm2
+ pslld xmm2,7
+ psrld xmm0,25
+ pxor xmm7,xmm1
+ por xmm2,xmm0
+ movdqa [32+ebx],xmm4
+ pshufb xmm7,[eax]
+ movdqa [ebx-48],xmm2
+ paddd xmm5,xmm7
+ movdqa xmm4,[ebx]
+ pxor xmm3,xmm5
+ movdqa xmm2,[ebx-16]
+ movdqa xmm0,xmm3
+ pslld xmm3,12
+ psrld xmm0,20
+ por xmm3,xmm0
+ movdqa xmm0,[ebx-96]
+ paddd xmm1,xmm3
+ movdqa xmm6,[80+ebx]
+ pxor xmm7,xmm1
+ movdqa [ebx-112],xmm1
+ pshufb xmm7,[16+eax]
+ paddd xmm5,xmm7
+ movdqa [64+ebx],xmm7
+ pxor xmm3,xmm5
+ paddd xmm0,xmm2
+ movdqa xmm1,xmm3
+ pslld xmm3,7
+ psrld xmm1,25
+ pxor xmm6,xmm0
+ por xmm3,xmm1
+ movdqa [48+ebx],xmm5
+ pshufb xmm6,[eax]
+ movdqa [ebx-32],xmm3
+ paddd xmm4,xmm6
+ movdqa xmm5,[16+ebx]
+ pxor xmm2,xmm4
+ movdqa xmm3,[ebx-64]
+ movdqa xmm1,xmm2
+ pslld xmm2,12
+ psrld xmm1,20
+ por xmm2,xmm1
+ movdqa xmm1,[ebx-80]
+ paddd xmm0,xmm2
+ movdqa xmm7,[96+ebx]
+ pxor xmm6,xmm0
+ movdqa [ebx-96],xmm0
+ pshufb xmm6,[16+eax]
+ paddd xmm4,xmm6
+ movdqa [80+ebx],xmm6
+ pxor xmm2,xmm4
+ paddd xmm1,xmm3
+ movdqa xmm0,xmm2
+ pslld xmm2,7
+ psrld xmm0,25
+ pxor xmm7,xmm1
+ por xmm2,xmm0
+ pshufb xmm7,[eax]
+ movdqa [ebx-16],xmm2
+ paddd xmm5,xmm7
+ pxor xmm3,xmm5
+ movdqa xmm0,xmm3
+ pslld xmm3,12
+ psrld xmm0,20
+ por xmm3,xmm0
+ movdqa xmm0,[ebx-128]
+ paddd xmm1,xmm3
+ movdqa xmm6,[64+ebx]
+ pxor xmm7,xmm1
+ movdqa [ebx-80],xmm1
+ pshufb xmm7,[16+eax]
+ paddd xmm5,xmm7
+ movdqa [96+ebx],xmm7
+ pxor xmm3,xmm5
+ movdqa xmm1,xmm3
+ pslld xmm3,7
+ psrld xmm1,25
+ por xmm3,xmm1
+ dec edx
+ jnz NEAR L$008loop
+ movdqa [ebx-64],xmm3
+ movdqa [ebx],xmm4
+ movdqa [16+ebx],xmm5
+ movdqa [64+ebx],xmm6
+ movdqa [96+ebx],xmm7
+ movdqa xmm1,[ebx-112]
+ movdqa xmm2,[ebx-96]
+ movdqa xmm3,[ebx-80]
+ paddd xmm0,[ebp-128]
+ paddd xmm1,[ebp-112]
+ paddd xmm2,[ebp-96]
+ paddd xmm3,[ebp-80]
+ movdqa xmm6,xmm0
+ punpckldq xmm0,xmm1
+ movdqa xmm7,xmm2
+ punpckldq xmm2,xmm3
+ punpckhdq xmm6,xmm1
+ punpckhdq xmm7,xmm3
+ movdqa xmm1,xmm0
+ punpcklqdq xmm0,xmm2
+ movdqa xmm3,xmm6
+ punpcklqdq xmm6,xmm7
+ punpckhqdq xmm1,xmm2
+ punpckhqdq xmm3,xmm7
+ movdqu xmm4,[esi-128]
+ movdqu xmm5,[esi-64]
+ movdqu xmm2,[esi]
+ movdqu xmm7,[64+esi]
+ lea esi,[16+esi]
+ pxor xmm4,xmm0
+ movdqa xmm0,[ebx-64]
+ pxor xmm5,xmm1
+ movdqa xmm1,[ebx-48]
+ pxor xmm6,xmm2
+ movdqa xmm2,[ebx-32]
+ pxor xmm7,xmm3
+ movdqa xmm3,[ebx-16]
+ movdqu [edi-128],xmm4
+ movdqu [edi-64],xmm5
+ movdqu [edi],xmm6
+ movdqu [64+edi],xmm7
+ lea edi,[16+edi]
+ paddd xmm0,[ebp-64]
+ paddd xmm1,[ebp-48]
+ paddd xmm2,[ebp-32]
+ paddd xmm3,[ebp-16]
+ movdqa xmm6,xmm0
+ punpckldq xmm0,xmm1
+ movdqa xmm7,xmm2
+ punpckldq xmm2,xmm3
+ punpckhdq xmm6,xmm1
+ punpckhdq xmm7,xmm3
+ movdqa xmm1,xmm0
+ punpcklqdq xmm0,xmm2
+ movdqa xmm3,xmm6
+ punpcklqdq xmm6,xmm7
+ punpckhqdq xmm1,xmm2
+ punpckhqdq xmm3,xmm7
+ movdqu xmm4,[esi-128]
+ movdqu xmm5,[esi-64]
+ movdqu xmm2,[esi]
+ movdqu xmm7,[64+esi]
+ lea esi,[16+esi]
+ pxor xmm4,xmm0
+ movdqa xmm0,[ebx]
+ pxor xmm5,xmm1
+ movdqa xmm1,[16+ebx]
+ pxor xmm6,xmm2
+ movdqa xmm2,[32+ebx]
+ pxor xmm7,xmm3
+ movdqa xmm3,[48+ebx]
+ movdqu [edi-128],xmm4
+ movdqu [edi-64],xmm5
+ movdqu [edi],xmm6
+ movdqu [64+edi],xmm7
+ lea edi,[16+edi]
+ paddd xmm0,[ebp]
+ paddd xmm1,[16+ebp]
+ paddd xmm2,[32+ebp]
+ paddd xmm3,[48+ebp]
+ movdqa xmm6,xmm0
+ punpckldq xmm0,xmm1
+ movdqa xmm7,xmm2
+ punpckldq xmm2,xmm3
+ punpckhdq xmm6,xmm1
+ punpckhdq xmm7,xmm3
+ movdqa xmm1,xmm0
+ punpcklqdq xmm0,xmm2
+ movdqa xmm3,xmm6
+ punpcklqdq xmm6,xmm7
+ punpckhqdq xmm1,xmm2
+ punpckhqdq xmm3,xmm7
+ movdqu xmm4,[esi-128]
+ movdqu xmm5,[esi-64]
+ movdqu xmm2,[esi]
+ movdqu xmm7,[64+esi]
+ lea esi,[16+esi]
+ pxor xmm4,xmm0
+ movdqa xmm0,[64+ebx]
+ pxor xmm5,xmm1
+ movdqa xmm1,[80+ebx]
+ pxor xmm6,xmm2
+ movdqa xmm2,[96+ebx]
+ pxor xmm7,xmm3
+ movdqa xmm3,[112+ebx]
+ movdqu [edi-128],xmm4
+ movdqu [edi-64],xmm5
+ movdqu [edi],xmm6
+ movdqu [64+edi],xmm7
+ lea edi,[16+edi]
+ paddd xmm0,[64+ebp]
+ paddd xmm1,[80+ebp]
+ paddd xmm2,[96+ebp]
+ paddd xmm3,[112+ebp]
+ movdqa xmm6,xmm0
+ punpckldq xmm0,xmm1
+ movdqa xmm7,xmm2
+ punpckldq xmm2,xmm3
+ punpckhdq xmm6,xmm1
+ punpckhdq xmm7,xmm3
+ movdqa xmm1,xmm0
+ punpcklqdq xmm0,xmm2
+ movdqa xmm3,xmm6
+ punpcklqdq xmm6,xmm7
+ punpckhqdq xmm1,xmm2
+ punpckhqdq xmm3,xmm7
+ movdqu xmm4,[esi-128]
+ movdqu xmm5,[esi-64]
+ movdqu xmm2,[esi]
+ movdqu xmm7,[64+esi]
+ lea esi,[208+esi]
+ pxor xmm4,xmm0
+ pxor xmm5,xmm1
+ pxor xmm6,xmm2
+ pxor xmm7,xmm3
+ movdqu [edi-128],xmm4
+ movdqu [edi-64],xmm5
+ movdqu [edi],xmm6
+ movdqu [64+edi],xmm7
+ lea edi,[208+edi]
+ sub ecx,256
+ jnc NEAR L$007outer_loop
+ add ecx,256
+ jz NEAR L$009done
+ mov ebx,DWORD [520+esp]
+ lea esi,[esi-128]
+ mov edx,DWORD [516+esp]
+ lea edi,[edi-128]
+ movd xmm2,DWORD [64+ebp]
+ movdqu xmm3,[ebx]
+ paddd xmm2,[96+eax]
+ pand xmm3,[112+eax]
+ por xmm3,xmm2
+L$0061x:
+ movdqa xmm0,[32+eax]
+ movdqu xmm1,[edx]
+ movdqu xmm2,[16+edx]
+ movdqa xmm6,[eax]
+ movdqa xmm7,[16+eax]
+ mov DWORD [48+esp],ebp
+ movdqa [esp],xmm0
+ movdqa [16+esp],xmm1
+ movdqa [32+esp],xmm2
+ movdqa [48+esp],xmm3
+ mov edx,10
+ jmp NEAR L$010loop1x
+align 16
+L$011outer1x:
+ movdqa xmm3,[80+eax]
+ movdqa xmm0,[esp]
+ movdqa xmm1,[16+esp]
+ movdqa xmm2,[32+esp]
+ paddd xmm3,[48+esp]
+ mov edx,10
+ movdqa [48+esp],xmm3
+ jmp NEAR L$010loop1x
+align 16
+L$010loop1x:
+ paddd xmm0,xmm1
+ pxor xmm3,xmm0
+db 102,15,56,0,222
+ paddd xmm2,xmm3
+ pxor xmm1,xmm2
+ movdqa xmm4,xmm1
+ psrld xmm1,20
+ pslld xmm4,12
+ por xmm1,xmm4
+ paddd xmm0,xmm1
+ pxor xmm3,xmm0
+db 102,15,56,0,223
+ paddd xmm2,xmm3
+ pxor xmm1,xmm2
+ movdqa xmm4,xmm1
+ psrld xmm1,25
+ pslld xmm4,7
+ por xmm1,xmm4
+ pshufd xmm2,xmm2,78
+ pshufd xmm1,xmm1,57
+ pshufd xmm3,xmm3,147
+ nop
+ paddd xmm0,xmm1
+ pxor xmm3,xmm0
+db 102,15,56,0,222
+ paddd xmm2,xmm3
+ pxor xmm1,xmm2
+ movdqa xmm4,xmm1
+ psrld xmm1,20
+ pslld xmm4,12
+ por xmm1,xmm4
+ paddd xmm0,xmm1
+ pxor xmm3,xmm0
+db 102,15,56,0,223
+ paddd xmm2,xmm3
+ pxor xmm1,xmm2
+ movdqa xmm4,xmm1
+ psrld xmm1,25
+ pslld xmm4,7
+ por xmm1,xmm4
+ pshufd xmm2,xmm2,78
+ pshufd xmm1,xmm1,147
+ pshufd xmm3,xmm3,57
+ dec edx
+ jnz NEAR L$010loop1x
+ paddd xmm0,[esp]
+ paddd xmm1,[16+esp]
+ paddd xmm2,[32+esp]
+ paddd xmm3,[48+esp]
+ cmp ecx,64
+ jb NEAR L$012tail
+ movdqu xmm4,[esi]
+ movdqu xmm5,[16+esi]
+ pxor xmm0,xmm4
+ movdqu xmm4,[32+esi]
+ pxor xmm1,xmm5
+ movdqu xmm5,[48+esi]
+ pxor xmm2,xmm4
+ pxor xmm3,xmm5
+ lea esi,[64+esi]
+ movdqu [edi],xmm0
+ movdqu [16+edi],xmm1
+ movdqu [32+edi],xmm2
+ movdqu [48+edi],xmm3
+ lea edi,[64+edi]
+ sub ecx,64
+ jnz NEAR L$011outer1x
+ jmp NEAR L$009done
+L$012tail:
+ movdqa [esp],xmm0
+ movdqa [16+esp],xmm1
+ movdqa [32+esp],xmm2
+ movdqa [48+esp],xmm3
+ xor eax,eax
+ xor edx,edx
+ xor ebp,ebp
+L$013tail_loop:
+ mov al,BYTE [ebp*1+esp]
+ mov dl,BYTE [ebp*1+esi]
+ lea ebp,[1+ebp]
+ xor al,dl
+ mov BYTE [ebp*1+edi-1],al
+ dec ecx
+ jnz NEAR L$013tail_loop
+L$009done:
+ mov esp,DWORD [512+esp]
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+align 64
+L$ssse3_data:
+db 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+db 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+dd 1634760805,857760878,2036477234,1797285236
+dd 0,1,2,3
+dd 4,4,4,4
+dd 1,0,0,0
+dd 4,0,0,0
+dd 0,-1,-1,-1
+align 64
+db 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
+db 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
+db 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
+db 114,103,62,0
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/crypto/chacha-x86_64-apple.S b/gen/crypto/chacha-x86_64-apple.S
new file mode 100644
index 0000000..a5e1207
--- /dev/null
+++ b/gen/crypto/chacha-x86_64-apple.S
@@ -0,0 +1,1604 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text
+
+.section __DATA,__const
+.p2align 6
+L$zero:
+.long 0,0,0,0
+L$one:
+.long 1,0,0,0
+L$inc:
+.long 0,1,2,3
+L$four:
+.long 4,4,4,4
+L$incy:
+.long 0,2,4,6,1,3,5,7
+L$eight:
+.long 8,8,8,8,8,8,8,8
+L$rot16:
+.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
+L$rot24:
+.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
+L$sigma:
+.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
+.p2align 6
+L$zeroz:
+.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+L$fourz:
+.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+L$incz:
+.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+L$sixteen:
+.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
+.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.text
+.globl _ChaCha20_ctr32_nohw
+.private_extern _ChaCha20_ctr32_nohw
+
+.p2align 6
+_ChaCha20_ctr32_nohw:
+
+_CET_ENDBR
+ pushq %rbx
+
+ pushq %rbp
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+ subq $64+24,%rsp
+
+L$ctr32_body:
+
+
+ movdqu (%rcx),%xmm1
+ movdqu 16(%rcx),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa L$one(%rip),%xmm4
+
+
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ movq %rdx,%rbp
+ jmp L$oop_outer
+
+.p2align 5
+L$oop_outer:
+ movl $0x61707865,%eax
+ movl $0x3320646e,%ebx
+ movl $0x79622d32,%ecx
+ movl $0x6b206574,%edx
+ movl 16(%rsp),%r8d
+ movl 20(%rsp),%r9d
+ movl 24(%rsp),%r10d
+ movl 28(%rsp),%r11d
+ movd %xmm3,%r12d
+ movl 52(%rsp),%r13d
+ movl 56(%rsp),%r14d
+ movl 60(%rsp),%r15d
+
+ movq %rbp,64+0(%rsp)
+ movl $10,%ebp
+ movq %rsi,64+8(%rsp)
+.byte 102,72,15,126,214
+ movq %rdi,64+16(%rsp)
+ movq %rsi,%rdi
+ shrq $32,%rdi
+ jmp L$oop
+
+.p2align 5
+L$oop:
+ addl %r8d,%eax
+ xorl %eax,%r12d
+ roll $16,%r12d
+ addl %r9d,%ebx
+ xorl %ebx,%r13d
+ roll $16,%r13d
+ addl %r12d,%esi
+ xorl %esi,%r8d
+ roll $12,%r8d
+ addl %r13d,%edi
+ xorl %edi,%r9d
+ roll $12,%r9d
+ addl %r8d,%eax
+ xorl %eax,%r12d
+ roll $8,%r12d
+ addl %r9d,%ebx
+ xorl %ebx,%r13d
+ roll $8,%r13d
+ addl %r12d,%esi
+ xorl %esi,%r8d
+ roll $7,%r8d
+ addl %r13d,%edi
+ xorl %edi,%r9d
+ roll $7,%r9d
+ movl %esi,32(%rsp)
+ movl %edi,36(%rsp)
+ movl 40(%rsp),%esi
+ movl 44(%rsp),%edi
+ addl %r10d,%ecx
+ xorl %ecx,%r14d
+ roll $16,%r14d
+ addl %r11d,%edx
+ xorl %edx,%r15d
+ roll $16,%r15d
+ addl %r14d,%esi
+ xorl %esi,%r10d
+ roll $12,%r10d
+ addl %r15d,%edi
+ xorl %edi,%r11d
+ roll $12,%r11d
+ addl %r10d,%ecx
+ xorl %ecx,%r14d
+ roll $8,%r14d
+ addl %r11d,%edx
+ xorl %edx,%r15d
+ roll $8,%r15d
+ addl %r14d,%esi
+ xorl %esi,%r10d
+ roll $7,%r10d
+ addl %r15d,%edi
+ xorl %edi,%r11d
+ roll $7,%r11d
+ addl %r9d,%eax
+ xorl %eax,%r15d
+ roll $16,%r15d
+ addl %r10d,%ebx
+ xorl %ebx,%r12d
+ roll $16,%r12d
+ addl %r15d,%esi
+ xorl %esi,%r9d
+ roll $12,%r9d
+ addl %r12d,%edi
+ xorl %edi,%r10d
+ roll $12,%r10d
+ addl %r9d,%eax
+ xorl %eax,%r15d
+ roll $8,%r15d
+ addl %r10d,%ebx
+ xorl %ebx,%r12d
+ roll $8,%r12d
+ addl %r15d,%esi
+ xorl %esi,%r9d
+ roll $7,%r9d
+ addl %r12d,%edi
+ xorl %edi,%r10d
+ roll $7,%r10d
+ movl %esi,40(%rsp)
+ movl %edi,44(%rsp)
+ movl 32(%rsp),%esi
+ movl 36(%rsp),%edi
+ addl %r11d,%ecx
+ xorl %ecx,%r13d
+ roll $16,%r13d
+ addl %r8d,%edx
+ xorl %edx,%r14d
+ roll $16,%r14d
+ addl %r13d,%esi
+ xorl %esi,%r11d
+ roll $12,%r11d
+ addl %r14d,%edi
+ xorl %edi,%r8d
+ roll $12,%r8d
+ addl %r11d,%ecx
+ xorl %ecx,%r13d
+ roll $8,%r13d
+ addl %r8d,%edx
+ xorl %edx,%r14d
+ roll $8,%r14d
+ addl %r13d,%esi
+ xorl %esi,%r11d
+ roll $7,%r11d
+ addl %r14d,%edi
+ xorl %edi,%r8d
+ roll $7,%r8d
+ decl %ebp
+ jnz L$oop
+ movl %edi,36(%rsp)
+ movl %esi,32(%rsp)
+ movq 64(%rsp),%rbp
+ movdqa %xmm2,%xmm1
+ movq 64+8(%rsp),%rsi
+ paddd %xmm4,%xmm3
+ movq 64+16(%rsp),%rdi
+
+ addl $0x61707865,%eax
+ addl $0x3320646e,%ebx
+ addl $0x79622d32,%ecx
+ addl $0x6b206574,%edx
+ addl 16(%rsp),%r8d
+ addl 20(%rsp),%r9d
+ addl 24(%rsp),%r10d
+ addl 28(%rsp),%r11d
+ addl 48(%rsp),%r12d
+ addl 52(%rsp),%r13d
+ addl 56(%rsp),%r14d
+ addl 60(%rsp),%r15d
+ paddd 32(%rsp),%xmm1
+
+ cmpq $64,%rbp
+ jb L$tail
+
+ xorl 0(%rsi),%eax
+ xorl 4(%rsi),%ebx
+ xorl 8(%rsi),%ecx
+ xorl 12(%rsi),%edx
+ xorl 16(%rsi),%r8d
+ xorl 20(%rsi),%r9d
+ xorl 24(%rsi),%r10d
+ xorl 28(%rsi),%r11d
+ movdqu 32(%rsi),%xmm0
+ xorl 48(%rsi),%r12d
+ xorl 52(%rsi),%r13d
+ xorl 56(%rsi),%r14d
+ xorl 60(%rsi),%r15d
+ leaq 64(%rsi),%rsi
+ pxor %xmm1,%xmm0
+
+ movdqa %xmm2,32(%rsp)
+ movd %xmm3,48(%rsp)
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ movdqu %xmm0,32(%rdi)
+ movl %r12d,48(%rdi)
+ movl %r13d,52(%rdi)
+ movl %r14d,56(%rdi)
+ movl %r15d,60(%rdi)
+ leaq 64(%rdi),%rdi
+
+ subq $64,%rbp
+ jnz L$oop_outer
+
+ jmp L$done
+
+.p2align 4
+L$tail:
+ movl %eax,0(%rsp)
+ movl %ebx,4(%rsp)
+ xorq %rbx,%rbx
+ movl %ecx,8(%rsp)
+ movl %edx,12(%rsp)
+ movl %r8d,16(%rsp)
+ movl %r9d,20(%rsp)
+ movl %r10d,24(%rsp)
+ movl %r11d,28(%rsp)
+ movdqa %xmm1,32(%rsp)
+ movl %r12d,48(%rsp)
+ movl %r13d,52(%rsp)
+ movl %r14d,56(%rsp)
+ movl %r15d,60(%rsp)
+
+L$oop_tail:
+ movzbl (%rsi,%rbx,1),%eax
+ movzbl (%rsp,%rbx,1),%edx
+ leaq 1(%rbx),%rbx
+ xorl %edx,%eax
+ movb %al,-1(%rdi,%rbx,1)
+ decq %rbp
+ jnz L$oop_tail
+
+L$done:
+ leaq 64+24+48(%rsp),%rsi
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
+L$no_data:
+ ret
+
+
+.globl _ChaCha20_ctr32_ssse3
+.private_extern _ChaCha20_ctr32_ssse3
+
+.p2align 5
+_ChaCha20_ctr32_ssse3:
+
+_CET_ENDBR
+ movq %rsp,%r9
+
+ subq $64+8,%rsp
+ movdqa L$sigma(%rip),%xmm0
+ movdqu (%rcx),%xmm1
+ movdqu 16(%rcx),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa L$rot16(%rip),%xmm6
+ movdqa L$rot24(%rip),%xmm7
+
+ movdqa %xmm0,0(%rsp)
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ movq $10,%r8
+ jmp L$oop_ssse3
+
+.p2align 5
+L$oop_outer_ssse3:
+ movdqa L$one(%rip),%xmm3
+ movdqa 0(%rsp),%xmm0
+ movdqa 16(%rsp),%xmm1
+ movdqa 32(%rsp),%xmm2
+ paddd 48(%rsp),%xmm3
+ movq $10,%r8
+ movdqa %xmm3,48(%rsp)
+ jmp L$oop_ssse3
+
+.p2align 5
+L$oop_ssse3:
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm1,%xmm1
+ pshufd $147,%xmm3,%xmm3
+ nop
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm1,%xmm1
+ pshufd $57,%xmm3,%xmm3
+ decq %r8
+ jnz L$oop_ssse3
+ paddd 0(%rsp),%xmm0
+ paddd 16(%rsp),%xmm1
+ paddd 32(%rsp),%xmm2
+ paddd 48(%rsp),%xmm3
+
+ cmpq $64,%rdx
+ jb L$tail_ssse3
+
+ movdqu 0(%rsi),%xmm4
+ movdqu 16(%rsi),%xmm5
+ pxor %xmm4,%xmm0
+ movdqu 32(%rsi),%xmm4
+ pxor %xmm5,%xmm1
+ movdqu 48(%rsi),%xmm5
+ leaq 64(%rsi),%rsi
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm3
+
+ movdqu %xmm0,0(%rdi)
+ movdqu %xmm1,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ subq $64,%rdx
+ jnz L$oop_outer_ssse3
+
+ jmp L$done_ssse3
+
+.p2align 4
+L$tail_ssse3:
+ movdqa %xmm0,0(%rsp)
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ xorq %r8,%r8
+
+L$oop_tail_ssse3:
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r8,1)
+ decq %rdx
+ jnz L$oop_tail_ssse3
+
+L$done_ssse3:
+ leaq (%r9),%rsp
+
+L$ssse3_epilogue:
+ ret
+
+
+.globl _ChaCha20_ctr32_ssse3_4x
+.private_extern _ChaCha20_ctr32_ssse3_4x
+
+.p2align 5
+_ChaCha20_ctr32_ssse3_4x:
+
+_CET_ENDBR
+ movq %rsp,%r9
+
+ movq %r10,%r11
+ subq $0x140+8,%rsp
+ movdqa L$sigma(%rip),%xmm11
+ movdqu (%rcx),%xmm15
+ movdqu 16(%rcx),%xmm7
+ movdqu (%r8),%xmm3
+ leaq 256(%rsp),%rcx
+ leaq L$rot16(%rip),%r10
+ leaq L$rot24(%rip),%r11
+
+ pshufd $0x00,%xmm11,%xmm8
+ pshufd $0x55,%xmm11,%xmm9
+ movdqa %xmm8,64(%rsp)
+ pshufd $0xaa,%xmm11,%xmm10
+ movdqa %xmm9,80(%rsp)
+ pshufd $0xff,%xmm11,%xmm11
+ movdqa %xmm10,96(%rsp)
+ movdqa %xmm11,112(%rsp)
+
+ pshufd $0x00,%xmm15,%xmm12
+ pshufd $0x55,%xmm15,%xmm13
+ movdqa %xmm12,128-256(%rcx)
+ pshufd $0xaa,%xmm15,%xmm14
+ movdqa %xmm13,144-256(%rcx)
+ pshufd $0xff,%xmm15,%xmm15
+ movdqa %xmm14,160-256(%rcx)
+ movdqa %xmm15,176-256(%rcx)
+
+ pshufd $0x00,%xmm7,%xmm4
+ pshufd $0x55,%xmm7,%xmm5
+ movdqa %xmm4,192-256(%rcx)
+ pshufd $0xaa,%xmm7,%xmm6
+ movdqa %xmm5,208-256(%rcx)
+ pshufd $0xff,%xmm7,%xmm7
+ movdqa %xmm6,224-256(%rcx)
+ movdqa %xmm7,240-256(%rcx)
+
+ pshufd $0x00,%xmm3,%xmm0
+ pshufd $0x55,%xmm3,%xmm1
+ paddd L$inc(%rip),%xmm0
+ pshufd $0xaa,%xmm3,%xmm2
+ movdqa %xmm1,272-256(%rcx)
+ pshufd $0xff,%xmm3,%xmm3
+ movdqa %xmm2,288-256(%rcx)
+ movdqa %xmm3,304-256(%rcx)
+
+ jmp L$oop_enter4x
+
+.p2align 5
+L$oop_outer4x:
+ movdqa 64(%rsp),%xmm8
+ movdqa 80(%rsp),%xmm9
+ movdqa 96(%rsp),%xmm10
+ movdqa 112(%rsp),%xmm11
+ movdqa 128-256(%rcx),%xmm12
+ movdqa 144-256(%rcx),%xmm13
+ movdqa 160-256(%rcx),%xmm14
+ movdqa 176-256(%rcx),%xmm15
+ movdqa 192-256(%rcx),%xmm4
+ movdqa 208-256(%rcx),%xmm5
+ movdqa 224-256(%rcx),%xmm6
+ movdqa 240-256(%rcx),%xmm7
+ movdqa 256-256(%rcx),%xmm0
+ movdqa 272-256(%rcx),%xmm1
+ movdqa 288-256(%rcx),%xmm2
+ movdqa 304-256(%rcx),%xmm3
+ paddd L$four(%rip),%xmm0
+
+L$oop_enter4x:
+ movdqa %xmm6,32(%rsp)
+ movdqa %xmm7,48(%rsp)
+ movdqa (%r10),%xmm7
+ movl $10,%eax
+ movdqa %xmm0,256-256(%rcx)
+ jmp L$oop4x
+
+.p2align 5
+L$oop4x:
+ paddd %xmm12,%xmm8
+ paddd %xmm13,%xmm9
+ pxor %xmm8,%xmm0
+ pxor %xmm9,%xmm1
+.byte 102,15,56,0,199
+.byte 102,15,56,0,207
+ paddd %xmm0,%xmm4
+ paddd %xmm1,%xmm5
+ pxor %xmm4,%xmm12
+ pxor %xmm5,%xmm13
+ movdqa %xmm12,%xmm6
+ pslld $12,%xmm12
+ psrld $20,%xmm6
+ movdqa %xmm13,%xmm7
+ pslld $12,%xmm13
+ por %xmm6,%xmm12
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm13
+ paddd %xmm12,%xmm8
+ paddd %xmm13,%xmm9
+ pxor %xmm8,%xmm0
+ pxor %xmm9,%xmm1
+.byte 102,15,56,0,198
+.byte 102,15,56,0,206
+ paddd %xmm0,%xmm4
+ paddd %xmm1,%xmm5
+ pxor %xmm4,%xmm12
+ pxor %xmm5,%xmm13
+ movdqa %xmm12,%xmm7
+ pslld $7,%xmm12
+ psrld $25,%xmm7
+ movdqa %xmm13,%xmm6
+ pslld $7,%xmm13
+ por %xmm7,%xmm12
+ psrld $25,%xmm6
+ movdqa (%r10),%xmm7
+ por %xmm6,%xmm13
+ movdqa %xmm4,0(%rsp)
+ movdqa %xmm5,16(%rsp)
+ movdqa 32(%rsp),%xmm4
+ movdqa 48(%rsp),%xmm5
+ paddd %xmm14,%xmm10
+ paddd %xmm15,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm3
+.byte 102,15,56,0,215
+.byte 102,15,56,0,223
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ pxor %xmm4,%xmm14
+ pxor %xmm5,%xmm15
+ movdqa %xmm14,%xmm6
+ pslld $12,%xmm14
+ psrld $20,%xmm6
+ movdqa %xmm15,%xmm7
+ pslld $12,%xmm15
+ por %xmm6,%xmm14
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm15
+ paddd %xmm14,%xmm10
+ paddd %xmm15,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm3
+.byte 102,15,56,0,214
+.byte 102,15,56,0,222
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ pxor %xmm4,%xmm14
+ pxor %xmm5,%xmm15
+ movdqa %xmm14,%xmm7
+ pslld $7,%xmm14
+ psrld $25,%xmm7
+ movdqa %xmm15,%xmm6
+ pslld $7,%xmm15
+ por %xmm7,%xmm14
+ psrld $25,%xmm6
+ movdqa (%r10),%xmm7
+ por %xmm6,%xmm15
+ paddd %xmm13,%xmm8
+ paddd %xmm14,%xmm9
+ pxor %xmm8,%xmm3
+ pxor %xmm9,%xmm0
+.byte 102,15,56,0,223
+.byte 102,15,56,0,199
+ paddd %xmm3,%xmm4
+ paddd %xmm0,%xmm5
+ pxor %xmm4,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm13,%xmm6
+ pslld $12,%xmm13
+ psrld $20,%xmm6
+ movdqa %xmm14,%xmm7
+ pslld $12,%xmm14
+ por %xmm6,%xmm13
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm14
+ paddd %xmm13,%xmm8
+ paddd %xmm14,%xmm9
+ pxor %xmm8,%xmm3
+ pxor %xmm9,%xmm0
+.byte 102,15,56,0,222
+.byte 102,15,56,0,198
+ paddd %xmm3,%xmm4
+ paddd %xmm0,%xmm5
+ pxor %xmm4,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm13,%xmm7
+ pslld $7,%xmm13
+ psrld $25,%xmm7
+ movdqa %xmm14,%xmm6
+ pslld $7,%xmm14
+ por %xmm7,%xmm13
+ psrld $25,%xmm6
+ movdqa (%r10),%xmm7
+ por %xmm6,%xmm14
+ movdqa %xmm4,32(%rsp)
+ movdqa %xmm5,48(%rsp)
+ movdqa 0(%rsp),%xmm4
+ movdqa 16(%rsp),%xmm5
+ paddd %xmm15,%xmm10
+ paddd %xmm12,%xmm11
+ pxor %xmm10,%xmm1
+ pxor %xmm11,%xmm2
+.byte 102,15,56,0,207
+.byte 102,15,56,0,215
+ paddd %xmm1,%xmm4
+ paddd %xmm2,%xmm5
+ pxor %xmm4,%xmm15
+ pxor %xmm5,%xmm12
+ movdqa %xmm15,%xmm6
+ pslld $12,%xmm15
+ psrld $20,%xmm6
+ movdqa %xmm12,%xmm7
+ pslld $12,%xmm12
+ por %xmm6,%xmm15
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm12
+ paddd %xmm15,%xmm10
+ paddd %xmm12,%xmm11
+ pxor %xmm10,%xmm1
+ pxor %xmm11,%xmm2
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+ paddd %xmm1,%xmm4
+ paddd %xmm2,%xmm5
+ pxor %xmm4,%xmm15
+ pxor %xmm5,%xmm12
+ movdqa %xmm15,%xmm7
+ pslld $7,%xmm15
+ psrld $25,%xmm7
+ movdqa %xmm12,%xmm6
+ pslld $7,%xmm12
+ por %xmm7,%xmm15
+ psrld $25,%xmm6
+ movdqa (%r10),%xmm7
+ por %xmm6,%xmm12
+ decl %eax
+ jnz L$oop4x
+
+ paddd 64(%rsp),%xmm8
+ paddd 80(%rsp),%xmm9
+ paddd 96(%rsp),%xmm10
+ paddd 112(%rsp),%xmm11
+
+ movdqa %xmm8,%xmm6
+ punpckldq %xmm9,%xmm8
+ movdqa %xmm10,%xmm7
+ punpckldq %xmm11,%xmm10
+ punpckhdq %xmm9,%xmm6
+ punpckhdq %xmm11,%xmm7
+ movdqa %xmm8,%xmm9
+ punpcklqdq %xmm10,%xmm8
+ movdqa %xmm6,%xmm11
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm10,%xmm9
+ punpckhqdq %xmm7,%xmm11
+ paddd 128-256(%rcx),%xmm12
+ paddd 144-256(%rcx),%xmm13
+ paddd 160-256(%rcx),%xmm14
+ paddd 176-256(%rcx),%xmm15
+
+ movdqa %xmm8,0(%rsp)
+ movdqa %xmm9,16(%rsp)
+ movdqa 32(%rsp),%xmm8
+ movdqa 48(%rsp),%xmm9
+
+ movdqa %xmm12,%xmm10
+ punpckldq %xmm13,%xmm12
+ movdqa %xmm14,%xmm7
+ punpckldq %xmm15,%xmm14
+ punpckhdq %xmm13,%xmm10
+ punpckhdq %xmm15,%xmm7
+ movdqa %xmm12,%xmm13
+ punpcklqdq %xmm14,%xmm12
+ movdqa %xmm10,%xmm15
+ punpcklqdq %xmm7,%xmm10
+ punpckhqdq %xmm14,%xmm13
+ punpckhqdq %xmm7,%xmm15
+ paddd 192-256(%rcx),%xmm4
+ paddd 208-256(%rcx),%xmm5
+ paddd 224-256(%rcx),%xmm8
+ paddd 240-256(%rcx),%xmm9
+
+ movdqa %xmm6,32(%rsp)
+ movdqa %xmm11,48(%rsp)
+
+ movdqa %xmm4,%xmm14
+ punpckldq %xmm5,%xmm4
+ movdqa %xmm8,%xmm7
+ punpckldq %xmm9,%xmm8
+ punpckhdq %xmm5,%xmm14
+ punpckhdq %xmm9,%xmm7
+ movdqa %xmm4,%xmm5
+ punpcklqdq %xmm8,%xmm4
+ movdqa %xmm14,%xmm9
+ punpcklqdq %xmm7,%xmm14
+ punpckhqdq %xmm8,%xmm5
+ punpckhqdq %xmm7,%xmm9
+ paddd 256-256(%rcx),%xmm0
+ paddd 272-256(%rcx),%xmm1
+ paddd 288-256(%rcx),%xmm2
+ paddd 304-256(%rcx),%xmm3
+
+ movdqa %xmm0,%xmm8
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm8
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm8,%xmm3
+ punpcklqdq %xmm7,%xmm8
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ cmpq $256,%rdx
+ jb L$tail4x
+
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+
+ movdqu %xmm6,64(%rdi)
+ movdqu 0(%rsi),%xmm6
+ movdqu %xmm11,80(%rdi)
+ movdqu 16(%rsi),%xmm11
+ movdqu %xmm2,96(%rdi)
+ movdqu 32(%rsi),%xmm2
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+ movdqu 48(%rsi),%xmm7
+ pxor 32(%rsp),%xmm6
+ pxor %xmm10,%xmm11
+ pxor %xmm14,%xmm2
+ pxor %xmm8,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 48(%rsp),%xmm6
+ pxor %xmm15,%xmm11
+ pxor %xmm9,%xmm2
+ pxor %xmm3,%xmm7
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm11,80(%rdi)
+ movdqu %xmm2,96(%rdi)
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $256,%rdx
+ jnz L$oop_outer4x
+
+ jmp L$done4x
+
+L$tail4x:
+ cmpq $192,%rdx
+ jae L$192_or_more4x
+ cmpq $128,%rdx
+ jae L$128_or_more4x
+ cmpq $64,%rdx
+ jae L$64_or_more4x
+
+
+ xorq %r10,%r10
+
+ movdqa %xmm12,16(%rsp)
+ movdqa %xmm4,32(%rsp)
+ movdqa %xmm0,48(%rsp)
+ jmp L$oop_tail4x
+
+.p2align 5
+L$64_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+ movdqu %xmm6,0(%rdi)
+ movdqu %xmm11,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm7,48(%rdi)
+ je L$done4x
+
+ movdqa 16(%rsp),%xmm6
+ leaq 64(%rsi),%rsi
+ xorq %r10,%r10
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm13,16(%rsp)
+ leaq 64(%rdi),%rdi
+ movdqa %xmm5,32(%rsp)
+ subq $64,%rdx
+ movdqa %xmm1,48(%rsp)
+ jmp L$oop_tail4x
+
+.p2align 5
+L$128_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm11,80(%rdi)
+ movdqu %xmm2,96(%rdi)
+ movdqu %xmm7,112(%rdi)
+ je L$done4x
+
+ movdqa 32(%rsp),%xmm6
+ leaq 128(%rsi),%rsi
+ xorq %r10,%r10
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm10,16(%rsp)
+ leaq 128(%rdi),%rdi
+ movdqa %xmm14,32(%rsp)
+ subq $128,%rdx
+ movdqa %xmm8,48(%rsp)
+ jmp L$oop_tail4x
+
+.p2align 5
+L$192_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+
+ movdqu %xmm6,64(%rdi)
+ movdqu 0(%rsi),%xmm6
+ movdqu %xmm11,80(%rdi)
+ movdqu 16(%rsi),%xmm11
+ movdqu %xmm2,96(%rdi)
+ movdqu 32(%rsi),%xmm2
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+ movdqu 48(%rsi),%xmm7
+ pxor 32(%rsp),%xmm6
+ pxor %xmm10,%xmm11
+ pxor %xmm14,%xmm2
+ pxor %xmm8,%xmm7
+ movdqu %xmm6,0(%rdi)
+ movdqu %xmm11,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm7,48(%rdi)
+ je L$done4x
+
+ movdqa 48(%rsp),%xmm6
+ leaq 64(%rsi),%rsi
+ xorq %r10,%r10
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm15,16(%rsp)
+ leaq 64(%rdi),%rdi
+ movdqa %xmm9,32(%rsp)
+ subq $192,%rdx
+ movdqa %xmm3,48(%rsp)
+
+L$oop_tail4x:
+ movzbl (%rsi,%r10,1),%eax
+ movzbl (%rsp,%r10,1),%ecx
+ leaq 1(%r10),%r10
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r10,1)
+ decq %rdx
+ jnz L$oop_tail4x
+
+L$done4x:
+ leaq (%r9),%rsp
+
+L$4x_epilogue:
+ ret
+
+
+.globl _ChaCha20_ctr32_avx2
+.private_extern _ChaCha20_ctr32_avx2
+
+.p2align 5
+_ChaCha20_ctr32_avx2:
+
+_CET_ENDBR
+ movq %rsp,%r9
+
+ subq $0x280+8,%rsp
+ andq $-32,%rsp
+ vzeroupper
+
+
+
+
+
+
+
+
+
+
+ vbroadcasti128 L$sigma(%rip),%ymm11
+ vbroadcasti128 (%rcx),%ymm3
+ vbroadcasti128 16(%rcx),%ymm15
+ vbroadcasti128 (%r8),%ymm7
+ leaq 256(%rsp),%rcx
+ leaq 512(%rsp),%rax
+ leaq L$rot16(%rip),%r10
+ leaq L$rot24(%rip),%r11
+
+ vpshufd $0x00,%ymm11,%ymm8
+ vpshufd $0x55,%ymm11,%ymm9
+ vmovdqa %ymm8,128-256(%rcx)
+ vpshufd $0xaa,%ymm11,%ymm10
+ vmovdqa %ymm9,160-256(%rcx)
+ vpshufd $0xff,%ymm11,%ymm11
+ vmovdqa %ymm10,192-256(%rcx)
+ vmovdqa %ymm11,224-256(%rcx)
+
+ vpshufd $0x00,%ymm3,%ymm0
+ vpshufd $0x55,%ymm3,%ymm1
+ vmovdqa %ymm0,256-256(%rcx)
+ vpshufd $0xaa,%ymm3,%ymm2
+ vmovdqa %ymm1,288-256(%rcx)
+ vpshufd $0xff,%ymm3,%ymm3
+ vmovdqa %ymm2,320-256(%rcx)
+ vmovdqa %ymm3,352-256(%rcx)
+
+ vpshufd $0x00,%ymm15,%ymm12
+ vpshufd $0x55,%ymm15,%ymm13
+ vmovdqa %ymm12,384-512(%rax)
+ vpshufd $0xaa,%ymm15,%ymm14
+ vmovdqa %ymm13,416-512(%rax)
+ vpshufd $0xff,%ymm15,%ymm15
+ vmovdqa %ymm14,448-512(%rax)
+ vmovdqa %ymm15,480-512(%rax)
+
+ vpshufd $0x00,%ymm7,%ymm4
+ vpshufd $0x55,%ymm7,%ymm5
+ vpaddd L$incy(%rip),%ymm4,%ymm4
+ vpshufd $0xaa,%ymm7,%ymm6
+ vmovdqa %ymm5,544-512(%rax)
+ vpshufd $0xff,%ymm7,%ymm7
+ vmovdqa %ymm6,576-512(%rax)
+ vmovdqa %ymm7,608-512(%rax)
+
+ jmp L$oop_enter8x
+
+.p2align 5
+L$oop_outer8x:
+ vmovdqa 128-256(%rcx),%ymm8
+ vmovdqa 160-256(%rcx),%ymm9
+ vmovdqa 192-256(%rcx),%ymm10
+ vmovdqa 224-256(%rcx),%ymm11
+ vmovdqa 256-256(%rcx),%ymm0
+ vmovdqa 288-256(%rcx),%ymm1
+ vmovdqa 320-256(%rcx),%ymm2
+ vmovdqa 352-256(%rcx),%ymm3
+ vmovdqa 384-512(%rax),%ymm12
+ vmovdqa 416-512(%rax),%ymm13
+ vmovdqa 448-512(%rax),%ymm14
+ vmovdqa 480-512(%rax),%ymm15
+ vmovdqa 512-512(%rax),%ymm4
+ vmovdqa 544-512(%rax),%ymm5
+ vmovdqa 576-512(%rax),%ymm6
+ vmovdqa 608-512(%rax),%ymm7
+ vpaddd L$eight(%rip),%ymm4,%ymm4
+
+L$oop_enter8x:
+ vmovdqa %ymm14,64(%rsp)
+ vmovdqa %ymm15,96(%rsp)
+ vbroadcasti128 (%r10),%ymm15
+ vmovdqa %ymm4,512-512(%rax)
+ movl $10,%eax
+ jmp L$oop8x
+
+.p2align 5
+L$oop8x:
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $12,%ymm0,%ymm14
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $12,%ymm1,%ymm15
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $7,%ymm0,%ymm15
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $7,%ymm1,%ymm14
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vmovdqa %ymm12,0(%rsp)
+ vmovdqa %ymm13,32(%rsp)
+ vmovdqa 64(%rsp),%ymm12
+ vmovdqa 96(%rsp),%ymm13
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $12,%ymm2,%ymm14
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $12,%ymm3,%ymm15
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $7,%ymm2,%ymm15
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $7,%ymm3,%ymm14
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $12,%ymm1,%ymm14
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $12,%ymm2,%ymm15
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $7,%ymm1,%ymm15
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $7,%ymm2,%ymm14
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vmovdqa %ymm12,64(%rsp)
+ vmovdqa %ymm13,96(%rsp)
+ vmovdqa 0(%rsp),%ymm12
+ vmovdqa 32(%rsp),%ymm13
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $12,%ymm3,%ymm14
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $12,%ymm0,%ymm15
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $7,%ymm3,%ymm15
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $7,%ymm0,%ymm14
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ decl %eax
+ jnz L$oop8x
+
+ leaq 512(%rsp),%rax
+ vpaddd 128-256(%rcx),%ymm8,%ymm8
+ vpaddd 160-256(%rcx),%ymm9,%ymm9
+ vpaddd 192-256(%rcx),%ymm10,%ymm10
+ vpaddd 224-256(%rcx),%ymm11,%ymm11
+
+ vpunpckldq %ymm9,%ymm8,%ymm14
+ vpunpckldq %ymm11,%ymm10,%ymm15
+ vpunpckhdq %ymm9,%ymm8,%ymm8
+ vpunpckhdq %ymm11,%ymm10,%ymm10
+ vpunpcklqdq %ymm15,%ymm14,%ymm9
+ vpunpckhqdq %ymm15,%ymm14,%ymm14
+ vpunpcklqdq %ymm10,%ymm8,%ymm11
+ vpunpckhqdq %ymm10,%ymm8,%ymm8
+ vpaddd 256-256(%rcx),%ymm0,%ymm0
+ vpaddd 288-256(%rcx),%ymm1,%ymm1
+ vpaddd 320-256(%rcx),%ymm2,%ymm2
+ vpaddd 352-256(%rcx),%ymm3,%ymm3
+
+ vpunpckldq %ymm1,%ymm0,%ymm10
+ vpunpckldq %ymm3,%ymm2,%ymm15
+ vpunpckhdq %ymm1,%ymm0,%ymm0
+ vpunpckhdq %ymm3,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm10,%ymm1
+ vpunpckhqdq %ymm15,%ymm10,%ymm10
+ vpunpcklqdq %ymm2,%ymm0,%ymm3
+ vpunpckhqdq %ymm2,%ymm0,%ymm0
+ vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
+ vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
+ vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
+ vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
+ vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
+ vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
+ vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
+ vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
+ vmovdqa %ymm15,0(%rsp)
+ vmovdqa %ymm9,32(%rsp)
+ vmovdqa 64(%rsp),%ymm15
+ vmovdqa 96(%rsp),%ymm9
+
+ vpaddd 384-512(%rax),%ymm12,%ymm12
+ vpaddd 416-512(%rax),%ymm13,%ymm13
+ vpaddd 448-512(%rax),%ymm15,%ymm15
+ vpaddd 480-512(%rax),%ymm9,%ymm9
+
+ vpunpckldq %ymm13,%ymm12,%ymm2
+ vpunpckldq %ymm9,%ymm15,%ymm8
+ vpunpckhdq %ymm13,%ymm12,%ymm12
+ vpunpckhdq %ymm9,%ymm15,%ymm15
+ vpunpcklqdq %ymm8,%ymm2,%ymm13
+ vpunpckhqdq %ymm8,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm12,%ymm9
+ vpunpckhqdq %ymm15,%ymm12,%ymm12
+ vpaddd 512-512(%rax),%ymm4,%ymm4
+ vpaddd 544-512(%rax),%ymm5,%ymm5
+ vpaddd 576-512(%rax),%ymm6,%ymm6
+ vpaddd 608-512(%rax),%ymm7,%ymm7
+
+ vpunpckldq %ymm5,%ymm4,%ymm15
+ vpunpckldq %ymm7,%ymm6,%ymm8
+ vpunpckhdq %ymm5,%ymm4,%ymm4
+ vpunpckhdq %ymm7,%ymm6,%ymm6
+ vpunpcklqdq %ymm8,%ymm15,%ymm5
+ vpunpckhqdq %ymm8,%ymm15,%ymm15
+ vpunpcklqdq %ymm6,%ymm4,%ymm7
+ vpunpckhqdq %ymm6,%ymm4,%ymm4
+ vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
+ vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
+ vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
+ vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
+ vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
+ vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
+ vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
+ vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
+ vmovdqa 0(%rsp),%ymm6
+ vmovdqa 32(%rsp),%ymm12
+
+ cmpq $512,%rdx
+ jb L$tail8x
+
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm12,%ymm12
+ vpxor 32(%rsi),%ymm13,%ymm13
+ vpxor 64(%rsi),%ymm10,%ymm10
+ vpxor 96(%rsi),%ymm15,%ymm15
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm12,0(%rdi)
+ vmovdqu %ymm13,32(%rdi)
+ vmovdqu %ymm10,64(%rdi)
+ vmovdqu %ymm15,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm14,%ymm14
+ vpxor 32(%rsi),%ymm2,%ymm2
+ vpxor 64(%rsi),%ymm3,%ymm3
+ vpxor 96(%rsi),%ymm7,%ymm7
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm14,0(%rdi)
+ vmovdqu %ymm2,32(%rdi)
+ vmovdqu %ymm3,64(%rdi)
+ vmovdqu %ymm7,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm11,%ymm11
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vpxor 64(%rsi),%ymm0,%ymm0
+ vpxor 96(%rsi),%ymm4,%ymm4
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm11,0(%rdi)
+ vmovdqu %ymm9,32(%rdi)
+ vmovdqu %ymm0,64(%rdi)
+ vmovdqu %ymm4,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $512,%rdx
+ jnz L$oop_outer8x
+
+ jmp L$done8x
+
+L$tail8x:
+ cmpq $448,%rdx
+ jae L$448_or_more8x
+ cmpq $384,%rdx
+ jae L$384_or_more8x
+ cmpq $320,%rdx
+ jae L$320_or_more8x
+ cmpq $256,%rdx
+ jae L$256_or_more8x
+ cmpq $192,%rdx
+ jae L$192_or_more8x
+ cmpq $128,%rdx
+ jae L$128_or_more8x
+ cmpq $64,%rdx
+ jae L$64_or_more8x
+
+ xorq %r10,%r10
+ vmovdqa %ymm6,0(%rsp)
+ vmovdqa %ymm8,32(%rsp)
+ jmp L$oop_tail8x
+
+.p2align 5
+L$64_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ je L$done8x
+
+ leaq 64(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm1,0(%rsp)
+ leaq 64(%rdi),%rdi
+ subq $64,%rdx
+ vmovdqa %ymm5,32(%rsp)
+ jmp L$oop_tail8x
+
+.p2align 5
+L$128_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ je L$done8x
+
+ leaq 128(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm12,0(%rsp)
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ vmovdqa %ymm13,32(%rsp)
+ jmp L$oop_tail8x
+
+.p2align 5
+L$192_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ je L$done8x
+
+ leaq 192(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm10,0(%rsp)
+ leaq 192(%rdi),%rdi
+ subq $192,%rdx
+ vmovdqa %ymm15,32(%rsp)
+ jmp L$oop_tail8x
+
+.p2align 5
+L$256_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ je L$done8x
+
+ leaq 256(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm14,0(%rsp)
+ leaq 256(%rdi),%rdi
+ subq $256,%rdx
+ vmovdqa %ymm2,32(%rsp)
+ jmp L$oop_tail8x
+
+.p2align 5
+L$320_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ je L$done8x
+
+ leaq 320(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm3,0(%rsp)
+ leaq 320(%rdi),%rdi
+ subq $320,%rdx
+ vmovdqa %ymm7,32(%rsp)
+ jmp L$oop_tail8x
+
+.p2align 5
+L$384_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ je L$done8x
+
+ leaq 384(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm11,0(%rsp)
+ leaq 384(%rdi),%rdi
+ subq $384,%rdx
+ vmovdqa %ymm9,32(%rsp)
+ jmp L$oop_tail8x
+
+.p2align 5
+L$448_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vpxor 384(%rsi),%ymm11,%ymm11
+ vpxor 416(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ vmovdqu %ymm11,384(%rdi)
+ vmovdqu %ymm9,416(%rdi)
+ je L$done8x
+
+ leaq 448(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm0,0(%rsp)
+ leaq 448(%rdi),%rdi
+ subq $448,%rdx
+ vmovdqa %ymm4,32(%rsp)
+
+L$oop_tail8x:
+ movzbl (%rsi,%r10,1),%eax
+ movzbl (%rsp,%r10,1),%ecx
+ leaq 1(%r10),%r10
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r10,1)
+ decq %rdx
+ jnz L$oop_tail8x
+
+L$done8x:
+ vzeroall
+ leaq (%r9),%rsp
+
+L$8x_epilogue:
+ ret
+
+
+#endif
diff --git a/gen/crypto/chacha-x86_64-linux.S b/gen/crypto/chacha-x86_64-linux.S
new file mode 100644
index 0000000..9dbf7d1
--- /dev/null
+++ b/gen/crypto/chacha-x86_64-linux.S
@@ -0,0 +1,1610 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text
+
+.section .rodata
+.align 64
+.Lzero:
+.long 0,0,0,0
+.Lone:
+.long 1,0,0,0
+.Linc:
+.long 0,1,2,3
+.Lfour:
+.long 4,4,4,4
+.Lincy:
+.long 0,2,4,6,1,3,5,7
+.Leight:
+.long 8,8,8,8,8,8,8,8
+.Lrot16:
+.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
+.Lrot24:
+.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
+.Lsigma:
+.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
+.align 64
+.Lzeroz:
+.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+.Lfourz:
+.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+.Lincz:
+.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+.Lsixteen:
+.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
+.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.text
+.globl ChaCha20_ctr32_nohw
+.hidden ChaCha20_ctr32_nohw
+.type ChaCha20_ctr32_nohw,@function
+.align 64
+ChaCha20_ctr32_nohw:
+.cfi_startproc
+_CET_ENDBR
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset rbx,-16
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset rbp,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset r15,-56
+ subq $64+24,%rsp
+.cfi_adjust_cfa_offset 88
+.Lctr32_body:
+
+
+ movdqu (%rcx),%xmm1
+ movdqu 16(%rcx),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa .Lone(%rip),%xmm4
+
+
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ movq %rdx,%rbp
+ jmp .Loop_outer
+
+.align 32
+.Loop_outer:
+ movl $0x61707865,%eax
+ movl $0x3320646e,%ebx
+ movl $0x79622d32,%ecx
+ movl $0x6b206574,%edx
+ movl 16(%rsp),%r8d
+ movl 20(%rsp),%r9d
+ movl 24(%rsp),%r10d
+ movl 28(%rsp),%r11d
+ movd %xmm3,%r12d
+ movl 52(%rsp),%r13d
+ movl 56(%rsp),%r14d
+ movl 60(%rsp),%r15d
+
+ movq %rbp,64+0(%rsp)
+ movl $10,%ebp
+ movq %rsi,64+8(%rsp)
+.byte 102,72,15,126,214
+ movq %rdi,64+16(%rsp)
+ movq %rsi,%rdi
+ shrq $32,%rdi
+ jmp .Loop
+
+.align 32
+.Loop:
+ addl %r8d,%eax
+ xorl %eax,%r12d
+ roll $16,%r12d
+ addl %r9d,%ebx
+ xorl %ebx,%r13d
+ roll $16,%r13d
+ addl %r12d,%esi
+ xorl %esi,%r8d
+ roll $12,%r8d
+ addl %r13d,%edi
+ xorl %edi,%r9d
+ roll $12,%r9d
+ addl %r8d,%eax
+ xorl %eax,%r12d
+ roll $8,%r12d
+ addl %r9d,%ebx
+ xorl %ebx,%r13d
+ roll $8,%r13d
+ addl %r12d,%esi
+ xorl %esi,%r8d
+ roll $7,%r8d
+ addl %r13d,%edi
+ xorl %edi,%r9d
+ roll $7,%r9d
+ movl %esi,32(%rsp)
+ movl %edi,36(%rsp)
+ movl 40(%rsp),%esi
+ movl 44(%rsp),%edi
+ addl %r10d,%ecx
+ xorl %ecx,%r14d
+ roll $16,%r14d
+ addl %r11d,%edx
+ xorl %edx,%r15d
+ roll $16,%r15d
+ addl %r14d,%esi
+ xorl %esi,%r10d
+ roll $12,%r10d
+ addl %r15d,%edi
+ xorl %edi,%r11d
+ roll $12,%r11d
+ addl %r10d,%ecx
+ xorl %ecx,%r14d
+ roll $8,%r14d
+ addl %r11d,%edx
+ xorl %edx,%r15d
+ roll $8,%r15d
+ addl %r14d,%esi
+ xorl %esi,%r10d
+ roll $7,%r10d
+ addl %r15d,%edi
+ xorl %edi,%r11d
+ roll $7,%r11d
+ addl %r9d,%eax
+ xorl %eax,%r15d
+ roll $16,%r15d
+ addl %r10d,%ebx
+ xorl %ebx,%r12d
+ roll $16,%r12d
+ addl %r15d,%esi
+ xorl %esi,%r9d
+ roll $12,%r9d
+ addl %r12d,%edi
+ xorl %edi,%r10d
+ roll $12,%r10d
+ addl %r9d,%eax
+ xorl %eax,%r15d
+ roll $8,%r15d
+ addl %r10d,%ebx
+ xorl %ebx,%r12d
+ roll $8,%r12d
+ addl %r15d,%esi
+ xorl %esi,%r9d
+ roll $7,%r9d
+ addl %r12d,%edi
+ xorl %edi,%r10d
+ roll $7,%r10d
+ movl %esi,40(%rsp)
+ movl %edi,44(%rsp)
+ movl 32(%rsp),%esi
+ movl 36(%rsp),%edi
+ addl %r11d,%ecx
+ xorl %ecx,%r13d
+ roll $16,%r13d
+ addl %r8d,%edx
+ xorl %edx,%r14d
+ roll $16,%r14d
+ addl %r13d,%esi
+ xorl %esi,%r11d
+ roll $12,%r11d
+ addl %r14d,%edi
+ xorl %edi,%r8d
+ roll $12,%r8d
+ addl %r11d,%ecx
+ xorl %ecx,%r13d
+ roll $8,%r13d
+ addl %r8d,%edx
+ xorl %edx,%r14d
+ roll $8,%r14d
+ addl %r13d,%esi
+ xorl %esi,%r11d
+ roll $7,%r11d
+ addl %r14d,%edi
+ xorl %edi,%r8d
+ roll $7,%r8d
+ decl %ebp
+ jnz .Loop
+ movl %edi,36(%rsp)
+ movl %esi,32(%rsp)
+ movq 64(%rsp),%rbp
+ movdqa %xmm2,%xmm1
+ movq 64+8(%rsp),%rsi
+ paddd %xmm4,%xmm3
+ movq 64+16(%rsp),%rdi
+
+ addl $0x61707865,%eax
+ addl $0x3320646e,%ebx
+ addl $0x79622d32,%ecx
+ addl $0x6b206574,%edx
+ addl 16(%rsp),%r8d
+ addl 20(%rsp),%r9d
+ addl 24(%rsp),%r10d
+ addl 28(%rsp),%r11d
+ addl 48(%rsp),%r12d
+ addl 52(%rsp),%r13d
+ addl 56(%rsp),%r14d
+ addl 60(%rsp),%r15d
+ paddd 32(%rsp),%xmm1
+
+ cmpq $64,%rbp
+ jb .Ltail
+
+ xorl 0(%rsi),%eax
+ xorl 4(%rsi),%ebx
+ xorl 8(%rsi),%ecx
+ xorl 12(%rsi),%edx
+ xorl 16(%rsi),%r8d
+ xorl 20(%rsi),%r9d
+ xorl 24(%rsi),%r10d
+ xorl 28(%rsi),%r11d
+ movdqu 32(%rsi),%xmm0
+ xorl 48(%rsi),%r12d
+ xorl 52(%rsi),%r13d
+ xorl 56(%rsi),%r14d
+ xorl 60(%rsi),%r15d
+ leaq 64(%rsi),%rsi
+ pxor %xmm1,%xmm0
+
+ movdqa %xmm2,32(%rsp)
+ movd %xmm3,48(%rsp)
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ movdqu %xmm0,32(%rdi)
+ movl %r12d,48(%rdi)
+ movl %r13d,52(%rdi)
+ movl %r14d,56(%rdi)
+ movl %r15d,60(%rdi)
+ leaq 64(%rdi),%rdi
+
+ subq $64,%rbp
+ jnz .Loop_outer
+
+ jmp .Ldone
+
+.align 16
+.Ltail:
+ movl %eax,0(%rsp)
+ movl %ebx,4(%rsp)
+ xorq %rbx,%rbx
+ movl %ecx,8(%rsp)
+ movl %edx,12(%rsp)
+ movl %r8d,16(%rsp)
+ movl %r9d,20(%rsp)
+ movl %r10d,24(%rsp)
+ movl %r11d,28(%rsp)
+ movdqa %xmm1,32(%rsp)
+ movl %r12d,48(%rsp)
+ movl %r13d,52(%rsp)
+ movl %r14d,56(%rsp)
+ movl %r15d,60(%rsp)
+
+.Loop_tail:
+ movzbl (%rsi,%rbx,1),%eax
+ movzbl (%rsp,%rbx,1),%edx
+ leaq 1(%rbx),%rbx
+ xorl %edx,%eax
+ movb %al,-1(%rdi,%rbx,1)
+ decq %rbp
+ jnz .Loop_tail
+
+.Ldone:
+ leaq 64+24+48(%rsp),%rsi
+ movq -48(%rsi),%r15
+.cfi_restore r15
+ movq -40(%rsi),%r14
+.cfi_restore r14
+ movq -32(%rsi),%r13
+.cfi_restore r13
+ movq -24(%rsi),%r12
+.cfi_restore r12
+ movq -16(%rsi),%rbp
+.cfi_restore rbp
+ movq -8(%rsi),%rbx
+.cfi_restore rbx
+ leaq (%rsi),%rsp
+.cfi_adjust_cfa_offset -136
+.Lno_data:
+ ret
+.cfi_endproc
+.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw
+.globl ChaCha20_ctr32_ssse3
+.hidden ChaCha20_ctr32_ssse3
+.type ChaCha20_ctr32_ssse3,@function
+.align 32
+ChaCha20_ctr32_ssse3:
+.cfi_startproc
+_CET_ENDBR
+ movq %rsp,%r9
+.cfi_def_cfa_register r9
+ subq $64+8,%rsp
+ movdqa .Lsigma(%rip),%xmm0
+ movdqu (%rcx),%xmm1
+ movdqu 16(%rcx),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa .Lrot16(%rip),%xmm6
+ movdqa .Lrot24(%rip),%xmm7
+
+ movdqa %xmm0,0(%rsp)
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ movq $10,%r8
+ jmp .Loop_ssse3
+
+.align 32
+.Loop_outer_ssse3:
+ movdqa .Lone(%rip),%xmm3
+ movdqa 0(%rsp),%xmm0
+ movdqa 16(%rsp),%xmm1
+ movdqa 32(%rsp),%xmm2
+ paddd 48(%rsp),%xmm3
+ movq $10,%r8
+ movdqa %xmm3,48(%rsp)
+ jmp .Loop_ssse3
+
+.align 32
+.Loop_ssse3:
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm1,%xmm1
+ pshufd $147,%xmm3,%xmm3
+ nop
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm1,%xmm1
+ pshufd $57,%xmm3,%xmm3
+ decq %r8
+ jnz .Loop_ssse3
+ paddd 0(%rsp),%xmm0
+ paddd 16(%rsp),%xmm1
+ paddd 32(%rsp),%xmm2
+ paddd 48(%rsp),%xmm3
+
+ cmpq $64,%rdx
+ jb .Ltail_ssse3
+
+ movdqu 0(%rsi),%xmm4
+ movdqu 16(%rsi),%xmm5
+ pxor %xmm4,%xmm0
+ movdqu 32(%rsi),%xmm4
+ pxor %xmm5,%xmm1
+ movdqu 48(%rsi),%xmm5
+ leaq 64(%rsi),%rsi
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm3
+
+ movdqu %xmm0,0(%rdi)
+ movdqu %xmm1,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ subq $64,%rdx
+ jnz .Loop_outer_ssse3
+
+ jmp .Ldone_ssse3
+
+.align 16
+.Ltail_ssse3:
+ movdqa %xmm0,0(%rsp)
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ xorq %r8,%r8
+
+.Loop_tail_ssse3:
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r8,1)
+ decq %rdx
+ jnz .Loop_tail_ssse3
+
+.Ldone_ssse3:
+ leaq (%r9),%rsp
+.cfi_def_cfa_register rsp
+.Lssse3_epilogue:
+ ret
+.cfi_endproc
+.size ChaCha20_ctr32_ssse3,.-ChaCha20_ctr32_ssse3
+.globl ChaCha20_ctr32_ssse3_4x
+.hidden ChaCha20_ctr32_ssse3_4x
+.type ChaCha20_ctr32_ssse3_4x,@function
+.align 32
+ChaCha20_ctr32_ssse3_4x:
+.cfi_startproc
+_CET_ENDBR
+ movq %rsp,%r9
+.cfi_def_cfa_register r9
+ movq %r10,%r11
+ subq $0x140+8,%rsp
+ movdqa .Lsigma(%rip),%xmm11
+ movdqu (%rcx),%xmm15
+ movdqu 16(%rcx),%xmm7
+ movdqu (%r8),%xmm3
+ leaq 256(%rsp),%rcx
+ leaq .Lrot16(%rip),%r10
+ leaq .Lrot24(%rip),%r11
+
+ pshufd $0x00,%xmm11,%xmm8
+ pshufd $0x55,%xmm11,%xmm9
+ movdqa %xmm8,64(%rsp)
+ pshufd $0xaa,%xmm11,%xmm10
+ movdqa %xmm9,80(%rsp)
+ pshufd $0xff,%xmm11,%xmm11
+ movdqa %xmm10,96(%rsp)
+ movdqa %xmm11,112(%rsp)
+
+ pshufd $0x00,%xmm15,%xmm12
+ pshufd $0x55,%xmm15,%xmm13
+ movdqa %xmm12,128-256(%rcx)
+ pshufd $0xaa,%xmm15,%xmm14
+ movdqa %xmm13,144-256(%rcx)
+ pshufd $0xff,%xmm15,%xmm15
+ movdqa %xmm14,160-256(%rcx)
+ movdqa %xmm15,176-256(%rcx)
+
+ pshufd $0x00,%xmm7,%xmm4
+ pshufd $0x55,%xmm7,%xmm5
+ movdqa %xmm4,192-256(%rcx)
+ pshufd $0xaa,%xmm7,%xmm6
+ movdqa %xmm5,208-256(%rcx)
+ pshufd $0xff,%xmm7,%xmm7
+ movdqa %xmm6,224-256(%rcx)
+ movdqa %xmm7,240-256(%rcx)
+
+ pshufd $0x00,%xmm3,%xmm0
+ pshufd $0x55,%xmm3,%xmm1
+ paddd .Linc(%rip),%xmm0
+ pshufd $0xaa,%xmm3,%xmm2
+ movdqa %xmm1,272-256(%rcx)
+ pshufd $0xff,%xmm3,%xmm3
+ movdqa %xmm2,288-256(%rcx)
+ movdqa %xmm3,304-256(%rcx)
+
+ jmp .Loop_enter4x
+
+.align 32
+.Loop_outer4x:
+ movdqa 64(%rsp),%xmm8
+ movdqa 80(%rsp),%xmm9
+ movdqa 96(%rsp),%xmm10
+ movdqa 112(%rsp),%xmm11
+ movdqa 128-256(%rcx),%xmm12
+ movdqa 144-256(%rcx),%xmm13
+ movdqa 160-256(%rcx),%xmm14
+ movdqa 176-256(%rcx),%xmm15
+ movdqa 192-256(%rcx),%xmm4
+ movdqa 208-256(%rcx),%xmm5
+ movdqa 224-256(%rcx),%xmm6
+ movdqa 240-256(%rcx),%xmm7
+ movdqa 256-256(%rcx),%xmm0
+ movdqa 272-256(%rcx),%xmm1
+ movdqa 288-256(%rcx),%xmm2
+ movdqa 304-256(%rcx),%xmm3
+ paddd .Lfour(%rip),%xmm0
+
+.Loop_enter4x:
+ movdqa %xmm6,32(%rsp)
+ movdqa %xmm7,48(%rsp)
+ movdqa (%r10),%xmm7
+ movl $10,%eax
+ movdqa %xmm0,256-256(%rcx)
+ jmp .Loop4x
+
+.align 32
+.Loop4x:
+ paddd %xmm12,%xmm8
+ paddd %xmm13,%xmm9
+ pxor %xmm8,%xmm0
+ pxor %xmm9,%xmm1
+.byte 102,15,56,0,199
+.byte 102,15,56,0,207
+ paddd %xmm0,%xmm4
+ paddd %xmm1,%xmm5
+ pxor %xmm4,%xmm12
+ pxor %xmm5,%xmm13
+ movdqa %xmm12,%xmm6
+ pslld $12,%xmm12
+ psrld $20,%xmm6
+ movdqa %xmm13,%xmm7
+ pslld $12,%xmm13
+ por %xmm6,%xmm12
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm13
+ paddd %xmm12,%xmm8
+ paddd %xmm13,%xmm9
+ pxor %xmm8,%xmm0
+ pxor %xmm9,%xmm1
+.byte 102,15,56,0,198
+.byte 102,15,56,0,206
+ paddd %xmm0,%xmm4
+ paddd %xmm1,%xmm5
+ pxor %xmm4,%xmm12
+ pxor %xmm5,%xmm13
+ movdqa %xmm12,%xmm7
+ pslld $7,%xmm12
+ psrld $25,%xmm7
+ movdqa %xmm13,%xmm6
+ pslld $7,%xmm13
+ por %xmm7,%xmm12
+ psrld $25,%xmm6
+ movdqa (%r10),%xmm7
+ por %xmm6,%xmm13
+ movdqa %xmm4,0(%rsp)
+ movdqa %xmm5,16(%rsp)
+ movdqa 32(%rsp),%xmm4
+ movdqa 48(%rsp),%xmm5
+ paddd %xmm14,%xmm10
+ paddd %xmm15,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm3
+.byte 102,15,56,0,215
+.byte 102,15,56,0,223
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ pxor %xmm4,%xmm14
+ pxor %xmm5,%xmm15
+ movdqa %xmm14,%xmm6
+ pslld $12,%xmm14
+ psrld $20,%xmm6
+ movdqa %xmm15,%xmm7
+ pslld $12,%xmm15
+ por %xmm6,%xmm14
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm15
+ paddd %xmm14,%xmm10
+ paddd %xmm15,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm3
+.byte 102,15,56,0,214
+.byte 102,15,56,0,222
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ pxor %xmm4,%xmm14
+ pxor %xmm5,%xmm15
+ movdqa %xmm14,%xmm7
+ pslld $7,%xmm14
+ psrld $25,%xmm7
+ movdqa %xmm15,%xmm6
+ pslld $7,%xmm15
+ por %xmm7,%xmm14
+ psrld $25,%xmm6
+ movdqa (%r10),%xmm7
+ por %xmm6,%xmm15
+ paddd %xmm13,%xmm8
+ paddd %xmm14,%xmm9
+ pxor %xmm8,%xmm3
+ pxor %xmm9,%xmm0
+.byte 102,15,56,0,223
+.byte 102,15,56,0,199
+ paddd %xmm3,%xmm4
+ paddd %xmm0,%xmm5
+ pxor %xmm4,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm13,%xmm6
+ pslld $12,%xmm13
+ psrld $20,%xmm6
+ movdqa %xmm14,%xmm7
+ pslld $12,%xmm14
+ por %xmm6,%xmm13
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm14
+ paddd %xmm13,%xmm8
+ paddd %xmm14,%xmm9
+ pxor %xmm8,%xmm3
+ pxor %xmm9,%xmm0
+.byte 102,15,56,0,222
+.byte 102,15,56,0,198
+ paddd %xmm3,%xmm4
+ paddd %xmm0,%xmm5
+ pxor %xmm4,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm13,%xmm7
+ pslld $7,%xmm13
+ psrld $25,%xmm7
+ movdqa %xmm14,%xmm6
+ pslld $7,%xmm14
+ por %xmm7,%xmm13
+ psrld $25,%xmm6
+ movdqa (%r10),%xmm7
+ por %xmm6,%xmm14
+ movdqa %xmm4,32(%rsp)
+ movdqa %xmm5,48(%rsp)
+ movdqa 0(%rsp),%xmm4
+ movdqa 16(%rsp),%xmm5
+ paddd %xmm15,%xmm10
+ paddd %xmm12,%xmm11
+ pxor %xmm10,%xmm1
+ pxor %xmm11,%xmm2
+.byte 102,15,56,0,207
+.byte 102,15,56,0,215
+ paddd %xmm1,%xmm4
+ paddd %xmm2,%xmm5
+ pxor %xmm4,%xmm15
+ pxor %xmm5,%xmm12
+ movdqa %xmm15,%xmm6
+ pslld $12,%xmm15
+ psrld $20,%xmm6
+ movdqa %xmm12,%xmm7
+ pslld $12,%xmm12
+ por %xmm6,%xmm15
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm12
+ paddd %xmm15,%xmm10
+ paddd %xmm12,%xmm11
+ pxor %xmm10,%xmm1
+ pxor %xmm11,%xmm2
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+ paddd %xmm1,%xmm4
+ paddd %xmm2,%xmm5
+ pxor %xmm4,%xmm15
+ pxor %xmm5,%xmm12
+ movdqa %xmm15,%xmm7
+ pslld $7,%xmm15
+ psrld $25,%xmm7
+ movdqa %xmm12,%xmm6
+ pslld $7,%xmm12
+ por %xmm7,%xmm15
+ psrld $25,%xmm6
+ movdqa (%r10),%xmm7
+ por %xmm6,%xmm12
+ decl %eax
+ jnz .Loop4x
+
+ paddd 64(%rsp),%xmm8
+ paddd 80(%rsp),%xmm9
+ paddd 96(%rsp),%xmm10
+ paddd 112(%rsp),%xmm11
+
+ movdqa %xmm8,%xmm6
+ punpckldq %xmm9,%xmm8
+ movdqa %xmm10,%xmm7
+ punpckldq %xmm11,%xmm10
+ punpckhdq %xmm9,%xmm6
+ punpckhdq %xmm11,%xmm7
+ movdqa %xmm8,%xmm9
+ punpcklqdq %xmm10,%xmm8
+ movdqa %xmm6,%xmm11
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm10,%xmm9
+ punpckhqdq %xmm7,%xmm11
+ paddd 128-256(%rcx),%xmm12
+ paddd 144-256(%rcx),%xmm13
+ paddd 160-256(%rcx),%xmm14
+ paddd 176-256(%rcx),%xmm15
+
+ movdqa %xmm8,0(%rsp)
+ movdqa %xmm9,16(%rsp)
+ movdqa 32(%rsp),%xmm8
+ movdqa 48(%rsp),%xmm9
+
+ movdqa %xmm12,%xmm10
+ punpckldq %xmm13,%xmm12
+ movdqa %xmm14,%xmm7
+ punpckldq %xmm15,%xmm14
+ punpckhdq %xmm13,%xmm10
+ punpckhdq %xmm15,%xmm7
+ movdqa %xmm12,%xmm13
+ punpcklqdq %xmm14,%xmm12
+ movdqa %xmm10,%xmm15
+ punpcklqdq %xmm7,%xmm10
+ punpckhqdq %xmm14,%xmm13
+ punpckhqdq %xmm7,%xmm15
+ paddd 192-256(%rcx),%xmm4
+ paddd 208-256(%rcx),%xmm5
+ paddd 224-256(%rcx),%xmm8
+ paddd 240-256(%rcx),%xmm9
+
+ movdqa %xmm6,32(%rsp)
+ movdqa %xmm11,48(%rsp)
+
+ movdqa %xmm4,%xmm14
+ punpckldq %xmm5,%xmm4
+ movdqa %xmm8,%xmm7
+ punpckldq %xmm9,%xmm8
+ punpckhdq %xmm5,%xmm14
+ punpckhdq %xmm9,%xmm7
+ movdqa %xmm4,%xmm5
+ punpcklqdq %xmm8,%xmm4
+ movdqa %xmm14,%xmm9
+ punpcklqdq %xmm7,%xmm14
+ punpckhqdq %xmm8,%xmm5
+ punpckhqdq %xmm7,%xmm9
+ paddd 256-256(%rcx),%xmm0
+ paddd 272-256(%rcx),%xmm1
+ paddd 288-256(%rcx),%xmm2
+ paddd 304-256(%rcx),%xmm3
+
+ movdqa %xmm0,%xmm8
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm8
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm8,%xmm3
+ punpcklqdq %xmm7,%xmm8
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ cmpq $256,%rdx
+ jb .Ltail4x
+
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+
+ movdqu %xmm6,64(%rdi)
+ movdqu 0(%rsi),%xmm6
+ movdqu %xmm11,80(%rdi)
+ movdqu 16(%rsi),%xmm11
+ movdqu %xmm2,96(%rdi)
+ movdqu 32(%rsi),%xmm2
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+ movdqu 48(%rsi),%xmm7
+ pxor 32(%rsp),%xmm6
+ pxor %xmm10,%xmm11
+ pxor %xmm14,%xmm2
+ pxor %xmm8,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 48(%rsp),%xmm6
+ pxor %xmm15,%xmm11
+ pxor %xmm9,%xmm2
+ pxor %xmm3,%xmm7
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm11,80(%rdi)
+ movdqu %xmm2,96(%rdi)
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $256,%rdx
+ jnz .Loop_outer4x
+
+ jmp .Ldone4x
+
+.Ltail4x:
+ cmpq $192,%rdx
+ jae .L192_or_more4x
+ cmpq $128,%rdx
+ jae .L128_or_more4x
+ cmpq $64,%rdx
+ jae .L64_or_more4x
+
+
+ xorq %r10,%r10
+
+ movdqa %xmm12,16(%rsp)
+ movdqa %xmm4,32(%rsp)
+ movdqa %xmm0,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L64_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+ movdqu %xmm6,0(%rdi)
+ movdqu %xmm11,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm7,48(%rdi)
+ je .Ldone4x
+
+ movdqa 16(%rsp),%xmm6
+ leaq 64(%rsi),%rsi
+ xorq %r10,%r10
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm13,16(%rsp)
+ leaq 64(%rdi),%rdi
+ movdqa %xmm5,32(%rsp)
+ subq $64,%rdx
+ movdqa %xmm1,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L128_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm11,80(%rdi)
+ movdqu %xmm2,96(%rdi)
+ movdqu %xmm7,112(%rdi)
+ je .Ldone4x
+
+ movdqa 32(%rsp),%xmm6
+ leaq 128(%rsi),%rsi
+ xorq %r10,%r10
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm10,16(%rsp)
+ leaq 128(%rdi),%rdi
+ movdqa %xmm14,32(%rsp)
+ subq $128,%rdx
+ movdqa %xmm8,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L192_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+
+ movdqu %xmm6,64(%rdi)
+ movdqu 0(%rsi),%xmm6
+ movdqu %xmm11,80(%rdi)
+ movdqu 16(%rsi),%xmm11
+ movdqu %xmm2,96(%rdi)
+ movdqu 32(%rsi),%xmm2
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+ movdqu 48(%rsi),%xmm7
+ pxor 32(%rsp),%xmm6
+ pxor %xmm10,%xmm11
+ pxor %xmm14,%xmm2
+ pxor %xmm8,%xmm7
+ movdqu %xmm6,0(%rdi)
+ movdqu %xmm11,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm7,48(%rdi)
+ je .Ldone4x
+
+ movdqa 48(%rsp),%xmm6
+ leaq 64(%rsi),%rsi
+ xorq %r10,%r10
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm15,16(%rsp)
+ leaq 64(%rdi),%rdi
+ movdqa %xmm9,32(%rsp)
+ subq $192,%rdx
+ movdqa %xmm3,48(%rsp)
+
+.Loop_tail4x:
+ movzbl (%rsi,%r10,1),%eax
+ movzbl (%rsp,%r10,1),%ecx
+ leaq 1(%r10),%r10
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r10,1)
+ decq %rdx
+ jnz .Loop_tail4x
+
+.Ldone4x:
+ leaq (%r9),%rsp
+.cfi_def_cfa_register rsp
+.L4x_epilogue:
+ ret
+.cfi_endproc
+.size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x
+.globl ChaCha20_ctr32_avx2
+.hidden ChaCha20_ctr32_avx2
+.type ChaCha20_ctr32_avx2,@function
+.align 32
+ChaCha20_ctr32_avx2:
+.cfi_startproc
+_CET_ENDBR
+ movq %rsp,%r9
+.cfi_def_cfa_register r9
+ subq $0x280+8,%rsp
+ andq $-32,%rsp
+ vzeroupper
+
+
+
+
+
+
+
+
+
+
+ vbroadcasti128 .Lsigma(%rip),%ymm11
+ vbroadcasti128 (%rcx),%ymm3
+ vbroadcasti128 16(%rcx),%ymm15
+ vbroadcasti128 (%r8),%ymm7
+ leaq 256(%rsp),%rcx
+ leaq 512(%rsp),%rax
+ leaq .Lrot16(%rip),%r10
+ leaq .Lrot24(%rip),%r11
+
+ vpshufd $0x00,%ymm11,%ymm8
+ vpshufd $0x55,%ymm11,%ymm9
+ vmovdqa %ymm8,128-256(%rcx)
+ vpshufd $0xaa,%ymm11,%ymm10
+ vmovdqa %ymm9,160-256(%rcx)
+ vpshufd $0xff,%ymm11,%ymm11
+ vmovdqa %ymm10,192-256(%rcx)
+ vmovdqa %ymm11,224-256(%rcx)
+
+ vpshufd $0x00,%ymm3,%ymm0
+ vpshufd $0x55,%ymm3,%ymm1
+ vmovdqa %ymm0,256-256(%rcx)
+ vpshufd $0xaa,%ymm3,%ymm2
+ vmovdqa %ymm1,288-256(%rcx)
+ vpshufd $0xff,%ymm3,%ymm3
+ vmovdqa %ymm2,320-256(%rcx)
+ vmovdqa %ymm3,352-256(%rcx)
+
+ vpshufd $0x00,%ymm15,%ymm12
+ vpshufd $0x55,%ymm15,%ymm13
+ vmovdqa %ymm12,384-512(%rax)
+ vpshufd $0xaa,%ymm15,%ymm14
+ vmovdqa %ymm13,416-512(%rax)
+ vpshufd $0xff,%ymm15,%ymm15
+ vmovdqa %ymm14,448-512(%rax)
+ vmovdqa %ymm15,480-512(%rax)
+
+ vpshufd $0x00,%ymm7,%ymm4
+ vpshufd $0x55,%ymm7,%ymm5
+ vpaddd .Lincy(%rip),%ymm4,%ymm4
+ vpshufd $0xaa,%ymm7,%ymm6
+ vmovdqa %ymm5,544-512(%rax)
+ vpshufd $0xff,%ymm7,%ymm7
+ vmovdqa %ymm6,576-512(%rax)
+ vmovdqa %ymm7,608-512(%rax)
+
+ jmp .Loop_enter8x
+
+.align 32
+.Loop_outer8x:
+ vmovdqa 128-256(%rcx),%ymm8
+ vmovdqa 160-256(%rcx),%ymm9
+ vmovdqa 192-256(%rcx),%ymm10
+ vmovdqa 224-256(%rcx),%ymm11
+ vmovdqa 256-256(%rcx),%ymm0
+ vmovdqa 288-256(%rcx),%ymm1
+ vmovdqa 320-256(%rcx),%ymm2
+ vmovdqa 352-256(%rcx),%ymm3
+ vmovdqa 384-512(%rax),%ymm12
+ vmovdqa 416-512(%rax),%ymm13
+ vmovdqa 448-512(%rax),%ymm14
+ vmovdqa 480-512(%rax),%ymm15
+ vmovdqa 512-512(%rax),%ymm4
+ vmovdqa 544-512(%rax),%ymm5
+ vmovdqa 576-512(%rax),%ymm6
+ vmovdqa 608-512(%rax),%ymm7
+ vpaddd .Leight(%rip),%ymm4,%ymm4
+
+.Loop_enter8x:
+ vmovdqa %ymm14,64(%rsp)
+ vmovdqa %ymm15,96(%rsp)
+ vbroadcasti128 (%r10),%ymm15
+ vmovdqa %ymm4,512-512(%rax)
+ movl $10,%eax
+ jmp .Loop8x
+
+.align 32
+.Loop8x:
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $12,%ymm0,%ymm14
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $12,%ymm1,%ymm15
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $7,%ymm0,%ymm15
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $7,%ymm1,%ymm14
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vmovdqa %ymm12,0(%rsp)
+ vmovdqa %ymm13,32(%rsp)
+ vmovdqa 64(%rsp),%ymm12
+ vmovdqa 96(%rsp),%ymm13
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $12,%ymm2,%ymm14
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $12,%ymm3,%ymm15
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $7,%ymm2,%ymm15
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $7,%ymm3,%ymm14
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $12,%ymm1,%ymm14
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $12,%ymm2,%ymm15
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $7,%ymm1,%ymm15
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $7,%ymm2,%ymm14
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vmovdqa %ymm12,64(%rsp)
+ vmovdqa %ymm13,96(%rsp)
+ vmovdqa 0(%rsp),%ymm12
+ vmovdqa 32(%rsp),%ymm13
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $12,%ymm3,%ymm14
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $12,%ymm0,%ymm15
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $7,%ymm3,%ymm15
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $7,%ymm0,%ymm14
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ decl %eax
+ jnz .Loop8x
+
+ leaq 512(%rsp),%rax
+ vpaddd 128-256(%rcx),%ymm8,%ymm8
+ vpaddd 160-256(%rcx),%ymm9,%ymm9
+ vpaddd 192-256(%rcx),%ymm10,%ymm10
+ vpaddd 224-256(%rcx),%ymm11,%ymm11
+
+ vpunpckldq %ymm9,%ymm8,%ymm14
+ vpunpckldq %ymm11,%ymm10,%ymm15
+ vpunpckhdq %ymm9,%ymm8,%ymm8
+ vpunpckhdq %ymm11,%ymm10,%ymm10
+ vpunpcklqdq %ymm15,%ymm14,%ymm9
+ vpunpckhqdq %ymm15,%ymm14,%ymm14
+ vpunpcklqdq %ymm10,%ymm8,%ymm11
+ vpunpckhqdq %ymm10,%ymm8,%ymm8
+ vpaddd 256-256(%rcx),%ymm0,%ymm0
+ vpaddd 288-256(%rcx),%ymm1,%ymm1
+ vpaddd 320-256(%rcx),%ymm2,%ymm2
+ vpaddd 352-256(%rcx),%ymm3,%ymm3
+
+ vpunpckldq %ymm1,%ymm0,%ymm10
+ vpunpckldq %ymm3,%ymm2,%ymm15
+ vpunpckhdq %ymm1,%ymm0,%ymm0
+ vpunpckhdq %ymm3,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm10,%ymm1
+ vpunpckhqdq %ymm15,%ymm10,%ymm10
+ vpunpcklqdq %ymm2,%ymm0,%ymm3
+ vpunpckhqdq %ymm2,%ymm0,%ymm0
+ vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
+ vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
+ vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
+ vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
+ vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
+ vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
+ vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
+ vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
+ vmovdqa %ymm15,0(%rsp)
+ vmovdqa %ymm9,32(%rsp)
+ vmovdqa 64(%rsp),%ymm15
+ vmovdqa 96(%rsp),%ymm9
+
+ vpaddd 384-512(%rax),%ymm12,%ymm12
+ vpaddd 416-512(%rax),%ymm13,%ymm13
+ vpaddd 448-512(%rax),%ymm15,%ymm15
+ vpaddd 480-512(%rax),%ymm9,%ymm9
+
+ vpunpckldq %ymm13,%ymm12,%ymm2
+ vpunpckldq %ymm9,%ymm15,%ymm8
+ vpunpckhdq %ymm13,%ymm12,%ymm12
+ vpunpckhdq %ymm9,%ymm15,%ymm15
+ vpunpcklqdq %ymm8,%ymm2,%ymm13
+ vpunpckhqdq %ymm8,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm12,%ymm9
+ vpunpckhqdq %ymm15,%ymm12,%ymm12
+ vpaddd 512-512(%rax),%ymm4,%ymm4
+ vpaddd 544-512(%rax),%ymm5,%ymm5
+ vpaddd 576-512(%rax),%ymm6,%ymm6
+ vpaddd 608-512(%rax),%ymm7,%ymm7
+
+ vpunpckldq %ymm5,%ymm4,%ymm15
+ vpunpckldq %ymm7,%ymm6,%ymm8
+ vpunpckhdq %ymm5,%ymm4,%ymm4
+ vpunpckhdq %ymm7,%ymm6,%ymm6
+ vpunpcklqdq %ymm8,%ymm15,%ymm5
+ vpunpckhqdq %ymm8,%ymm15,%ymm15
+ vpunpcklqdq %ymm6,%ymm4,%ymm7
+ vpunpckhqdq %ymm6,%ymm4,%ymm4
+ vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
+ vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
+ vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
+ vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
+ vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
+ vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
+ vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
+ vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
+ vmovdqa 0(%rsp),%ymm6
+ vmovdqa 32(%rsp),%ymm12
+
+ cmpq $512,%rdx
+ jb .Ltail8x
+
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm12,%ymm12
+ vpxor 32(%rsi),%ymm13,%ymm13
+ vpxor 64(%rsi),%ymm10,%ymm10
+ vpxor 96(%rsi),%ymm15,%ymm15
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm12,0(%rdi)
+ vmovdqu %ymm13,32(%rdi)
+ vmovdqu %ymm10,64(%rdi)
+ vmovdqu %ymm15,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm14,%ymm14
+ vpxor 32(%rsi),%ymm2,%ymm2
+ vpxor 64(%rsi),%ymm3,%ymm3
+ vpxor 96(%rsi),%ymm7,%ymm7
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm14,0(%rdi)
+ vmovdqu %ymm2,32(%rdi)
+ vmovdqu %ymm3,64(%rdi)
+ vmovdqu %ymm7,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm11,%ymm11
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vpxor 64(%rsi),%ymm0,%ymm0
+ vpxor 96(%rsi),%ymm4,%ymm4
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm11,0(%rdi)
+ vmovdqu %ymm9,32(%rdi)
+ vmovdqu %ymm0,64(%rdi)
+ vmovdqu %ymm4,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $512,%rdx
+ jnz .Loop_outer8x
+
+ jmp .Ldone8x
+
+.Ltail8x:
+ cmpq $448,%rdx
+ jae .L448_or_more8x
+ cmpq $384,%rdx
+ jae .L384_or_more8x
+ cmpq $320,%rdx
+ jae .L320_or_more8x
+ cmpq $256,%rdx
+ jae .L256_or_more8x
+ cmpq $192,%rdx
+ jae .L192_or_more8x
+ cmpq $128,%rdx
+ jae .L128_or_more8x
+ cmpq $64,%rdx
+ jae .L64_or_more8x
+
+ xorq %r10,%r10
+ vmovdqa %ymm6,0(%rsp)
+ vmovdqa %ymm8,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L64_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ je .Ldone8x
+
+ leaq 64(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm1,0(%rsp)
+ leaq 64(%rdi),%rdi
+ subq $64,%rdx
+ vmovdqa %ymm5,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L128_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ je .Ldone8x
+
+ leaq 128(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm12,0(%rsp)
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ vmovdqa %ymm13,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L192_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ je .Ldone8x
+
+ leaq 192(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm10,0(%rsp)
+ leaq 192(%rdi),%rdi
+ subq $192,%rdx
+ vmovdqa %ymm15,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L256_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ je .Ldone8x
+
+ leaq 256(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm14,0(%rsp)
+ leaq 256(%rdi),%rdi
+ subq $256,%rdx
+ vmovdqa %ymm2,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L320_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ je .Ldone8x
+
+ leaq 320(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm3,0(%rsp)
+ leaq 320(%rdi),%rdi
+ subq $320,%rdx
+ vmovdqa %ymm7,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L384_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ je .Ldone8x
+
+ leaq 384(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm11,0(%rsp)
+ leaq 384(%rdi),%rdi
+ subq $384,%rdx
+ vmovdqa %ymm9,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L448_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vpxor 384(%rsi),%ymm11,%ymm11
+ vpxor 416(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ vmovdqu %ymm11,384(%rdi)
+ vmovdqu %ymm9,416(%rdi)
+ je .Ldone8x
+
+ leaq 448(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm0,0(%rsp)
+ leaq 448(%rdi),%rdi
+ subq $448,%rdx
+ vmovdqa %ymm4,32(%rsp)
+
+.Loop_tail8x:
+ movzbl (%rsi,%r10,1),%eax
+ movzbl (%rsp,%r10,1),%ecx
+ leaq 1(%r10),%r10
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r10,1)
+ decq %rdx
+ jnz .Loop_tail8x
+
+.Ldone8x:
+ vzeroall
+ leaq (%r9),%rsp
+.cfi_def_cfa_register rsp
+.L8x_epilogue:
+ ret
+.cfi_endproc
+.size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2
+#endif
diff --git a/gen/crypto/chacha-x86_64-win.asm b/gen/crypto/chacha-x86_64-win.asm
new file mode 100644
index 0000000..14f2395
--- /dev/null
+++ b/gen/crypto/chacha-x86_64-win.asm
@@ -0,0 +1,1916 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .text code align=64
+
+
+section .rdata rdata align=8
+ALIGN 64
+$L$zero:
+ DD 0,0,0,0
+$L$one:
+ DD 1,0,0,0
+$L$inc:
+ DD 0,1,2,3
+$L$four:
+ DD 4,4,4,4
+$L$incy:
+ DD 0,2,4,6,1,3,5,7
+$L$eight:
+ DD 8,8,8,8,8,8,8,8
+$L$rot16:
+ DB 0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd
+$L$rot24:
+ DB 0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe
+$L$sigma:
+ DB 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
+ DB 0
+ALIGN 64
+$L$zeroz:
+ DD 0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0
+$L$fourz:
+ DD 4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0
+$L$incz:
+ DD 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+$L$sixteen:
+ DD 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
+ DB 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
+ DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32
+ DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115
+ DB 108,46,111,114,103,62,0
+section .text
+
+global ChaCha20_ctr32_nohw
+
+ALIGN 64
+ChaCha20_ctr32_nohw:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_ChaCha20_ctr32_nohw:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+ push rbx
+
+ push rbp
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+ sub rsp,64+24
+
+$L$ctr32_body:
+
+
+ movdqu xmm1,XMMWORD[rcx]
+ movdqu xmm2,XMMWORD[16+rcx]
+ movdqu xmm3,XMMWORD[r8]
+ movdqa xmm4,XMMWORD[$L$one]
+
+
+ movdqa XMMWORD[16+rsp],xmm1
+ movdqa XMMWORD[32+rsp],xmm2
+ movdqa XMMWORD[48+rsp],xmm3
+ mov rbp,rdx
+ jmp NEAR $L$oop_outer
+
+ALIGN 32
+$L$oop_outer:
+ mov eax,0x61707865
+ mov ebx,0x3320646e
+ mov ecx,0x79622d32
+ mov edx,0x6b206574
+ mov r8d,DWORD[16+rsp]
+ mov r9d,DWORD[20+rsp]
+ mov r10d,DWORD[24+rsp]
+ mov r11d,DWORD[28+rsp]
+ movd r12d,xmm3
+ mov r13d,DWORD[52+rsp]
+ mov r14d,DWORD[56+rsp]
+ mov r15d,DWORD[60+rsp]
+
+ mov QWORD[((64+0))+rsp],rbp
+ mov ebp,10
+ mov QWORD[((64+8))+rsp],rsi
+DB 102,72,15,126,214
+ mov QWORD[((64+16))+rsp],rdi
+ mov rdi,rsi
+ shr rdi,32
+ jmp NEAR $L$oop
+
+ALIGN 32
+$L$oop:
+ add eax,r8d
+ xor r12d,eax
+ rol r12d,16
+ add ebx,r9d
+ xor r13d,ebx
+ rol r13d,16
+ add esi,r12d
+ xor r8d,esi
+ rol r8d,12
+ add edi,r13d
+ xor r9d,edi
+ rol r9d,12
+ add eax,r8d
+ xor r12d,eax
+ rol r12d,8
+ add ebx,r9d
+ xor r13d,ebx
+ rol r13d,8
+ add esi,r12d
+ xor r8d,esi
+ rol r8d,7
+ add edi,r13d
+ xor r9d,edi
+ rol r9d,7
+ mov DWORD[32+rsp],esi
+ mov DWORD[36+rsp],edi
+ mov esi,DWORD[40+rsp]
+ mov edi,DWORD[44+rsp]
+ add ecx,r10d
+ xor r14d,ecx
+ rol r14d,16
+ add edx,r11d
+ xor r15d,edx
+ rol r15d,16
+ add esi,r14d
+ xor r10d,esi
+ rol r10d,12
+ add edi,r15d
+ xor r11d,edi
+ rol r11d,12
+ add ecx,r10d
+ xor r14d,ecx
+ rol r14d,8
+ add edx,r11d
+ xor r15d,edx
+ rol r15d,8
+ add esi,r14d
+ xor r10d,esi
+ rol r10d,7
+ add edi,r15d
+ xor r11d,edi
+ rol r11d,7
+ add eax,r9d
+ xor r15d,eax
+ rol r15d,16
+ add ebx,r10d
+ xor r12d,ebx
+ rol r12d,16
+ add esi,r15d
+ xor r9d,esi
+ rol r9d,12
+ add edi,r12d
+ xor r10d,edi
+ rol r10d,12
+ add eax,r9d
+ xor r15d,eax
+ rol r15d,8
+ add ebx,r10d
+ xor r12d,ebx
+ rol r12d,8
+ add esi,r15d
+ xor r9d,esi
+ rol r9d,7
+ add edi,r12d
+ xor r10d,edi
+ rol r10d,7
+ mov DWORD[40+rsp],esi
+ mov DWORD[44+rsp],edi
+ mov esi,DWORD[32+rsp]
+ mov edi,DWORD[36+rsp]
+ add ecx,r11d
+ xor r13d,ecx
+ rol r13d,16
+ add edx,r8d
+ xor r14d,edx
+ rol r14d,16
+ add esi,r13d
+ xor r11d,esi
+ rol r11d,12
+ add edi,r14d
+ xor r8d,edi
+ rol r8d,12
+ add ecx,r11d
+ xor r13d,ecx
+ rol r13d,8
+ add edx,r8d
+ xor r14d,edx
+ rol r14d,8
+ add esi,r13d
+ xor r11d,esi
+ rol r11d,7
+ add edi,r14d
+ xor r8d,edi
+ rol r8d,7
+ dec ebp
+ jnz NEAR $L$oop
+ mov DWORD[36+rsp],edi
+ mov DWORD[32+rsp],esi
+ mov rbp,QWORD[64+rsp]
+ movdqa xmm1,xmm2
+ mov rsi,QWORD[((64+8))+rsp]
+ paddd xmm3,xmm4
+ mov rdi,QWORD[((64+16))+rsp]
+
+ add eax,0x61707865
+ add ebx,0x3320646e
+ add ecx,0x79622d32
+ add edx,0x6b206574
+ add r8d,DWORD[16+rsp]
+ add r9d,DWORD[20+rsp]
+ add r10d,DWORD[24+rsp]
+ add r11d,DWORD[28+rsp]
+ add r12d,DWORD[48+rsp]
+ add r13d,DWORD[52+rsp]
+ add r14d,DWORD[56+rsp]
+ add r15d,DWORD[60+rsp]
+ paddd xmm1,XMMWORD[32+rsp]
+
+ cmp rbp,64
+ jb NEAR $L$tail
+
+ xor eax,DWORD[rsi]
+ xor ebx,DWORD[4+rsi]
+ xor ecx,DWORD[8+rsi]
+ xor edx,DWORD[12+rsi]
+ xor r8d,DWORD[16+rsi]
+ xor r9d,DWORD[20+rsi]
+ xor r10d,DWORD[24+rsi]
+ xor r11d,DWORD[28+rsi]
+ movdqu xmm0,XMMWORD[32+rsi]
+ xor r12d,DWORD[48+rsi]
+ xor r13d,DWORD[52+rsi]
+ xor r14d,DWORD[56+rsi]
+ xor r15d,DWORD[60+rsi]
+ lea rsi,[64+rsi]
+ pxor xmm0,xmm1
+
+ movdqa XMMWORD[32+rsp],xmm2
+ movd DWORD[48+rsp],xmm3
+
+ mov DWORD[rdi],eax
+ mov DWORD[4+rdi],ebx
+ mov DWORD[8+rdi],ecx
+ mov DWORD[12+rdi],edx
+ mov DWORD[16+rdi],r8d
+ mov DWORD[20+rdi],r9d
+ mov DWORD[24+rdi],r10d
+ mov DWORD[28+rdi],r11d
+ movdqu XMMWORD[32+rdi],xmm0
+ mov DWORD[48+rdi],r12d
+ mov DWORD[52+rdi],r13d
+ mov DWORD[56+rdi],r14d
+ mov DWORD[60+rdi],r15d
+ lea rdi,[64+rdi]
+
+ sub rbp,64
+ jnz NEAR $L$oop_outer
+
+ jmp NEAR $L$done
+
+ALIGN 16
+$L$tail:
+ mov DWORD[rsp],eax
+ mov DWORD[4+rsp],ebx
+ xor rbx,rbx
+ mov DWORD[8+rsp],ecx
+ mov DWORD[12+rsp],edx
+ mov DWORD[16+rsp],r8d
+ mov DWORD[20+rsp],r9d
+ mov DWORD[24+rsp],r10d
+ mov DWORD[28+rsp],r11d
+ movdqa XMMWORD[32+rsp],xmm1
+ mov DWORD[48+rsp],r12d
+ mov DWORD[52+rsp],r13d
+ mov DWORD[56+rsp],r14d
+ mov DWORD[60+rsp],r15d
+
+$L$oop_tail:
+ movzx eax,BYTE[rbx*1+rsi]
+ movzx edx,BYTE[rbx*1+rsp]
+ lea rbx,[1+rbx]
+ xor eax,edx
+ mov BYTE[((-1))+rbx*1+rdi],al
+ dec rbp
+ jnz NEAR $L$oop_tail
+
+$L$done:
+ lea rsi,[((64+24+48))+rsp]
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
+$L$no_data:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_ChaCha20_ctr32_nohw:
+global ChaCha20_ctr32_ssse3
+
+ALIGN 32
+ChaCha20_ctr32_ssse3:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_ChaCha20_ctr32_ssse3:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+ mov r9,rsp
+
+ sub rsp,64+40
+ movaps XMMWORD[(-40)+r9],xmm6
+ movaps XMMWORD[(-24)+r9],xmm7
+$L$ssse3_body:
+ movdqa xmm0,XMMWORD[$L$sigma]
+ movdqu xmm1,XMMWORD[rcx]
+ movdqu xmm2,XMMWORD[16+rcx]
+ movdqu xmm3,XMMWORD[r8]
+ movdqa xmm6,XMMWORD[$L$rot16]
+ movdqa xmm7,XMMWORD[$L$rot24]
+
+ movdqa XMMWORD[rsp],xmm0
+ movdqa XMMWORD[16+rsp],xmm1
+ movdqa XMMWORD[32+rsp],xmm2
+ movdqa XMMWORD[48+rsp],xmm3
+ mov r8,10
+ jmp NEAR $L$oop_ssse3
+
+ALIGN 32
+$L$oop_outer_ssse3:
+ movdqa xmm3,XMMWORD[$L$one]
+ movdqa xmm0,XMMWORD[rsp]
+ movdqa xmm1,XMMWORD[16+rsp]
+ movdqa xmm2,XMMWORD[32+rsp]
+ paddd xmm3,XMMWORD[48+rsp]
+ mov r8,10
+ movdqa XMMWORD[48+rsp],xmm3
+ jmp NEAR $L$oop_ssse3
+
+ALIGN 32
+$L$oop_ssse3:
+ paddd xmm0,xmm1
+ pxor xmm3,xmm0
+DB 102,15,56,0,222
+ paddd xmm2,xmm3
+ pxor xmm1,xmm2
+ movdqa xmm4,xmm1
+ psrld xmm1,20
+ pslld xmm4,12
+ por xmm1,xmm4
+ paddd xmm0,xmm1
+ pxor xmm3,xmm0
+DB 102,15,56,0,223
+ paddd xmm2,xmm3
+ pxor xmm1,xmm2
+ movdqa xmm4,xmm1
+ psrld xmm1,25
+ pslld xmm4,7
+ por xmm1,xmm4
+ pshufd xmm2,xmm2,78
+ pshufd xmm1,xmm1,57
+ pshufd xmm3,xmm3,147
+ nop
+ paddd xmm0,xmm1
+ pxor xmm3,xmm0
+DB 102,15,56,0,222
+ paddd xmm2,xmm3
+ pxor xmm1,xmm2
+ movdqa xmm4,xmm1
+ psrld xmm1,20
+ pslld xmm4,12
+ por xmm1,xmm4
+ paddd xmm0,xmm1
+ pxor xmm3,xmm0
+DB 102,15,56,0,223
+ paddd xmm2,xmm3
+ pxor xmm1,xmm2
+ movdqa xmm4,xmm1
+ psrld xmm1,25
+ pslld xmm4,7
+ por xmm1,xmm4
+ pshufd xmm2,xmm2,78
+ pshufd xmm1,xmm1,147
+ pshufd xmm3,xmm3,57
+ dec r8
+ jnz NEAR $L$oop_ssse3
+ paddd xmm0,XMMWORD[rsp]
+ paddd xmm1,XMMWORD[16+rsp]
+ paddd xmm2,XMMWORD[32+rsp]
+ paddd xmm3,XMMWORD[48+rsp]
+
+ cmp rdx,64
+ jb NEAR $L$tail_ssse3
+
+ movdqu xmm4,XMMWORD[rsi]
+ movdqu xmm5,XMMWORD[16+rsi]
+ pxor xmm0,xmm4
+ movdqu xmm4,XMMWORD[32+rsi]
+ pxor xmm1,xmm5
+ movdqu xmm5,XMMWORD[48+rsi]
+ lea rsi,[64+rsi]
+ pxor xmm2,xmm4
+ pxor xmm3,xmm5
+
+ movdqu XMMWORD[rdi],xmm0
+ movdqu XMMWORD[16+rdi],xmm1
+ movdqu XMMWORD[32+rdi],xmm2
+ movdqu XMMWORD[48+rdi],xmm3
+ lea rdi,[64+rdi]
+
+ sub rdx,64
+ jnz NEAR $L$oop_outer_ssse3
+
+ jmp NEAR $L$done_ssse3
+
+ALIGN 16
+$L$tail_ssse3:
+ movdqa XMMWORD[rsp],xmm0
+ movdqa XMMWORD[16+rsp],xmm1
+ movdqa XMMWORD[32+rsp],xmm2
+ movdqa XMMWORD[48+rsp],xmm3
+ xor r8,r8
+
+$L$oop_tail_ssse3:
+ movzx eax,BYTE[r8*1+rsi]
+ movzx ecx,BYTE[r8*1+rsp]
+ lea r8,[1+r8]
+ xor eax,ecx
+ mov BYTE[((-1))+r8*1+rdi],al
+ dec rdx
+ jnz NEAR $L$oop_tail_ssse3
+
+$L$done_ssse3:
+ movaps xmm6,XMMWORD[((-40))+r9]
+ movaps xmm7,XMMWORD[((-24))+r9]
+ lea rsp,[r9]
+
+$L$ssse3_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_ChaCha20_ctr32_ssse3:
+global ChaCha20_ctr32_ssse3_4x
+
+ALIGN 32
+ChaCha20_ctr32_ssse3_4x:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_ChaCha20_ctr32_ssse3_4x:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+ mov r9,rsp
+
+ mov r11,r10
+ sub rsp,0x140+168
+ movaps XMMWORD[(-168)+r9],xmm6
+ movaps XMMWORD[(-152)+r9],xmm7
+ movaps XMMWORD[(-136)+r9],xmm8
+ movaps XMMWORD[(-120)+r9],xmm9
+ movaps XMMWORD[(-104)+r9],xmm10
+ movaps XMMWORD[(-88)+r9],xmm11
+ movaps XMMWORD[(-72)+r9],xmm12
+ movaps XMMWORD[(-56)+r9],xmm13
+ movaps XMMWORD[(-40)+r9],xmm14
+ movaps XMMWORD[(-24)+r9],xmm15
+$L$4x_body:
+ movdqa xmm11,XMMWORD[$L$sigma]
+ movdqu xmm15,XMMWORD[rcx]
+ movdqu xmm7,XMMWORD[16+rcx]
+ movdqu xmm3,XMMWORD[r8]
+ lea rcx,[256+rsp]
+ lea r10,[$L$rot16]
+ lea r11,[$L$rot24]
+
+ pshufd xmm8,xmm11,0x00
+ pshufd xmm9,xmm11,0x55
+ movdqa XMMWORD[64+rsp],xmm8
+ pshufd xmm10,xmm11,0xaa
+ movdqa XMMWORD[80+rsp],xmm9
+ pshufd xmm11,xmm11,0xff
+ movdqa XMMWORD[96+rsp],xmm10
+ movdqa XMMWORD[112+rsp],xmm11
+
+ pshufd xmm12,xmm15,0x00
+ pshufd xmm13,xmm15,0x55
+ movdqa XMMWORD[(128-256)+rcx],xmm12
+ pshufd xmm14,xmm15,0xaa
+ movdqa XMMWORD[(144-256)+rcx],xmm13
+ pshufd xmm15,xmm15,0xff
+ movdqa XMMWORD[(160-256)+rcx],xmm14
+ movdqa XMMWORD[(176-256)+rcx],xmm15
+
+ pshufd xmm4,xmm7,0x00
+ pshufd xmm5,xmm7,0x55
+ movdqa XMMWORD[(192-256)+rcx],xmm4
+ pshufd xmm6,xmm7,0xaa
+ movdqa XMMWORD[(208-256)+rcx],xmm5
+ pshufd xmm7,xmm7,0xff
+ movdqa XMMWORD[(224-256)+rcx],xmm6
+ movdqa XMMWORD[(240-256)+rcx],xmm7
+
+ pshufd xmm0,xmm3,0x00
+ pshufd xmm1,xmm3,0x55
+ paddd xmm0,XMMWORD[$L$inc]
+ pshufd xmm2,xmm3,0xaa
+ movdqa XMMWORD[(272-256)+rcx],xmm1
+ pshufd xmm3,xmm3,0xff
+ movdqa XMMWORD[(288-256)+rcx],xmm2
+ movdqa XMMWORD[(304-256)+rcx],xmm3
+
+ jmp NEAR $L$oop_enter4x
+
+ALIGN 32
+$L$oop_outer4x:
+ movdqa xmm8,XMMWORD[64+rsp]
+ movdqa xmm9,XMMWORD[80+rsp]
+ movdqa xmm10,XMMWORD[96+rsp]
+ movdqa xmm11,XMMWORD[112+rsp]
+ movdqa xmm12,XMMWORD[((128-256))+rcx]
+ movdqa xmm13,XMMWORD[((144-256))+rcx]
+ movdqa xmm14,XMMWORD[((160-256))+rcx]
+ movdqa xmm15,XMMWORD[((176-256))+rcx]
+ movdqa xmm4,XMMWORD[((192-256))+rcx]
+ movdqa xmm5,XMMWORD[((208-256))+rcx]
+ movdqa xmm6,XMMWORD[((224-256))+rcx]
+ movdqa xmm7,XMMWORD[((240-256))+rcx]
+ movdqa xmm0,XMMWORD[((256-256))+rcx]
+ movdqa xmm1,XMMWORD[((272-256))+rcx]
+ movdqa xmm2,XMMWORD[((288-256))+rcx]
+ movdqa xmm3,XMMWORD[((304-256))+rcx]
+ paddd xmm0,XMMWORD[$L$four]
+
+$L$oop_enter4x:
+ movdqa XMMWORD[32+rsp],xmm6
+ movdqa XMMWORD[48+rsp],xmm7
+ movdqa xmm7,XMMWORD[r10]
+ mov eax,10
+ movdqa XMMWORD[(256-256)+rcx],xmm0
+ jmp NEAR $L$oop4x
+
+ALIGN 32
+$L$oop4x:
+ paddd xmm8,xmm12
+ paddd xmm9,xmm13
+ pxor xmm0,xmm8
+ pxor xmm1,xmm9
+DB 102,15,56,0,199
+DB 102,15,56,0,207
+ paddd xmm4,xmm0
+ paddd xmm5,xmm1
+ pxor xmm12,xmm4
+ pxor xmm13,xmm5
+ movdqa xmm6,xmm12
+ pslld xmm12,12
+ psrld xmm6,20
+ movdqa xmm7,xmm13
+ pslld xmm13,12
+ por xmm12,xmm6
+ psrld xmm7,20
+ movdqa xmm6,XMMWORD[r11]
+ por xmm13,xmm7
+ paddd xmm8,xmm12
+ paddd xmm9,xmm13
+ pxor xmm0,xmm8
+ pxor xmm1,xmm9
+DB 102,15,56,0,198
+DB 102,15,56,0,206
+ paddd xmm4,xmm0
+ paddd xmm5,xmm1
+ pxor xmm12,xmm4
+ pxor xmm13,xmm5
+ movdqa xmm7,xmm12
+ pslld xmm12,7
+ psrld xmm7,25
+ movdqa xmm6,xmm13
+ pslld xmm13,7
+ por xmm12,xmm7
+ psrld xmm6,25
+ movdqa xmm7,XMMWORD[r10]
+ por xmm13,xmm6
+ movdqa XMMWORD[rsp],xmm4
+ movdqa XMMWORD[16+rsp],xmm5
+ movdqa xmm4,XMMWORD[32+rsp]
+ movdqa xmm5,XMMWORD[48+rsp]
+ paddd xmm10,xmm14
+ paddd xmm11,xmm15
+ pxor xmm2,xmm10
+ pxor xmm3,xmm11
+DB 102,15,56,0,215
+DB 102,15,56,0,223
+ paddd xmm4,xmm2
+ paddd xmm5,xmm3
+ pxor xmm14,xmm4
+ pxor xmm15,xmm5
+ movdqa xmm6,xmm14
+ pslld xmm14,12
+ psrld xmm6,20
+ movdqa xmm7,xmm15
+ pslld xmm15,12
+ por xmm14,xmm6
+ psrld xmm7,20
+ movdqa xmm6,XMMWORD[r11]
+ por xmm15,xmm7
+ paddd xmm10,xmm14
+ paddd xmm11,xmm15
+ pxor xmm2,xmm10
+ pxor xmm3,xmm11
+DB 102,15,56,0,214
+DB 102,15,56,0,222
+ paddd xmm4,xmm2
+ paddd xmm5,xmm3
+ pxor xmm14,xmm4
+ pxor xmm15,xmm5
+ movdqa xmm7,xmm14
+ pslld xmm14,7
+ psrld xmm7,25
+ movdqa xmm6,xmm15
+ pslld xmm15,7
+ por xmm14,xmm7
+ psrld xmm6,25
+ movdqa xmm7,XMMWORD[r10]
+ por xmm15,xmm6
+ paddd xmm8,xmm13
+ paddd xmm9,xmm14
+ pxor xmm3,xmm8
+ pxor xmm0,xmm9
+DB 102,15,56,0,223
+DB 102,15,56,0,199
+ paddd xmm4,xmm3
+ paddd xmm5,xmm0
+ pxor xmm13,xmm4
+ pxor xmm14,xmm5
+ movdqa xmm6,xmm13
+ pslld xmm13,12
+ psrld xmm6,20
+ movdqa xmm7,xmm14
+ pslld xmm14,12
+ por xmm13,xmm6
+ psrld xmm7,20
+ movdqa xmm6,XMMWORD[r11]
+ por xmm14,xmm7
+ paddd xmm8,xmm13
+ paddd xmm9,xmm14
+ pxor xmm3,xmm8
+ pxor xmm0,xmm9
+DB 102,15,56,0,222
+DB 102,15,56,0,198
+ paddd xmm4,xmm3
+ paddd xmm5,xmm0
+ pxor xmm13,xmm4
+ pxor xmm14,xmm5
+ movdqa xmm7,xmm13
+ pslld xmm13,7
+ psrld xmm7,25
+ movdqa xmm6,xmm14
+ pslld xmm14,7
+ por xmm13,xmm7
+ psrld xmm6,25
+ movdqa xmm7,XMMWORD[r10]
+ por xmm14,xmm6
+ movdqa XMMWORD[32+rsp],xmm4
+ movdqa XMMWORD[48+rsp],xmm5
+ movdqa xmm4,XMMWORD[rsp]
+ movdqa xmm5,XMMWORD[16+rsp]
+ paddd xmm10,xmm15
+ paddd xmm11,xmm12
+ pxor xmm1,xmm10
+ pxor xmm2,xmm11
+DB 102,15,56,0,207
+DB 102,15,56,0,215
+ paddd xmm4,xmm1
+ paddd xmm5,xmm2
+ pxor xmm15,xmm4
+ pxor xmm12,xmm5
+ movdqa xmm6,xmm15
+ pslld xmm15,12
+ psrld xmm6,20
+ movdqa xmm7,xmm12
+ pslld xmm12,12
+ por xmm15,xmm6
+ psrld xmm7,20
+ movdqa xmm6,XMMWORD[r11]
+ por xmm12,xmm7
+ paddd xmm10,xmm15
+ paddd xmm11,xmm12
+ pxor xmm1,xmm10
+ pxor xmm2,xmm11
+DB 102,15,56,0,206
+DB 102,15,56,0,214
+ paddd xmm4,xmm1
+ paddd xmm5,xmm2
+ pxor xmm15,xmm4
+ pxor xmm12,xmm5
+ movdqa xmm7,xmm15
+ pslld xmm15,7
+ psrld xmm7,25
+ movdqa xmm6,xmm12
+ pslld xmm12,7
+ por xmm15,xmm7
+ psrld xmm6,25
+ movdqa xmm7,XMMWORD[r10]
+ por xmm12,xmm6
+ dec eax
+ jnz NEAR $L$oop4x
+
+ paddd xmm8,XMMWORD[64+rsp]
+ paddd xmm9,XMMWORD[80+rsp]
+ paddd xmm10,XMMWORD[96+rsp]
+ paddd xmm11,XMMWORD[112+rsp]
+
+ movdqa xmm6,xmm8
+ punpckldq xmm8,xmm9
+ movdqa xmm7,xmm10
+ punpckldq xmm10,xmm11
+ punpckhdq xmm6,xmm9
+ punpckhdq xmm7,xmm11
+ movdqa xmm9,xmm8
+ punpcklqdq xmm8,xmm10
+ movdqa xmm11,xmm6
+ punpcklqdq xmm6,xmm7
+ punpckhqdq xmm9,xmm10
+ punpckhqdq xmm11,xmm7
+ paddd xmm12,XMMWORD[((128-256))+rcx]
+ paddd xmm13,XMMWORD[((144-256))+rcx]
+ paddd xmm14,XMMWORD[((160-256))+rcx]
+ paddd xmm15,XMMWORD[((176-256))+rcx]
+
+ movdqa XMMWORD[rsp],xmm8
+ movdqa XMMWORD[16+rsp],xmm9
+ movdqa xmm8,XMMWORD[32+rsp]
+ movdqa xmm9,XMMWORD[48+rsp]
+
+ movdqa xmm10,xmm12
+ punpckldq xmm12,xmm13
+ movdqa xmm7,xmm14
+ punpckldq xmm14,xmm15
+ punpckhdq xmm10,xmm13
+ punpckhdq xmm7,xmm15
+ movdqa xmm13,xmm12
+ punpcklqdq xmm12,xmm14
+ movdqa xmm15,xmm10
+ punpcklqdq xmm10,xmm7
+ punpckhqdq xmm13,xmm14
+ punpckhqdq xmm15,xmm7
+ paddd xmm4,XMMWORD[((192-256))+rcx]
+ paddd xmm5,XMMWORD[((208-256))+rcx]
+ paddd xmm8,XMMWORD[((224-256))+rcx]
+ paddd xmm9,XMMWORD[((240-256))+rcx]
+
+ movdqa XMMWORD[32+rsp],xmm6
+ movdqa XMMWORD[48+rsp],xmm11
+
+ movdqa xmm14,xmm4
+ punpckldq xmm4,xmm5
+ movdqa xmm7,xmm8
+ punpckldq xmm8,xmm9
+ punpckhdq xmm14,xmm5
+ punpckhdq xmm7,xmm9
+ movdqa xmm5,xmm4
+ punpcklqdq xmm4,xmm8
+ movdqa xmm9,xmm14
+ punpcklqdq xmm14,xmm7
+ punpckhqdq xmm5,xmm8
+ punpckhqdq xmm9,xmm7
+ paddd xmm0,XMMWORD[((256-256))+rcx]
+ paddd xmm1,XMMWORD[((272-256))+rcx]
+ paddd xmm2,XMMWORD[((288-256))+rcx]
+ paddd xmm3,XMMWORD[((304-256))+rcx]
+
+ movdqa xmm8,xmm0
+ punpckldq xmm0,xmm1
+ movdqa xmm7,xmm2
+ punpckldq xmm2,xmm3
+ punpckhdq xmm8,xmm1
+ punpckhdq xmm7,xmm3
+ movdqa xmm1,xmm0
+ punpcklqdq xmm0,xmm2
+ movdqa xmm3,xmm8
+ punpcklqdq xmm8,xmm7
+ punpckhqdq xmm1,xmm2
+ punpckhqdq xmm3,xmm7
+ cmp rdx,64*4
+ jb NEAR $L$tail4x
+
+ movdqu xmm6,XMMWORD[rsi]
+ movdqu xmm11,XMMWORD[16+rsi]
+ movdqu xmm2,XMMWORD[32+rsi]
+ movdqu xmm7,XMMWORD[48+rsi]
+ pxor xmm6,XMMWORD[rsp]
+ pxor xmm11,xmm12
+ pxor xmm2,xmm4
+ pxor xmm7,xmm0
+
+ movdqu XMMWORD[rdi],xmm6
+ movdqu xmm6,XMMWORD[64+rsi]
+ movdqu XMMWORD[16+rdi],xmm11
+ movdqu xmm11,XMMWORD[80+rsi]
+ movdqu XMMWORD[32+rdi],xmm2
+ movdqu xmm2,XMMWORD[96+rsi]
+ movdqu XMMWORD[48+rdi],xmm7
+ movdqu xmm7,XMMWORD[112+rsi]
+ lea rsi,[128+rsi]
+ pxor xmm6,XMMWORD[16+rsp]
+ pxor xmm11,xmm13
+ pxor xmm2,xmm5
+ pxor xmm7,xmm1
+
+ movdqu XMMWORD[64+rdi],xmm6
+ movdqu xmm6,XMMWORD[rsi]
+ movdqu XMMWORD[80+rdi],xmm11
+ movdqu xmm11,XMMWORD[16+rsi]
+ movdqu XMMWORD[96+rdi],xmm2
+ movdqu xmm2,XMMWORD[32+rsi]
+ movdqu XMMWORD[112+rdi],xmm7
+ lea rdi,[128+rdi]
+ movdqu xmm7,XMMWORD[48+rsi]
+ pxor xmm6,XMMWORD[32+rsp]
+ pxor xmm11,xmm10
+ pxor xmm2,xmm14
+ pxor xmm7,xmm8
+
+ movdqu XMMWORD[rdi],xmm6
+ movdqu xmm6,XMMWORD[64+rsi]
+ movdqu XMMWORD[16+rdi],xmm11
+ movdqu xmm11,XMMWORD[80+rsi]
+ movdqu XMMWORD[32+rdi],xmm2
+ movdqu xmm2,XMMWORD[96+rsi]
+ movdqu XMMWORD[48+rdi],xmm7
+ movdqu xmm7,XMMWORD[112+rsi]
+ lea rsi,[128+rsi]
+ pxor xmm6,XMMWORD[48+rsp]
+ pxor xmm11,xmm15
+ pxor xmm2,xmm9
+ pxor xmm7,xmm3
+ movdqu XMMWORD[64+rdi],xmm6
+ movdqu XMMWORD[80+rdi],xmm11
+ movdqu XMMWORD[96+rdi],xmm2
+ movdqu XMMWORD[112+rdi],xmm7
+ lea rdi,[128+rdi]
+
+ sub rdx,64*4
+ jnz NEAR $L$oop_outer4x
+
+ jmp NEAR $L$done4x
+
+$L$tail4x:
+ cmp rdx,192
+ jae NEAR $L$192_or_more4x
+ cmp rdx,128
+ jae NEAR $L$128_or_more4x
+ cmp rdx,64
+ jae NEAR $L$64_or_more4x
+
+
+ xor r10,r10
+
+ movdqa XMMWORD[16+rsp],xmm12
+ movdqa XMMWORD[32+rsp],xmm4
+ movdqa XMMWORD[48+rsp],xmm0
+ jmp NEAR $L$oop_tail4x
+
+ALIGN 32
+$L$64_or_more4x:
+ movdqu xmm6,XMMWORD[rsi]
+ movdqu xmm11,XMMWORD[16+rsi]
+ movdqu xmm2,XMMWORD[32+rsi]
+ movdqu xmm7,XMMWORD[48+rsi]
+ pxor xmm6,XMMWORD[rsp]
+ pxor xmm11,xmm12
+ pxor xmm2,xmm4
+ pxor xmm7,xmm0
+ movdqu XMMWORD[rdi],xmm6
+ movdqu XMMWORD[16+rdi],xmm11
+ movdqu XMMWORD[32+rdi],xmm2
+ movdqu XMMWORD[48+rdi],xmm7
+ je NEAR $L$done4x
+
+ movdqa xmm6,XMMWORD[16+rsp]
+ lea rsi,[64+rsi]
+ xor r10,r10
+ movdqa XMMWORD[rsp],xmm6
+ movdqa XMMWORD[16+rsp],xmm13
+ lea rdi,[64+rdi]
+ movdqa XMMWORD[32+rsp],xmm5
+ sub rdx,64
+ movdqa XMMWORD[48+rsp],xmm1
+ jmp NEAR $L$oop_tail4x
+
+ALIGN 32
+$L$128_or_more4x:
+ movdqu xmm6,XMMWORD[rsi]
+ movdqu xmm11,XMMWORD[16+rsi]
+ movdqu xmm2,XMMWORD[32+rsi]
+ movdqu xmm7,XMMWORD[48+rsi]
+ pxor xmm6,XMMWORD[rsp]
+ pxor xmm11,xmm12
+ pxor xmm2,xmm4
+ pxor xmm7,xmm0
+
+ movdqu XMMWORD[rdi],xmm6
+ movdqu xmm6,XMMWORD[64+rsi]
+ movdqu XMMWORD[16+rdi],xmm11
+ movdqu xmm11,XMMWORD[80+rsi]
+ movdqu XMMWORD[32+rdi],xmm2
+ movdqu xmm2,XMMWORD[96+rsi]
+ movdqu XMMWORD[48+rdi],xmm7
+ movdqu xmm7,XMMWORD[112+rsi]
+ pxor xmm6,XMMWORD[16+rsp]
+ pxor xmm11,xmm13
+ pxor xmm2,xmm5
+ pxor xmm7,xmm1
+ movdqu XMMWORD[64+rdi],xmm6
+ movdqu XMMWORD[80+rdi],xmm11
+ movdqu XMMWORD[96+rdi],xmm2
+ movdqu XMMWORD[112+rdi],xmm7
+ je NEAR $L$done4x
+
+ movdqa xmm6,XMMWORD[32+rsp]
+ lea rsi,[128+rsi]
+ xor r10,r10
+ movdqa XMMWORD[rsp],xmm6
+ movdqa XMMWORD[16+rsp],xmm10
+ lea rdi,[128+rdi]
+ movdqa XMMWORD[32+rsp],xmm14
+ sub rdx,128
+ movdqa XMMWORD[48+rsp],xmm8
+ jmp NEAR $L$oop_tail4x
+
+ALIGN 32
+$L$192_or_more4x:
+ movdqu xmm6,XMMWORD[rsi]
+ movdqu xmm11,XMMWORD[16+rsi]
+ movdqu xmm2,XMMWORD[32+rsi]
+ movdqu xmm7,XMMWORD[48+rsi]
+ pxor xmm6,XMMWORD[rsp]
+ pxor xmm11,xmm12
+ pxor xmm2,xmm4
+ pxor xmm7,xmm0
+
+ movdqu XMMWORD[rdi],xmm6
+ movdqu xmm6,XMMWORD[64+rsi]
+ movdqu XMMWORD[16+rdi],xmm11
+ movdqu xmm11,XMMWORD[80+rsi]
+ movdqu XMMWORD[32+rdi],xmm2
+ movdqu xmm2,XMMWORD[96+rsi]
+ movdqu XMMWORD[48+rdi],xmm7
+ movdqu xmm7,XMMWORD[112+rsi]
+ lea rsi,[128+rsi]
+ pxor xmm6,XMMWORD[16+rsp]
+ pxor xmm11,xmm13
+ pxor xmm2,xmm5
+ pxor xmm7,xmm1
+
+ movdqu XMMWORD[64+rdi],xmm6
+ movdqu xmm6,XMMWORD[rsi]
+ movdqu XMMWORD[80+rdi],xmm11
+ movdqu xmm11,XMMWORD[16+rsi]
+ movdqu XMMWORD[96+rdi],xmm2
+ movdqu xmm2,XMMWORD[32+rsi]
+ movdqu XMMWORD[112+rdi],xmm7
+ lea rdi,[128+rdi]
+ movdqu xmm7,XMMWORD[48+rsi]
+ pxor xmm6,XMMWORD[32+rsp]
+ pxor xmm11,xmm10
+ pxor xmm2,xmm14
+ pxor xmm7,xmm8
+ movdqu XMMWORD[rdi],xmm6
+ movdqu XMMWORD[16+rdi],xmm11
+ movdqu XMMWORD[32+rdi],xmm2
+ movdqu XMMWORD[48+rdi],xmm7
+ je NEAR $L$done4x
+
+ movdqa xmm6,XMMWORD[48+rsp]
+ lea rsi,[64+rsi]
+ xor r10,r10
+ movdqa XMMWORD[rsp],xmm6
+ movdqa XMMWORD[16+rsp],xmm15
+ lea rdi,[64+rdi]
+ movdqa XMMWORD[32+rsp],xmm9
+ sub rdx,192
+ movdqa XMMWORD[48+rsp],xmm3
+
+$L$oop_tail4x:
+ movzx eax,BYTE[r10*1+rsi]
+ movzx ecx,BYTE[r10*1+rsp]
+ lea r10,[1+r10]
+ xor eax,ecx
+ mov BYTE[((-1))+r10*1+rdi],al
+ dec rdx
+ jnz NEAR $L$oop_tail4x
+
+$L$done4x:
+ movaps xmm6,XMMWORD[((-168))+r9]
+ movaps xmm7,XMMWORD[((-152))+r9]
+ movaps xmm8,XMMWORD[((-136))+r9]
+ movaps xmm9,XMMWORD[((-120))+r9]
+ movaps xmm10,XMMWORD[((-104))+r9]
+ movaps xmm11,XMMWORD[((-88))+r9]
+ movaps xmm12,XMMWORD[((-72))+r9]
+ movaps xmm13,XMMWORD[((-56))+r9]
+ movaps xmm14,XMMWORD[((-40))+r9]
+ movaps xmm15,XMMWORD[((-24))+r9]
+ lea rsp,[r9]
+
+$L$4x_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_ChaCha20_ctr32_ssse3_4x:
+global ChaCha20_ctr32_avx2
+
+ALIGN 32
+ChaCha20_ctr32_avx2:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_ChaCha20_ctr32_avx2:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+
+
+
+_CET_ENDBR
+ mov r9,rsp
+
+ sub rsp,0x280+168
+ and rsp,-32
+ movaps XMMWORD[(-168)+r9],xmm6
+ movaps XMMWORD[(-152)+r9],xmm7
+ movaps XMMWORD[(-136)+r9],xmm8
+ movaps XMMWORD[(-120)+r9],xmm9
+ movaps XMMWORD[(-104)+r9],xmm10
+ movaps XMMWORD[(-88)+r9],xmm11
+ movaps XMMWORD[(-72)+r9],xmm12
+ movaps XMMWORD[(-56)+r9],xmm13
+ movaps XMMWORD[(-40)+r9],xmm14
+ movaps XMMWORD[(-24)+r9],xmm15
+$L$8x_body:
+ vzeroupper
+
+
+
+
+
+
+
+
+
+
+ vbroadcasti128 ymm11,XMMWORD[$L$sigma]
+ vbroadcasti128 ymm3,XMMWORD[rcx]
+ vbroadcasti128 ymm15,XMMWORD[16+rcx]
+ vbroadcasti128 ymm7,XMMWORD[r8]
+ lea rcx,[256+rsp]
+ lea rax,[512+rsp]
+ lea r10,[$L$rot16]
+ lea r11,[$L$rot24]
+
+ vpshufd ymm8,ymm11,0x00
+ vpshufd ymm9,ymm11,0x55
+ vmovdqa YMMWORD[(128-256)+rcx],ymm8
+ vpshufd ymm10,ymm11,0xaa
+ vmovdqa YMMWORD[(160-256)+rcx],ymm9
+ vpshufd ymm11,ymm11,0xff
+ vmovdqa YMMWORD[(192-256)+rcx],ymm10
+ vmovdqa YMMWORD[(224-256)+rcx],ymm11
+
+ vpshufd ymm0,ymm3,0x00
+ vpshufd ymm1,ymm3,0x55
+ vmovdqa YMMWORD[(256-256)+rcx],ymm0
+ vpshufd ymm2,ymm3,0xaa
+ vmovdqa YMMWORD[(288-256)+rcx],ymm1
+ vpshufd ymm3,ymm3,0xff
+ vmovdqa YMMWORD[(320-256)+rcx],ymm2
+ vmovdqa YMMWORD[(352-256)+rcx],ymm3
+
+ vpshufd ymm12,ymm15,0x00
+ vpshufd ymm13,ymm15,0x55
+ vmovdqa YMMWORD[(384-512)+rax],ymm12
+ vpshufd ymm14,ymm15,0xaa
+ vmovdqa YMMWORD[(416-512)+rax],ymm13
+ vpshufd ymm15,ymm15,0xff
+ vmovdqa YMMWORD[(448-512)+rax],ymm14
+ vmovdqa YMMWORD[(480-512)+rax],ymm15
+
+ vpshufd ymm4,ymm7,0x00
+ vpshufd ymm5,ymm7,0x55
+ vpaddd ymm4,ymm4,YMMWORD[$L$incy]
+ vpshufd ymm6,ymm7,0xaa
+ vmovdqa YMMWORD[(544-512)+rax],ymm5
+ vpshufd ymm7,ymm7,0xff
+ vmovdqa YMMWORD[(576-512)+rax],ymm6
+ vmovdqa YMMWORD[(608-512)+rax],ymm7
+
+ jmp NEAR $L$oop_enter8x
+
+ALIGN 32
+$L$oop_outer8x:
+ vmovdqa ymm8,YMMWORD[((128-256))+rcx]
+ vmovdqa ymm9,YMMWORD[((160-256))+rcx]
+ vmovdqa ymm10,YMMWORD[((192-256))+rcx]
+ vmovdqa ymm11,YMMWORD[((224-256))+rcx]
+ vmovdqa ymm0,YMMWORD[((256-256))+rcx]
+ vmovdqa ymm1,YMMWORD[((288-256))+rcx]
+ vmovdqa ymm2,YMMWORD[((320-256))+rcx]
+ vmovdqa ymm3,YMMWORD[((352-256))+rcx]
+ vmovdqa ymm12,YMMWORD[((384-512))+rax]
+ vmovdqa ymm13,YMMWORD[((416-512))+rax]
+ vmovdqa ymm14,YMMWORD[((448-512))+rax]
+ vmovdqa ymm15,YMMWORD[((480-512))+rax]
+ vmovdqa ymm4,YMMWORD[((512-512))+rax]
+ vmovdqa ymm5,YMMWORD[((544-512))+rax]
+ vmovdqa ymm6,YMMWORD[((576-512))+rax]
+ vmovdqa ymm7,YMMWORD[((608-512))+rax]
+ vpaddd ymm4,ymm4,YMMWORD[$L$eight]
+
+$L$oop_enter8x:
+ vmovdqa YMMWORD[64+rsp],ymm14
+ vmovdqa YMMWORD[96+rsp],ymm15
+ vbroadcasti128 ymm15,XMMWORD[r10]
+ vmovdqa YMMWORD[(512-512)+rax],ymm4
+ mov eax,10
+ jmp NEAR $L$oop8x
+
+ALIGN 32
+$L$oop8x:
+ vpaddd ymm8,ymm8,ymm0
+ vpxor ymm4,ymm8,ymm4
+ vpshufb ymm4,ymm4,ymm15
+ vpaddd ymm9,ymm9,ymm1
+ vpxor ymm5,ymm9,ymm5
+ vpshufb ymm5,ymm5,ymm15
+ vpaddd ymm12,ymm12,ymm4
+ vpxor ymm0,ymm12,ymm0
+ vpslld ymm14,ymm0,12
+ vpsrld ymm0,ymm0,20
+ vpor ymm0,ymm14,ymm0
+ vbroadcasti128 ymm14,XMMWORD[r11]
+ vpaddd ymm13,ymm13,ymm5
+ vpxor ymm1,ymm13,ymm1
+ vpslld ymm15,ymm1,12
+ vpsrld ymm1,ymm1,20
+ vpor ymm1,ymm15,ymm1
+ vpaddd ymm8,ymm8,ymm0
+ vpxor ymm4,ymm8,ymm4
+ vpshufb ymm4,ymm4,ymm14
+ vpaddd ymm9,ymm9,ymm1
+ vpxor ymm5,ymm9,ymm5
+ vpshufb ymm5,ymm5,ymm14
+ vpaddd ymm12,ymm12,ymm4
+ vpxor ymm0,ymm12,ymm0
+ vpslld ymm15,ymm0,7
+ vpsrld ymm0,ymm0,25
+ vpor ymm0,ymm15,ymm0
+ vbroadcasti128 ymm15,XMMWORD[r10]
+ vpaddd ymm13,ymm13,ymm5
+ vpxor ymm1,ymm13,ymm1
+ vpslld ymm14,ymm1,7
+ vpsrld ymm1,ymm1,25
+ vpor ymm1,ymm14,ymm1
+ vmovdqa YMMWORD[rsp],ymm12
+ vmovdqa YMMWORD[32+rsp],ymm13
+ vmovdqa ymm12,YMMWORD[64+rsp]
+ vmovdqa ymm13,YMMWORD[96+rsp]
+ vpaddd ymm10,ymm10,ymm2
+ vpxor ymm6,ymm10,ymm6
+ vpshufb ymm6,ymm6,ymm15
+ vpaddd ymm11,ymm11,ymm3
+ vpxor ymm7,ymm11,ymm7
+ vpshufb ymm7,ymm7,ymm15
+ vpaddd ymm12,ymm12,ymm6
+ vpxor ymm2,ymm12,ymm2
+ vpslld ymm14,ymm2,12
+ vpsrld ymm2,ymm2,20
+ vpor ymm2,ymm14,ymm2
+ vbroadcasti128 ymm14,XMMWORD[r11]
+ vpaddd ymm13,ymm13,ymm7
+ vpxor ymm3,ymm13,ymm3
+ vpslld ymm15,ymm3,12
+ vpsrld ymm3,ymm3,20
+ vpor ymm3,ymm15,ymm3
+ vpaddd ymm10,ymm10,ymm2
+ vpxor ymm6,ymm10,ymm6
+ vpshufb ymm6,ymm6,ymm14
+ vpaddd ymm11,ymm11,ymm3
+ vpxor ymm7,ymm11,ymm7
+ vpshufb ymm7,ymm7,ymm14
+ vpaddd ymm12,ymm12,ymm6
+ vpxor ymm2,ymm12,ymm2
+ vpslld ymm15,ymm2,7
+ vpsrld ymm2,ymm2,25
+ vpor ymm2,ymm15,ymm2
+ vbroadcasti128 ymm15,XMMWORD[r10]
+ vpaddd ymm13,ymm13,ymm7
+ vpxor ymm3,ymm13,ymm3
+ vpslld ymm14,ymm3,7
+ vpsrld ymm3,ymm3,25
+ vpor ymm3,ymm14,ymm3
+ vpaddd ymm8,ymm8,ymm1
+ vpxor ymm7,ymm8,ymm7
+ vpshufb ymm7,ymm7,ymm15
+ vpaddd ymm9,ymm9,ymm2
+ vpxor ymm4,ymm9,ymm4
+ vpshufb ymm4,ymm4,ymm15
+ vpaddd ymm12,ymm12,ymm7
+ vpxor ymm1,ymm12,ymm1
+ vpslld ymm14,ymm1,12
+ vpsrld ymm1,ymm1,20
+ vpor ymm1,ymm14,ymm1
+ vbroadcasti128 ymm14,XMMWORD[r11]
+ vpaddd ymm13,ymm13,ymm4
+ vpxor ymm2,ymm13,ymm2
+ vpslld ymm15,ymm2,12
+ vpsrld ymm2,ymm2,20
+ vpor ymm2,ymm15,ymm2
+ vpaddd ymm8,ymm8,ymm1
+ vpxor ymm7,ymm8,ymm7
+ vpshufb ymm7,ymm7,ymm14
+ vpaddd ymm9,ymm9,ymm2
+ vpxor ymm4,ymm9,ymm4
+ vpshufb ymm4,ymm4,ymm14
+ vpaddd ymm12,ymm12,ymm7
+ vpxor ymm1,ymm12,ymm1
+ vpslld ymm15,ymm1,7
+ vpsrld ymm1,ymm1,25
+ vpor ymm1,ymm15,ymm1
+ vbroadcasti128 ymm15,XMMWORD[r10]
+ vpaddd ymm13,ymm13,ymm4
+ vpxor ymm2,ymm13,ymm2
+ vpslld ymm14,ymm2,7
+ vpsrld ymm2,ymm2,25
+ vpor ymm2,ymm14,ymm2
+ vmovdqa YMMWORD[64+rsp],ymm12
+ vmovdqa YMMWORD[96+rsp],ymm13
+ vmovdqa ymm12,YMMWORD[rsp]
+ vmovdqa ymm13,YMMWORD[32+rsp]
+ vpaddd ymm10,ymm10,ymm3
+ vpxor ymm5,ymm10,ymm5
+ vpshufb ymm5,ymm5,ymm15
+ vpaddd ymm11,ymm11,ymm0
+ vpxor ymm6,ymm11,ymm6
+ vpshufb ymm6,ymm6,ymm15
+ vpaddd ymm12,ymm12,ymm5
+ vpxor ymm3,ymm12,ymm3
+ vpslld ymm14,ymm3,12
+ vpsrld ymm3,ymm3,20
+ vpor ymm3,ymm14,ymm3
+ vbroadcasti128 ymm14,XMMWORD[r11]
+ vpaddd ymm13,ymm13,ymm6
+ vpxor ymm0,ymm13,ymm0
+ vpslld ymm15,ymm0,12
+ vpsrld ymm0,ymm0,20
+ vpor ymm0,ymm15,ymm0
+ vpaddd ymm10,ymm10,ymm3
+ vpxor ymm5,ymm10,ymm5
+ vpshufb ymm5,ymm5,ymm14
+ vpaddd ymm11,ymm11,ymm0
+ vpxor ymm6,ymm11,ymm6
+ vpshufb ymm6,ymm6,ymm14
+ vpaddd ymm12,ymm12,ymm5
+ vpxor ymm3,ymm12,ymm3
+ vpslld ymm15,ymm3,7
+ vpsrld ymm3,ymm3,25
+ vpor ymm3,ymm15,ymm3
+ vbroadcasti128 ymm15,XMMWORD[r10]
+ vpaddd ymm13,ymm13,ymm6
+ vpxor ymm0,ymm13,ymm0
+ vpslld ymm14,ymm0,7
+ vpsrld ymm0,ymm0,25
+ vpor ymm0,ymm14,ymm0
+ dec eax
+ jnz NEAR $L$oop8x
+
+ lea rax,[512+rsp]
+ vpaddd ymm8,ymm8,YMMWORD[((128-256))+rcx]
+ vpaddd ymm9,ymm9,YMMWORD[((160-256))+rcx]
+ vpaddd ymm10,ymm10,YMMWORD[((192-256))+rcx]
+ vpaddd ymm11,ymm11,YMMWORD[((224-256))+rcx]
+
+ vpunpckldq ymm14,ymm8,ymm9
+ vpunpckldq ymm15,ymm10,ymm11
+ vpunpckhdq ymm8,ymm8,ymm9
+ vpunpckhdq ymm10,ymm10,ymm11
+ vpunpcklqdq ymm9,ymm14,ymm15
+ vpunpckhqdq ymm14,ymm14,ymm15
+ vpunpcklqdq ymm11,ymm8,ymm10
+ vpunpckhqdq ymm8,ymm8,ymm10
+ vpaddd ymm0,ymm0,YMMWORD[((256-256))+rcx]
+ vpaddd ymm1,ymm1,YMMWORD[((288-256))+rcx]
+ vpaddd ymm2,ymm2,YMMWORD[((320-256))+rcx]
+ vpaddd ymm3,ymm3,YMMWORD[((352-256))+rcx]
+
+ vpunpckldq ymm10,ymm0,ymm1
+ vpunpckldq ymm15,ymm2,ymm3
+ vpunpckhdq ymm0,ymm0,ymm1
+ vpunpckhdq ymm2,ymm2,ymm3
+ vpunpcklqdq ymm1,ymm10,ymm15
+ vpunpckhqdq ymm10,ymm10,ymm15
+ vpunpcklqdq ymm3,ymm0,ymm2
+ vpunpckhqdq ymm0,ymm0,ymm2
+ vperm2i128 ymm15,ymm9,ymm1,0x20
+ vperm2i128 ymm1,ymm9,ymm1,0x31
+ vperm2i128 ymm9,ymm14,ymm10,0x20
+ vperm2i128 ymm10,ymm14,ymm10,0x31
+ vperm2i128 ymm14,ymm11,ymm3,0x20
+ vperm2i128 ymm3,ymm11,ymm3,0x31
+ vperm2i128 ymm11,ymm8,ymm0,0x20
+ vperm2i128 ymm0,ymm8,ymm0,0x31
+ vmovdqa YMMWORD[rsp],ymm15
+ vmovdqa YMMWORD[32+rsp],ymm9
+ vmovdqa ymm15,YMMWORD[64+rsp]
+ vmovdqa ymm9,YMMWORD[96+rsp]
+
+ vpaddd ymm12,ymm12,YMMWORD[((384-512))+rax]
+ vpaddd ymm13,ymm13,YMMWORD[((416-512))+rax]
+ vpaddd ymm15,ymm15,YMMWORD[((448-512))+rax]
+ vpaddd ymm9,ymm9,YMMWORD[((480-512))+rax]
+
+ vpunpckldq ymm2,ymm12,ymm13
+ vpunpckldq ymm8,ymm15,ymm9
+ vpunpckhdq ymm12,ymm12,ymm13
+ vpunpckhdq ymm15,ymm15,ymm9
+ vpunpcklqdq ymm13,ymm2,ymm8
+ vpunpckhqdq ymm2,ymm2,ymm8
+ vpunpcklqdq ymm9,ymm12,ymm15
+ vpunpckhqdq ymm12,ymm12,ymm15
+ vpaddd ymm4,ymm4,YMMWORD[((512-512))+rax]
+ vpaddd ymm5,ymm5,YMMWORD[((544-512))+rax]
+ vpaddd ymm6,ymm6,YMMWORD[((576-512))+rax]
+ vpaddd ymm7,ymm7,YMMWORD[((608-512))+rax]
+
+ vpunpckldq ymm15,ymm4,ymm5
+ vpunpckldq ymm8,ymm6,ymm7
+ vpunpckhdq ymm4,ymm4,ymm5
+ vpunpckhdq ymm6,ymm6,ymm7
+ vpunpcklqdq ymm5,ymm15,ymm8
+ vpunpckhqdq ymm15,ymm15,ymm8
+ vpunpcklqdq ymm7,ymm4,ymm6
+ vpunpckhqdq ymm4,ymm4,ymm6
+ vperm2i128 ymm8,ymm13,ymm5,0x20
+ vperm2i128 ymm5,ymm13,ymm5,0x31
+ vperm2i128 ymm13,ymm2,ymm15,0x20
+ vperm2i128 ymm15,ymm2,ymm15,0x31
+ vperm2i128 ymm2,ymm9,ymm7,0x20
+ vperm2i128 ymm7,ymm9,ymm7,0x31
+ vperm2i128 ymm9,ymm12,ymm4,0x20
+ vperm2i128 ymm4,ymm12,ymm4,0x31
+ vmovdqa ymm6,YMMWORD[rsp]
+ vmovdqa ymm12,YMMWORD[32+rsp]
+
+ cmp rdx,64*8
+ jb NEAR $L$tail8x
+
+ vpxor ymm6,ymm6,YMMWORD[rsi]
+ vpxor ymm8,ymm8,YMMWORD[32+rsi]
+ vpxor ymm1,ymm1,YMMWORD[64+rsi]
+ vpxor ymm5,ymm5,YMMWORD[96+rsi]
+ lea rsi,[128+rsi]
+ vmovdqu YMMWORD[rdi],ymm6
+ vmovdqu YMMWORD[32+rdi],ymm8
+ vmovdqu YMMWORD[64+rdi],ymm1
+ vmovdqu YMMWORD[96+rdi],ymm5
+ lea rdi,[128+rdi]
+
+ vpxor ymm12,ymm12,YMMWORD[rsi]
+ vpxor ymm13,ymm13,YMMWORD[32+rsi]
+ vpxor ymm10,ymm10,YMMWORD[64+rsi]
+ vpxor ymm15,ymm15,YMMWORD[96+rsi]
+ lea rsi,[128+rsi]
+ vmovdqu YMMWORD[rdi],ymm12
+ vmovdqu YMMWORD[32+rdi],ymm13
+ vmovdqu YMMWORD[64+rdi],ymm10
+ vmovdqu YMMWORD[96+rdi],ymm15
+ lea rdi,[128+rdi]
+
+ vpxor ymm14,ymm14,YMMWORD[rsi]
+ vpxor ymm2,ymm2,YMMWORD[32+rsi]
+ vpxor ymm3,ymm3,YMMWORD[64+rsi]
+ vpxor ymm7,ymm7,YMMWORD[96+rsi]
+ lea rsi,[128+rsi]
+ vmovdqu YMMWORD[rdi],ymm14
+ vmovdqu YMMWORD[32+rdi],ymm2
+ vmovdqu YMMWORD[64+rdi],ymm3
+ vmovdqu YMMWORD[96+rdi],ymm7
+ lea rdi,[128+rdi]
+
+ vpxor ymm11,ymm11,YMMWORD[rsi]
+ vpxor ymm9,ymm9,YMMWORD[32+rsi]
+ vpxor ymm0,ymm0,YMMWORD[64+rsi]
+ vpxor ymm4,ymm4,YMMWORD[96+rsi]
+ lea rsi,[128+rsi]
+ vmovdqu YMMWORD[rdi],ymm11
+ vmovdqu YMMWORD[32+rdi],ymm9
+ vmovdqu YMMWORD[64+rdi],ymm0
+ vmovdqu YMMWORD[96+rdi],ymm4
+ lea rdi,[128+rdi]
+
+ sub rdx,64*8
+ jnz NEAR $L$oop_outer8x
+
+ jmp NEAR $L$done8x
+
+$L$tail8x:
+ cmp rdx,448
+ jae NEAR $L$448_or_more8x
+ cmp rdx,384
+ jae NEAR $L$384_or_more8x
+ cmp rdx,320
+ jae NEAR $L$320_or_more8x
+ cmp rdx,256
+ jae NEAR $L$256_or_more8x
+ cmp rdx,192
+ jae NEAR $L$192_or_more8x
+ cmp rdx,128
+ jae NEAR $L$128_or_more8x
+ cmp rdx,64
+ jae NEAR $L$64_or_more8x
+
+ xor r10,r10
+ vmovdqa YMMWORD[rsp],ymm6
+ vmovdqa YMMWORD[32+rsp],ymm8
+ jmp NEAR $L$oop_tail8x
+
+ALIGN 32
+$L$64_or_more8x:
+ vpxor ymm6,ymm6,YMMWORD[rsi]
+ vpxor ymm8,ymm8,YMMWORD[32+rsi]
+ vmovdqu YMMWORD[rdi],ymm6
+ vmovdqu YMMWORD[32+rdi],ymm8
+ je NEAR $L$done8x
+
+ lea rsi,[64+rsi]
+ xor r10,r10
+ vmovdqa YMMWORD[rsp],ymm1
+ lea rdi,[64+rdi]
+ sub rdx,64
+ vmovdqa YMMWORD[32+rsp],ymm5
+ jmp NEAR $L$oop_tail8x
+
+ALIGN 32
+$L$128_or_more8x:
+ vpxor ymm6,ymm6,YMMWORD[rsi]
+ vpxor ymm8,ymm8,YMMWORD[32+rsi]
+ vpxor ymm1,ymm1,YMMWORD[64+rsi]
+ vpxor ymm5,ymm5,YMMWORD[96+rsi]
+ vmovdqu YMMWORD[rdi],ymm6
+ vmovdqu YMMWORD[32+rdi],ymm8
+ vmovdqu YMMWORD[64+rdi],ymm1
+ vmovdqu YMMWORD[96+rdi],ymm5
+ je NEAR $L$done8x
+
+ lea rsi,[128+rsi]
+ xor r10,r10
+ vmovdqa YMMWORD[rsp],ymm12
+ lea rdi,[128+rdi]
+ sub rdx,128
+ vmovdqa YMMWORD[32+rsp],ymm13
+ jmp NEAR $L$oop_tail8x
+
+ALIGN 32
+$L$192_or_more8x:
+ vpxor ymm6,ymm6,YMMWORD[rsi]
+ vpxor ymm8,ymm8,YMMWORD[32+rsi]
+ vpxor ymm1,ymm1,YMMWORD[64+rsi]
+ vpxor ymm5,ymm5,YMMWORD[96+rsi]
+ vpxor ymm12,ymm12,YMMWORD[128+rsi]
+ vpxor ymm13,ymm13,YMMWORD[160+rsi]
+ vmovdqu YMMWORD[rdi],ymm6
+ vmovdqu YMMWORD[32+rdi],ymm8
+ vmovdqu YMMWORD[64+rdi],ymm1
+ vmovdqu YMMWORD[96+rdi],ymm5
+ vmovdqu YMMWORD[128+rdi],ymm12
+ vmovdqu YMMWORD[160+rdi],ymm13
+ je NEAR $L$done8x
+
+ lea rsi,[192+rsi]
+ xor r10,r10
+ vmovdqa YMMWORD[rsp],ymm10
+ lea rdi,[192+rdi]
+ sub rdx,192
+ vmovdqa YMMWORD[32+rsp],ymm15
+ jmp NEAR $L$oop_tail8x
+
+ALIGN 32
+$L$256_or_more8x:
+ vpxor ymm6,ymm6,YMMWORD[rsi]
+ vpxor ymm8,ymm8,YMMWORD[32+rsi]
+ vpxor ymm1,ymm1,YMMWORD[64+rsi]
+ vpxor ymm5,ymm5,YMMWORD[96+rsi]
+ vpxor ymm12,ymm12,YMMWORD[128+rsi]
+ vpxor ymm13,ymm13,YMMWORD[160+rsi]
+ vpxor ymm10,ymm10,YMMWORD[192+rsi]
+ vpxor ymm15,ymm15,YMMWORD[224+rsi]
+ vmovdqu YMMWORD[rdi],ymm6
+ vmovdqu YMMWORD[32+rdi],ymm8
+ vmovdqu YMMWORD[64+rdi],ymm1
+ vmovdqu YMMWORD[96+rdi],ymm5
+ vmovdqu YMMWORD[128+rdi],ymm12
+ vmovdqu YMMWORD[160+rdi],ymm13
+ vmovdqu YMMWORD[192+rdi],ymm10
+ vmovdqu YMMWORD[224+rdi],ymm15
+ je NEAR $L$done8x
+
+ lea rsi,[256+rsi]
+ xor r10,r10
+ vmovdqa YMMWORD[rsp],ymm14
+ lea rdi,[256+rdi]
+ sub rdx,256
+ vmovdqa YMMWORD[32+rsp],ymm2
+ jmp NEAR $L$oop_tail8x
+
+ALIGN 32
+$L$320_or_more8x:
+ vpxor ymm6,ymm6,YMMWORD[rsi]
+ vpxor ymm8,ymm8,YMMWORD[32+rsi]
+ vpxor ymm1,ymm1,YMMWORD[64+rsi]
+ vpxor ymm5,ymm5,YMMWORD[96+rsi]
+ vpxor ymm12,ymm12,YMMWORD[128+rsi]
+ vpxor ymm13,ymm13,YMMWORD[160+rsi]
+ vpxor ymm10,ymm10,YMMWORD[192+rsi]
+ vpxor ymm15,ymm15,YMMWORD[224+rsi]
+ vpxor ymm14,ymm14,YMMWORD[256+rsi]
+ vpxor ymm2,ymm2,YMMWORD[288+rsi]
+ vmovdqu YMMWORD[rdi],ymm6
+ vmovdqu YMMWORD[32+rdi],ymm8
+ vmovdqu YMMWORD[64+rdi],ymm1
+ vmovdqu YMMWORD[96+rdi],ymm5
+ vmovdqu YMMWORD[128+rdi],ymm12
+ vmovdqu YMMWORD[160+rdi],ymm13
+ vmovdqu YMMWORD[192+rdi],ymm10
+ vmovdqu YMMWORD[224+rdi],ymm15
+ vmovdqu YMMWORD[256+rdi],ymm14
+ vmovdqu YMMWORD[288+rdi],ymm2
+ je NEAR $L$done8x
+
+ lea rsi,[320+rsi]
+ xor r10,r10
+ vmovdqa YMMWORD[rsp],ymm3
+ lea rdi,[320+rdi]
+ sub rdx,320
+ vmovdqa YMMWORD[32+rsp],ymm7
+ jmp NEAR $L$oop_tail8x
+
+ALIGN 32
+$L$384_or_more8x:
+ vpxor ymm6,ymm6,YMMWORD[rsi]
+ vpxor ymm8,ymm8,YMMWORD[32+rsi]
+ vpxor ymm1,ymm1,YMMWORD[64+rsi]
+ vpxor ymm5,ymm5,YMMWORD[96+rsi]
+ vpxor ymm12,ymm12,YMMWORD[128+rsi]
+ vpxor ymm13,ymm13,YMMWORD[160+rsi]
+ vpxor ymm10,ymm10,YMMWORD[192+rsi]
+ vpxor ymm15,ymm15,YMMWORD[224+rsi]
+ vpxor ymm14,ymm14,YMMWORD[256+rsi]
+ vpxor ymm2,ymm2,YMMWORD[288+rsi]
+ vpxor ymm3,ymm3,YMMWORD[320+rsi]
+ vpxor ymm7,ymm7,YMMWORD[352+rsi]
+ vmovdqu YMMWORD[rdi],ymm6
+ vmovdqu YMMWORD[32+rdi],ymm8
+ vmovdqu YMMWORD[64+rdi],ymm1
+ vmovdqu YMMWORD[96+rdi],ymm5
+ vmovdqu YMMWORD[128+rdi],ymm12
+ vmovdqu YMMWORD[160+rdi],ymm13
+ vmovdqu YMMWORD[192+rdi],ymm10
+ vmovdqu YMMWORD[224+rdi],ymm15
+ vmovdqu YMMWORD[256+rdi],ymm14
+ vmovdqu YMMWORD[288+rdi],ymm2
+ vmovdqu YMMWORD[320+rdi],ymm3
+ vmovdqu YMMWORD[352+rdi],ymm7
+ je NEAR $L$done8x
+
+ lea rsi,[384+rsi]
+ xor r10,r10
+ vmovdqa YMMWORD[rsp],ymm11
+ lea rdi,[384+rdi]
+ sub rdx,384
+ vmovdqa YMMWORD[32+rsp],ymm9
+ jmp NEAR $L$oop_tail8x
+
+ALIGN 32
+$L$448_or_more8x:
+ vpxor ymm6,ymm6,YMMWORD[rsi]
+ vpxor ymm8,ymm8,YMMWORD[32+rsi]
+ vpxor ymm1,ymm1,YMMWORD[64+rsi]
+ vpxor ymm5,ymm5,YMMWORD[96+rsi]
+ vpxor ymm12,ymm12,YMMWORD[128+rsi]
+ vpxor ymm13,ymm13,YMMWORD[160+rsi]
+ vpxor ymm10,ymm10,YMMWORD[192+rsi]
+ vpxor ymm15,ymm15,YMMWORD[224+rsi]
+ vpxor ymm14,ymm14,YMMWORD[256+rsi]
+ vpxor ymm2,ymm2,YMMWORD[288+rsi]
+ vpxor ymm3,ymm3,YMMWORD[320+rsi]
+ vpxor ymm7,ymm7,YMMWORD[352+rsi]
+ vpxor ymm11,ymm11,YMMWORD[384+rsi]
+ vpxor ymm9,ymm9,YMMWORD[416+rsi]
+ vmovdqu YMMWORD[rdi],ymm6
+ vmovdqu YMMWORD[32+rdi],ymm8
+ vmovdqu YMMWORD[64+rdi],ymm1
+ vmovdqu YMMWORD[96+rdi],ymm5
+ vmovdqu YMMWORD[128+rdi],ymm12
+ vmovdqu YMMWORD[160+rdi],ymm13
+ vmovdqu YMMWORD[192+rdi],ymm10
+ vmovdqu YMMWORD[224+rdi],ymm15
+ vmovdqu YMMWORD[256+rdi],ymm14
+ vmovdqu YMMWORD[288+rdi],ymm2
+ vmovdqu YMMWORD[320+rdi],ymm3
+ vmovdqu YMMWORD[352+rdi],ymm7
+ vmovdqu YMMWORD[384+rdi],ymm11
+ vmovdqu YMMWORD[416+rdi],ymm9
+ je NEAR $L$done8x
+
+ lea rsi,[448+rsi]
+ xor r10,r10
+ vmovdqa YMMWORD[rsp],ymm0
+ lea rdi,[448+rdi]
+ sub rdx,448
+ vmovdqa YMMWORD[32+rsp],ymm4
+
+$L$oop_tail8x:
+ movzx eax,BYTE[r10*1+rsi]
+ movzx ecx,BYTE[r10*1+rsp]
+ lea r10,[1+r10]
+ xor eax,ecx
+ mov BYTE[((-1))+r10*1+rdi],al
+ dec rdx
+ jnz NEAR $L$oop_tail8x
+
+$L$done8x:
+ vzeroall
+ movaps xmm6,XMMWORD[((-168))+r9]
+ movaps xmm7,XMMWORD[((-152))+r9]
+ movaps xmm8,XMMWORD[((-136))+r9]
+ movaps xmm9,XMMWORD[((-120))+r9]
+ movaps xmm10,XMMWORD[((-104))+r9]
+ movaps xmm11,XMMWORD[((-88))+r9]
+ movaps xmm12,XMMWORD[((-72))+r9]
+ movaps xmm13,XMMWORD[((-56))+r9]
+ movaps xmm14,XMMWORD[((-40))+r9]
+ movaps xmm15,XMMWORD[((-24))+r9]
+ lea rsp,[r9]
+
+$L$8x_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$SEH_end_ChaCha20_ctr32_avx2:
+EXTERN __imp_RtlVirtualUnwind
+
+ALIGN 16
+se_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ mov rsi,QWORD[8+r9]
+ mov r11,QWORD[56+r9]
+
+ lea r10,[$L$ctr32_body]
+ cmp rbx,r10
+ jb NEAR $L$common_seh_tail
+
+ mov rax,QWORD[152+r8]
+
+ lea r10,[$L$no_data]
+ cmp rbx,r10
+ jae NEAR $L$common_seh_tail
+
+ lea rax,[((64+24+48))+rax]
+
+ mov rbx,QWORD[((-8))+rax]
+ mov rbp,QWORD[((-16))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov r15,QWORD[((-48))+rax]
+ mov QWORD[144+r8],rbx
+ mov QWORD[160+r8],rbp
+ mov QWORD[216+r8],r12
+ mov QWORD[224+r8],r13
+ mov QWORD[232+r8],r14
+ mov QWORD[240+r8],r15
+
+$L$common_seh_tail:
+ mov rdi,QWORD[8+rax]
+ mov rsi,QWORD[16+rax]
+ mov QWORD[152+r8],rax
+ mov QWORD[168+r8],rsi
+ mov QWORD[176+r8],rdi
+
+ mov rdi,QWORD[40+r9]
+ mov rsi,r8
+ mov ecx,154
+ DD 0xa548f3fc
+
+ mov rsi,r9
+ xor rcx,rcx
+ mov rdx,QWORD[8+rsi]
+ mov r8,QWORD[rsi]
+ mov r9,QWORD[16+rsi]
+ mov r10,QWORD[40+rsi]
+ lea r11,[56+rsi]
+ lea r12,[24+rsi]
+ mov QWORD[32+rsp],r10
+ mov QWORD[40+rsp],r11
+ mov QWORD[48+rsp],r12
+ mov QWORD[56+rsp],rcx
+ call QWORD[__imp_RtlVirtualUnwind]
+
+ mov eax,1
+ add rsp,64
+ popfq
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ ret
+
+
+
+ALIGN 16
+ssse3_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ mov rsi,QWORD[8+r9]
+ mov r11,QWORD[56+r9]
+
+ mov r10d,DWORD[r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$common_seh_tail
+
+ mov rax,QWORD[192+r8]
+
+ mov r10d,DWORD[4+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$common_seh_tail
+
+ lea rsi,[((-40))+rax]
+ lea rdi,[512+r8]
+ mov ecx,4
+ DD 0xa548f3fc
+
+ jmp NEAR $L$common_seh_tail
+
+
+
+ALIGN 16
+full_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ mov rsi,QWORD[8+r9]
+ mov r11,QWORD[56+r9]
+
+ mov r10d,DWORD[r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$common_seh_tail
+
+ mov rax,QWORD[192+r8]
+
+ mov r10d,DWORD[4+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$common_seh_tail
+
+ lea rsi,[((-168))+rax]
+ lea rdi,[512+r8]
+ mov ecx,20
+ DD 0xa548f3fc
+
+ jmp NEAR $L$common_seh_tail
+
+
+section .pdata rdata align=4
+ALIGN 4
+ DD $L$SEH_begin_ChaCha20_ctr32_nohw wrt ..imagebase
+ DD $L$SEH_end_ChaCha20_ctr32_nohw wrt ..imagebase
+ DD $L$SEH_info_ChaCha20_ctr32_nohw wrt ..imagebase
+
+ DD $L$SEH_begin_ChaCha20_ctr32_ssse3 wrt ..imagebase
+ DD $L$SEH_end_ChaCha20_ctr32_ssse3 wrt ..imagebase
+ DD $L$SEH_info_ChaCha20_ctr32_ssse3 wrt ..imagebase
+
+ DD $L$SEH_begin_ChaCha20_ctr32_ssse3_4x wrt ..imagebase
+ DD $L$SEH_end_ChaCha20_ctr32_ssse3_4x wrt ..imagebase
+ DD $L$SEH_info_ChaCha20_ctr32_ssse3_4x wrt ..imagebase
+ DD $L$SEH_begin_ChaCha20_ctr32_avx2 wrt ..imagebase
+ DD $L$SEH_end_ChaCha20_ctr32_avx2 wrt ..imagebase
+ DD $L$SEH_info_ChaCha20_ctr32_avx2 wrt ..imagebase
+section .xdata rdata align=8
+ALIGN 8
+$L$SEH_info_ChaCha20_ctr32_nohw:
+ DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+
+$L$SEH_info_ChaCha20_ctr32_ssse3:
+ DB 9,0,0,0
+ DD ssse3_handler wrt ..imagebase
+ DD $L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase
+
+$L$SEH_info_ChaCha20_ctr32_ssse3_4x:
+ DB 9,0,0,0
+ DD full_handler wrt ..imagebase
+ DD $L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase
+$L$SEH_info_ChaCha20_ctr32_avx2:
+ DB 9,0,0,0
+ DD full_handler wrt ..imagebase
+ DD $L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/crypto/chacha20_poly1305_armv8-apple.S b/gen/crypto/chacha20_poly1305_armv8-apple.S
new file mode 100644
index 0000000..04a1e22
--- /dev/null
+++ b/gen/crypto/chacha20_poly1305_armv8-apple.S
@@ -0,0 +1,3009 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+.section __TEXT,__const
+
+.align 7
+Lchacha20_consts:
+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+Linc:
+.long 1,2,3,4
+Lrol8:
+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+Lclamp:
+.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
+
+.text
+
+
+.align 6
+Lpoly_hash_ad_internal:
+.cfi_startproc
+ cbnz x4, Lpoly_hash_intro
+ ret
+
+Lpoly_hash_intro:
+ cmp x4, #16
+ b.lt Lpoly_hash_ad_tail
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ sub x4, x4, #16
+ b Lpoly_hash_ad_internal
+
+Lpoly_hash_ad_tail:
+ cbz x4, Lpoly_hash_ad_ret
+
+ eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD
+ sub x4, x4, #1
+
+Lpoly_hash_tail_16_compose:
+ ext v20.16b, v20.16b, v20.16b, #15
+ ldrb w11, [x3, x4]
+ mov v20.b[0], w11
+ subs x4, x4, #1
+ b.ge Lpoly_hash_tail_16_compose
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+
+Lpoly_hash_ad_ret:
+ ret
+.cfi_endproc
+
+
+/////////////////////////////////
+//
+// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data);
+//
+.globl _chacha20_poly1305_seal
+.private_extern _chacha20_poly1305_seal
+
+.align 6
+_chacha20_poly1305_seal:
+ AARCH64_SIGN_LINK_REGISTER
+.cfi_startproc
+ stp x29, x30, [sp, #-80]!
+.cfi_def_cfa_offset 80
+.cfi_offset w30, -72
+.cfi_offset w29, -80
+ mov x29, sp
+ // We probably could do .cfi_def_cfa w29, 80 at this point, but since
+ // we don't actually use the frame pointer like that, it's probably not
+ // worth bothering.
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+.cfi_offset b15, -8
+.cfi_offset b14, -16
+.cfi_offset b13, -24
+.cfi_offset b12, -32
+.cfi_offset b11, -40
+.cfi_offset b10, -48
+.cfi_offset b9, -56
+.cfi_offset b8, -64
+
+ adrp x11, Lchacha20_consts@PAGE
+ add x11, x11, Lchacha20_consts@PAGEOFF
+
+ ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
+ ld1 {v28.16b - v30.16b}, [x5]
+
+ mov x15, #1 // Prepare the Poly1305 state
+ mov x8, #0
+ mov x9, #0
+ mov x10, #0
+
+ ldr x12, [x5, #56] // The total cipher text length includes extra_in_len
+ add x12, x12, x2
+ mov v31.d[0], x4 // Store the input and aad lengths
+ mov v31.d[1], x12
+
+ cmp x2, #128
+ b.le Lseal_128 // Optimization for smaller buffers
+
+ // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext,
+ // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically,
+ // the fifth block (A4-D4) horizontally.
+ ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
+ mov v4.16b, v24.16b
+
+ ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
+ mov v9.16b, v28.16b
+
+ ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
+ mov v14.16b, v29.16b
+
+ ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
+ add v15.4s, v15.4s, v25.4s
+ mov v19.16b, v30.16b
+
+ sub x5, x5, #32
+
+ mov x6, #10
+
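+ // Each iteration of the loop below performs one ChaCha20 double round on all five
+ // blocks: the 16-bit rotate is done with rev32 on .8h lanes, the 12- and 7-bit
+ // rotates with ushr+sli pairs, and the 8-bit rotate with tbl against the Lrol8
+ // index vector.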
+.align 5
+Lseal_init_rounds:
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v8.4s
+ add v4.4s, v4.4s, v9.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+ rev32 v18.8h, v18.8h
+ rev32 v19.8h, v19.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ add v13.4s, v13.4s, v18.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ eor v8.16b, v8.16b, v13.16b
+ eor v9.16b, v9.16b, v14.16b
+
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+ ushr v7.4s, v8.4s, #20
+ sli v7.4s, v8.4s, #12
+ ushr v8.4s, v9.4s, #20
+ sli v8.4s, v9.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ add v3.4s, v3.4s, v7.4s
+ add v4.4s, v4.4s, v8.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+ tbl v18.16b, {v18.16b}, v26.16b
+ tbl v19.16b, {v19.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ add v13.4s, v13.4s, v18.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ eor v7.16b, v7.16b, v13.16b
+ eor v8.16b, v8.16b, v14.16b
+
+ ushr v9.4s, v8.4s, #25
+ sli v9.4s, v8.4s, #7
+ ushr v8.4s, v7.4s, #25
+ sli v8.4s, v7.4s, #7
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v9.16b, v9.16b, v9.16b, #4
+ ext v14.16b, v14.16b, v14.16b, #8
+ ext v19.16b, v19.16b, v19.16b, #12
+ add v0.4s, v0.4s, v6.4s
+ add v1.4s, v1.4s, v7.4s
+ add v2.4s, v2.4s, v8.4s
+ add v3.4s, v3.4s, v5.4s
+ add v4.4s, v4.4s, v9.4s
+
+ eor v18.16b, v18.16b, v0.16b
+ eor v15.16b, v15.16b, v1.16b
+ eor v16.16b, v16.16b, v2.16b
+ eor v17.16b, v17.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ rev32 v18.8h, v18.8h
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+ rev32 v19.8h, v19.8h
+
+ add v12.4s, v12.4s, v18.4s
+ add v13.4s, v13.4s, v15.4s
+ add v10.4s, v10.4s, v16.4s
+ add v11.4s, v11.4s, v17.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v6.16b, v6.16b, v12.16b
+ eor v7.16b, v7.16b, v13.16b
+ eor v8.16b, v8.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v9.16b, v9.16b, v14.16b
+
+ ushr v20.4s, v6.4s, #20
+ sli v20.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+ ushr v7.4s, v8.4s, #20
+ sli v7.4s, v8.4s, #12
+ ushr v8.4s, v5.4s, #20
+ sli v8.4s, v5.4s, #12
+ ushr v5.4s, v9.4s, #20
+ sli v5.4s, v9.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v8.4s
+ add v4.4s, v4.4s, v5.4s
+
+ eor v18.16b, v18.16b, v0.16b
+ eor v15.16b, v15.16b, v1.16b
+ eor v16.16b, v16.16b, v2.16b
+ eor v17.16b, v17.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ tbl v18.16b, {v18.16b}, v26.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+ tbl v19.16b, {v19.16b}, v26.16b
+
+ add v12.4s, v12.4s, v18.4s
+ add v13.4s, v13.4s, v15.4s
+ add v10.4s, v10.4s, v16.4s
+ add v11.4s, v11.4s, v17.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v20.16b, v20.16b, v12.16b
+ eor v6.16b, v6.16b, v13.16b
+ eor v7.16b, v7.16b, v10.16b
+ eor v8.16b, v8.16b, v11.16b
+ eor v5.16b, v5.16b, v14.16b
+
+ ushr v9.4s, v5.4s, #25
+ sli v9.4s, v5.4s, #7
+ ushr v5.4s, v8.4s, #25
+ sli v5.4s, v8.4s, #7
+ ushr v8.4s, v7.4s, #25
+ sli v8.4s, v7.4s, #7
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v20.4s, #25
+ sli v6.4s, v20.4s, #7
+
+ ext v9.16b, v9.16b, v9.16b, #12
+ ext v14.16b, v14.16b, v14.16b, #8
+ ext v19.16b, v19.16b, v19.16b, #4
+ subs x6, x6, #1
+ b.hi Lseal_init_rounds
+
+ add v15.4s, v15.4s, v25.4s
+ mov x11, #4
+ dup v20.4s, w11
+ add v25.4s, v25.4s, v20.4s
+
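+ // The zip1/zip2 pairs below transpose the four vertically-computed blocks back into
+ // per-block order, so v0/v5/v10/v15 through v3/v8/v13/v18 each hold 64 contiguous
+ // bytes of keystream once the initial state words are added back in.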
+ zip1 v20.4s, v0.4s, v1.4s
+ zip2 v21.4s, v0.4s, v1.4s
+ zip1 v22.4s, v2.4s, v3.4s
+ zip2 v23.4s, v2.4s, v3.4s
+
+ zip1 v0.2d, v20.2d, v22.2d
+ zip2 v1.2d, v20.2d, v22.2d
+ zip1 v2.2d, v21.2d, v23.2d
+ zip2 v3.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v5.4s, v6.4s
+ zip2 v21.4s, v5.4s, v6.4s
+ zip1 v22.4s, v7.4s, v8.4s
+ zip2 v23.4s, v7.4s, v8.4s
+
+ zip1 v5.2d, v20.2d, v22.2d
+ zip2 v6.2d, v20.2d, v22.2d
+ zip1 v7.2d, v21.2d, v23.2d
+ zip2 v8.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v10.4s, v11.4s
+ zip2 v21.4s, v10.4s, v11.4s
+ zip1 v22.4s, v12.4s, v13.4s
+ zip2 v23.4s, v12.4s, v13.4s
+
+ zip1 v10.2d, v20.2d, v22.2d
+ zip2 v11.2d, v20.2d, v22.2d
+ zip1 v12.2d, v21.2d, v23.2d
+ zip2 v13.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v15.4s, v16.4s
+ zip2 v21.4s, v15.4s, v16.4s
+ zip1 v22.4s, v17.4s, v18.4s
+ zip2 v23.4s, v17.4s, v18.4s
+
+ zip1 v15.2d, v20.2d, v22.2d
+ zip2 v16.2d, v20.2d, v22.2d
+ zip1 v17.2d, v21.2d, v23.2d
+ zip2 v18.2d, v21.2d, v23.2d
+
+ add v4.4s, v4.4s, v24.4s
+ add v9.4s, v9.4s, v28.4s
+ and v4.16b, v4.16b, v27.16b
+
+ add v0.4s, v0.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+ add v10.4s, v10.4s, v29.4s
+ add v15.4s, v15.4s, v30.4s
+
+ add v1.4s, v1.4s, v24.4s
+ add v6.4s, v6.4s, v28.4s
+ add v11.4s, v11.4s, v29.4s
+ add v16.4s, v16.4s, v30.4s
+
+ add v2.4s, v2.4s, v24.4s
+ add v7.4s, v7.4s, v28.4s
+ add v12.4s, v12.4s, v29.4s
+ add v17.4s, v17.4s, v30.4s
+
+ add v3.4s, v3.4s, v24.4s
+ add v8.4s, v8.4s, v28.4s
+ add v13.4s, v13.4s, v29.4s
+ add v18.4s, v18.4s, v30.4s
+
+ mov x16, v4.d[0] // Move the R key to GPRs
+ mov x17, v4.d[1]
+ mov v27.16b, v9.16b // Store the S key
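+ // The first 32 bytes of the horizontally-computed block (A4-D4) become the one-time
+ // Poly1305 key: r is clamped with Lclamp and moved to x16/x17, and s stays in v27
+ // until the tag is computed.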
+
+ bl Lpoly_hash_ad_internal
+
+ mov x3, x0
+ cmp x2, #256
+ b.le Lseal_tail
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v0.16b
+ eor v21.16b, v21.16b, v5.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v15.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v1.16b
+ eor v21.16b, v21.16b, v6.16b
+ eor v22.16b, v22.16b, v11.16b
+ eor v23.16b, v23.16b, v16.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v2.16b
+ eor v21.16b, v21.16b, v7.16b
+ eor v22.16b, v22.16b, v12.16b
+ eor v23.16b, v23.16b, v17.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v3.16b
+ eor v21.16b, v21.16b, v8.16b
+ eor v22.16b, v22.16b, v13.16b
+ eor v23.16b, v23.16b, v18.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #256
+
+ mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds
+ mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256
+
+Lseal_main_loop:
+ adrp x11, Lchacha20_consts@PAGE
+ add x11, x11, Lchacha20_consts@PAGEOFF
+
+ ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
+ mov v4.16b, v24.16b
+
+ ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
+ mov v9.16b, v28.16b
+
+ ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
+ mov v14.16b, v29.16b
+
+ ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
+ add v15.4s, v15.4s, v25.4s
+ mov v19.16b, v30.16b
+
+ eor v20.16b, v20.16b, v20.16b //zero
+ not v21.16b, v20.16b // -1
+ sub v21.4s, v25.4s, v21.4s // Add +1
+ ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+ add v19.4s, v19.4s, v20.4s
+
+ sub x5, x5, #32
+.align 5
+Lseal_main_loop_rounds:
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v8.4s
+ add v4.4s, v4.4s, v9.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+ rev32 v18.8h, v18.8h
+ rev32 v19.8h, v19.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ add v13.4s, v13.4s, v18.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ eor v8.16b, v8.16b, v13.16b
+ eor v9.16b, v9.16b, v14.16b
+
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+ ushr v7.4s, v8.4s, #20
+ sli v7.4s, v8.4s, #12
+ ushr v8.4s, v9.4s, #20
+ sli v8.4s, v9.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ add v3.4s, v3.4s, v7.4s
+ add v4.4s, v4.4s, v8.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+ tbl v18.16b, {v18.16b}, v26.16b
+ tbl v19.16b, {v19.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ add v13.4s, v13.4s, v18.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ eor v7.16b, v7.16b, v13.16b
+ eor v8.16b, v8.16b, v14.16b
+
+ ushr v9.4s, v8.4s, #25
+ sli v9.4s, v8.4s, #7
+ ushr v8.4s, v7.4s, #25
+ sli v8.4s, v7.4s, #7
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v9.16b, v9.16b, v9.16b, #4
+ ext v14.16b, v14.16b, v14.16b, #8
+ ext v19.16b, v19.16b, v19.16b, #12
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ add v0.4s, v0.4s, v6.4s
+ add v1.4s, v1.4s, v7.4s
+ add v2.4s, v2.4s, v8.4s
+ add v3.4s, v3.4s, v5.4s
+ add v4.4s, v4.4s, v9.4s
+
+ eor v18.16b, v18.16b, v0.16b
+ eor v15.16b, v15.16b, v1.16b
+ eor v16.16b, v16.16b, v2.16b
+ eor v17.16b, v17.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ rev32 v18.8h, v18.8h
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+ rev32 v19.8h, v19.8h
+
+ add v12.4s, v12.4s, v18.4s
+ add v13.4s, v13.4s, v15.4s
+ add v10.4s, v10.4s, v16.4s
+ add v11.4s, v11.4s, v17.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v6.16b, v6.16b, v12.16b
+ eor v7.16b, v7.16b, v13.16b
+ eor v8.16b, v8.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v9.16b, v9.16b, v14.16b
+
+ ushr v20.4s, v6.4s, #20
+ sli v20.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+ ushr v7.4s, v8.4s, #20
+ sli v7.4s, v8.4s, #12
+ ushr v8.4s, v5.4s, #20
+ sli v8.4s, v5.4s, #12
+ ushr v5.4s, v9.4s, #20
+ sli v5.4s, v9.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v8.4s
+ add v4.4s, v4.4s, v5.4s
+
+ eor v18.16b, v18.16b, v0.16b
+ eor v15.16b, v15.16b, v1.16b
+ eor v16.16b, v16.16b, v2.16b
+ eor v17.16b, v17.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ tbl v18.16b, {v18.16b}, v26.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+ tbl v19.16b, {v19.16b}, v26.16b
+
+ add v12.4s, v12.4s, v18.4s
+ add v13.4s, v13.4s, v15.4s
+ add v10.4s, v10.4s, v16.4s
+ add v11.4s, v11.4s, v17.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v20.16b, v20.16b, v12.16b
+ eor v6.16b, v6.16b, v13.16b
+ eor v7.16b, v7.16b, v10.16b
+ eor v8.16b, v8.16b, v11.16b
+ eor v5.16b, v5.16b, v14.16b
+
+ ushr v9.4s, v5.4s, #25
+ sli v9.4s, v5.4s, #7
+ ushr v5.4s, v8.4s, #25
+ sli v5.4s, v8.4s, #7
+ ushr v8.4s, v7.4s, #25
+ sli v8.4s, v7.4s, #7
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v20.4s, #25
+ sli v6.4s, v20.4s, #7
+
+ ext v9.16b, v9.16b, v9.16b, #12
+ ext v14.16b, v14.16b, v14.16b, #8
+ ext v19.16b, v19.16b, v19.16b, #4
+ subs x6, x6, #1
+ b.ge Lseal_main_loop_rounds
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ subs x7, x7, #1
+ b.gt Lseal_main_loop_rounds
+
+ eor v20.16b, v20.16b, v20.16b //zero
+ not v21.16b, v20.16b // -1
+ sub v21.4s, v25.4s, v21.4s // Add +1
+ ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+ add v19.4s, v19.4s, v20.4s
+
+ add v15.4s, v15.4s, v25.4s
+ mov x11, #5
+ dup v20.4s, w11
+ add v25.4s, v25.4s, v20.4s
+
+ zip1 v20.4s, v0.4s, v1.4s
+ zip2 v21.4s, v0.4s, v1.4s
+ zip1 v22.4s, v2.4s, v3.4s
+ zip2 v23.4s, v2.4s, v3.4s
+
+ zip1 v0.2d, v20.2d, v22.2d
+ zip2 v1.2d, v20.2d, v22.2d
+ zip1 v2.2d, v21.2d, v23.2d
+ zip2 v3.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v5.4s, v6.4s
+ zip2 v21.4s, v5.4s, v6.4s
+ zip1 v22.4s, v7.4s, v8.4s
+ zip2 v23.4s, v7.4s, v8.4s
+
+ zip1 v5.2d, v20.2d, v22.2d
+ zip2 v6.2d, v20.2d, v22.2d
+ zip1 v7.2d, v21.2d, v23.2d
+ zip2 v8.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v10.4s, v11.4s
+ zip2 v21.4s, v10.4s, v11.4s
+ zip1 v22.4s, v12.4s, v13.4s
+ zip2 v23.4s, v12.4s, v13.4s
+
+ zip1 v10.2d, v20.2d, v22.2d
+ zip2 v11.2d, v20.2d, v22.2d
+ zip1 v12.2d, v21.2d, v23.2d
+ zip2 v13.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v15.4s, v16.4s
+ zip2 v21.4s, v15.4s, v16.4s
+ zip1 v22.4s, v17.4s, v18.4s
+ zip2 v23.4s, v17.4s, v18.4s
+
+ zip1 v15.2d, v20.2d, v22.2d
+ zip2 v16.2d, v20.2d, v22.2d
+ zip1 v17.2d, v21.2d, v23.2d
+ zip2 v18.2d, v21.2d, v23.2d
+
+ add v0.4s, v0.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+ add v10.4s, v10.4s, v29.4s
+ add v15.4s, v15.4s, v30.4s
+
+ add v1.4s, v1.4s, v24.4s
+ add v6.4s, v6.4s, v28.4s
+ add v11.4s, v11.4s, v29.4s
+ add v16.4s, v16.4s, v30.4s
+
+ add v2.4s, v2.4s, v24.4s
+ add v7.4s, v7.4s, v28.4s
+ add v12.4s, v12.4s, v29.4s
+ add v17.4s, v17.4s, v30.4s
+
+ add v3.4s, v3.4s, v24.4s
+ add v8.4s, v8.4s, v28.4s
+ add v13.4s, v13.4s, v29.4s
+ add v18.4s, v18.4s, v30.4s
+
+ add v4.4s, v4.4s, v24.4s
+ add v9.4s, v9.4s, v28.4s
+ add v14.4s, v14.4s, v29.4s
+ add v19.4s, v19.4s, v30.4s
+
+ cmp x2, #320
+ b.le Lseal_tail
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v0.16b
+ eor v21.16b, v21.16b, v5.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v15.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v1.16b
+ eor v21.16b, v21.16b, v6.16b
+ eor v22.16b, v22.16b, v11.16b
+ eor v23.16b, v23.16b, v16.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v2.16b
+ eor v21.16b, v21.16b, v7.16b
+ eor v22.16b, v22.16b, v12.16b
+ eor v23.16b, v23.16b, v17.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v3.16b
+ eor v21.16b, v21.16b, v8.16b
+ eor v22.16b, v22.16b, v13.16b
+ eor v23.16b, v23.16b, v18.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v4.16b
+ eor v21.16b, v21.16b, v9.16b
+ eor v22.16b, v22.16b, v14.16b
+ eor v23.16b, v23.16b, v19.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #320
+
+ mov x6, #0
+ mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration
+
+ b Lseal_main_loop
+
+Lseal_tail:
+ // This part of the function handles the storage and authentication of the last [0,320) bytes
+ // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data.
+ cmp x2, #64
+ b.lt Lseal_tail_64
+
+ // Store and authenticate 64B blocks per iteration
+ ld1 {v20.16b - v23.16b}, [x1], #64
+
+ eor v20.16b, v20.16b, v0.16b
+ eor v21.16b, v21.16b, v5.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v15.16b
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ mov x11, v21.d[0]
+ mov x12, v21.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ mov x11, v22.d[0]
+ mov x12, v22.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ mov x11, v23.d[0]
+ mov x12, v23.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ st1 {v20.16b - v23.16b}, [x0], #64
+ sub x2, x2, #64
+
+ // Shift the state left by 64 bytes for the next iteration of the loop
+ mov v0.16b, v1.16b
+ mov v5.16b, v6.16b
+ mov v10.16b, v11.16b
+ mov v15.16b, v16.16b
+
+ mov v1.16b, v2.16b
+ mov v6.16b, v7.16b
+ mov v11.16b, v12.16b
+ mov v16.16b, v17.16b
+
+ mov v2.16b, v3.16b
+ mov v7.16b, v8.16b
+ mov v12.16b, v13.16b
+ mov v17.16b, v18.16b
+
+ mov v3.16b, v4.16b
+ mov v8.16b, v9.16b
+ mov v13.16b, v14.16b
+ mov v18.16b, v19.16b
+
+ b Lseal_tail
+
+Lseal_tail_64:
+ ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr
+
+ // Here we handle the last [0,64) bytes of plaintext
+ cmp x2, #16
+ b.lt Lseal_tail_16
+ // Each iteration encrypts and authenticates a 16B block
+ ld1 {v20.16b}, [x1], #16
+ eor v20.16b, v20.16b, v0.16b
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ st1 {v20.16b}, [x0], #16
+
+ sub x2, x2, #16
+
+ // Shift the state left by 16 bytes for the next iteration of the loop
+ mov v0.16b, v5.16b
+ mov v5.16b, v10.16b
+ mov v10.16b, v15.16b
+
+ b Lseal_tail_64
+
+Lseal_tail_16:
+ // Here we handle the last [0,16) bytes of ciphertext that require a padded block
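+ // The block is assembled back to front: the remaining plaintext bytes land in the low
+ // bytes of v20 and are padded with the leading bytes of extra_in (if any), so Poly1305
+ // sees the final ciphertext bytes concatenated with extra_in. v21 accumulates a byte
+ // mask so that only the plaintext positions are encrypted.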
+ cbz x2, Lseal_hash_extra
+
+ eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in
+ eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes
+ not v22.16b, v20.16b
+
+ mov x6, x2
+ add x1, x1, x2
+
+ cbz x4, Lseal_tail_16_compose // No extra data to pad with, zero padding
+
+ mov x7, #16 // We need to load some extra_in first for padding
+ sub x7, x7, x2
+ cmp x4, x7
+ csel x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register
+ mov x12, x7
+ add x3, x3, x7
+ sub x4, x4, x7
+
+Lseal_tail16_compose_extra_in:
+ ext v20.16b, v20.16b, v20.16b, #15
+ ldrb w11, [x3, #-1]!
+ mov v20.b[0], w11
+ subs x7, x7, #1
+ b.gt Lseal_tail16_compose_extra_in
+
+ add x3, x3, x12
+
+Lseal_tail_16_compose:
+ ext v20.16b, v20.16b, v20.16b, #15
+ ldrb w11, [x1, #-1]!
+ mov v20.b[0], w11
+ ext v21.16b, v22.16b, v21.16b, #15
+ subs x2, x2, #1
+ b.gt Lseal_tail_16_compose
+
+ and v0.16b, v0.16b, v21.16b
+ eor v20.16b, v20.16b, v0.16b
+ mov v21.16b, v20.16b
+
+Lseal_tail_16_store:
+ umov w11, v20.b[0]
+ strb w11, [x0], #1
+ ext v20.16b, v20.16b, v20.16b, #1
+ subs x6, x6, #1
+ b.gt Lseal_tail_16_store
+
+ // Hash in the final ct block concatenated with extra_in
+ mov x11, v21.d[0]
+ mov x12, v21.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+
+Lseal_hash_extra:
+ cbz x4, Lseal_finalize
+
+Lseal_hash_extra_loop:
+ cmp x4, #16
+ b.lt Lseal_hash_extra_tail
+ ld1 {v20.16b}, [x3], #16
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ sub x4, x4, #16
+ b Lseal_hash_extra_loop
+
+Lseal_hash_extra_tail:
+ cbz x4, Lseal_finalize
+ eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext
+ add x3, x3, x4
+
+Lseal_hash_extra_load:
+ ext v20.16b, v20.16b, v20.16b, #15
+ ldrb w11, [x3, #-1]!
+ mov v20.b[0], w11
+ subs x4, x4, #1
+ b.gt Lseal_hash_extra_load
+
+ // Hash in the final padded extra_in block
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+
+Lseal_finalize:
+ mov x11, v31.d[0]
+ mov x12, v31.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ // Final reduction step
+ sub x12, xzr, x15
+ orr x13, xzr, #3
+ subs x11, x8, #-5
+ sbcs x12, x9, x12
+ sbcs x13, x10, x13
+ csel x8, x11, x8, cs
+ csel x9, x12, x9, cs
+ csel x10, x13, x10, cs
+ mov x11, v27.d[0]
+ mov x12, v27.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+
+ stp x8, x9, [x5]
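+ // The accumulator was reduced mod p = 2^130 - 5 by conditionally subtracting p, and
+ // the s half of the key (saved in v27) was added; the 16 bytes stored at [x5] are the
+ // Poly1305 tag returned through seal_data.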
+
+ ldp d8, d9, [sp, #16]
+ ldp d10, d11, [sp, #32]
+ ldp d12, d13, [sp, #48]
+ ldp d14, d15, [sp, #64]
+.cfi_restore b15
+.cfi_restore b14
+.cfi_restore b13
+.cfi_restore b12
+.cfi_restore b11
+.cfi_restore b10
+.cfi_restore b9
+.cfi_restore b8
+ ldp x29, x30, [sp], 80
+.cfi_restore w29
+.cfi_restore w30
+.cfi_def_cfa_offset 0
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+Lseal_128:
+ // On some architectures preparing 5 blocks for small buffers is wasteful
+ eor v25.16b, v25.16b, v25.16b
+ mov x11, #1
+ mov v25.s[0], w11
+ mov v0.16b, v24.16b
+ mov v1.16b, v24.16b
+ mov v2.16b, v24.16b
+ mov v5.16b, v28.16b
+ mov v6.16b, v28.16b
+ mov v7.16b, v28.16b
+ mov v10.16b, v29.16b
+ mov v11.16b, v29.16b
+ mov v12.16b, v29.16b
+ mov v17.16b, v30.16b
+ add v15.4s, v17.4s, v25.4s
+ add v16.4s, v15.4s, v25.4s
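+ // Only three ChaCha20 blocks are generated here: the counter-0 block (v2/v7/v12/v17)
+ // supplies the Poly1305 R and S keys, and the counter-1 and counter-2 blocks cover up
+ // to 128 bytes of plaintext.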
+
+ mov x6, #10
+
+Lseal_128_rounds:
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v5.16b, v5.16b, v5.16b, #4
+ ext v6.16b, v6.16b, v6.16b, #4
+ ext v7.16b, v7.16b, v7.16b, #4
+
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v12.16b, v12.16b, v12.16b, #8
+
+ ext v15.16b, v15.16b, v15.16b, #12
+ ext v16.16b, v16.16b, v16.16b, #12
+ ext v17.16b, v17.16b, v17.16b, #12
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v5.16b, v5.16b, v5.16b, #12
+ ext v6.16b, v6.16b, v6.16b, #12
+ ext v7.16b, v7.16b, v7.16b, #12
+
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v12.16b, v12.16b, v12.16b, #8
+
+ ext v15.16b, v15.16b, v15.16b, #4
+ ext v16.16b, v16.16b, v16.16b, #4
+ ext v17.16b, v17.16b, v17.16b, #4
+ subs x6, x6, #1
+ b.hi Lseal_128_rounds
+
+ add v0.4s, v0.4s, v24.4s
+ add v1.4s, v1.4s, v24.4s
+ add v2.4s, v2.4s, v24.4s
+
+ add v5.4s, v5.4s, v28.4s
+ add v6.4s, v6.4s, v28.4s
+ add v7.4s, v7.4s, v28.4s
+
+ // Only the first 32 bytes of the third block (counter = 0) are needed,
+ // so skip updating v12 and v17.
+ add v10.4s, v10.4s, v29.4s
+ add v11.4s, v11.4s, v29.4s
+
+ add v30.4s, v30.4s, v25.4s
+ add v15.4s, v15.4s, v30.4s
+ add v30.4s, v30.4s, v25.4s
+ add v16.4s, v16.4s, v30.4s
+
+ and v2.16b, v2.16b, v27.16b
+ mov x16, v2.d[0] // Move the R key to GPRs
+ mov x17, v2.d[1]
+ mov v27.16b, v7.16b // Store the S key
+
+ bl Lpoly_hash_ad_internal
+ b Lseal_tail
+.cfi_endproc
+
+
+/////////////////////////////////
+//
+// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data);
+//
+.globl _chacha20_poly1305_open
+.private_extern _chacha20_poly1305_open
+
+.align 6
+_chacha20_poly1305_open:
+ AARCH64_SIGN_LINK_REGISTER
+.cfi_startproc
+ stp x29, x30, [sp, #-80]!
+.cfi_def_cfa_offset 80
+.cfi_offset w30, -72
+.cfi_offset w29, -80
+ mov x29, sp
+ // We probably could do .cfi_def_cfa w29, 80 at this point, but since
+ // we don't actually use the frame pointer like that, it's probably not
+ // worth bothering.
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+.cfi_offset b15, -8
+.cfi_offset b14, -16
+.cfi_offset b13, -24
+.cfi_offset b12, -32
+.cfi_offset b11, -40
+.cfi_offset b10, -48
+.cfi_offset b9, -56
+.cfi_offset b8, -64
+
+ adrp x11, Lchacha20_consts@PAGE
+ add x11, x11, Lchacha20_consts@PAGEOFF
+
+ ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
+ ld1 {v28.16b - v30.16b}, [x5]
+
+ mov x15, #1 // Prepare the Poly1305 state
+ mov x8, #0
+ mov x9, #0
+ mov x10, #0
+
+ mov v31.d[0], x4 // Store the input and aad lengths
+ mov v31.d[1], x2
+
+ cmp x2, #128
+ b.le Lopen_128 // Optimization for smaller buffers
+
+ // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys
+ mov v0.16b, v24.16b
+ mov v5.16b, v28.16b
+ mov v10.16b, v29.16b
+ mov v15.16b, v30.16b
+
+ mov x6, #10
+
+.align 5
+Lopen_init_rounds:
+ add v0.4s, v0.4s, v5.4s
+ eor v15.16b, v15.16b, v0.16b
+ rev32 v15.8h, v15.8h
+
+ add v10.4s, v10.4s, v15.4s
+ eor v5.16b, v5.16b, v10.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ add v0.4s, v0.4s, v20.4s
+ eor v15.16b, v15.16b, v0.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ eor v20.16b, v20.16b, v10.16b
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+ ext v5.16b, v5.16b, v5.16b, #4
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v15.16b, v15.16b, v15.16b, #12
+ add v0.4s, v0.4s, v5.4s
+ eor v15.16b, v15.16b, v0.16b
+ rev32 v15.8h, v15.8h
+
+ add v10.4s, v10.4s, v15.4s
+ eor v5.16b, v5.16b, v10.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ add v0.4s, v0.4s, v20.4s
+ eor v15.16b, v15.16b, v0.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ eor v20.16b, v20.16b, v10.16b
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+ ext v5.16b, v5.16b, v5.16b, #12
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v15.16b, v15.16b, v15.16b, #4
+ subs x6, x6, #1
+ b.hi Lopen_init_rounds
+
+ add v0.4s, v0.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+
+ and v0.16b, v0.16b, v27.16b
+ mov x16, v0.d[0] // Move the R key to GPRs
+ mov x17, v0.d[1]
+ mov v27.16b, v5.16b // Store the S key
+
+ bl Lpoly_hash_ad_internal
+
+Lopen_ad_done:
+ mov x3, x1
+
+// Each iteration of the loop hashes 320 bytes and prepares the stream for 320 bytes
+Lopen_main_loop:
+
+ cmp x2, #192
+ b.lt Lopen_tail
+
+ adrp x11, Lchacha20_consts@PAGE
+ add x11, x11, Lchacha20_consts@PAGEOFF
+
+ ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
+ mov v4.16b, v24.16b
+
+ ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
+ mov v9.16b, v28.16b
+
+ ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
+ mov v14.16b, v29.16b
+
+ ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
+ sub x5, x5, #32
+ add v15.4s, v15.4s, v25.4s
+ mov v19.16b, v30.16b
+
+ eor v20.16b, v20.16b, v20.16b //zero
+ not v21.16b, v20.16b // -1
+ sub v21.4s, v25.4s, v21.4s // Add +1
+ ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+ add v19.4s, v19.4s, v20.4s
+
+ lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12
+ sub x4, x4, #10
+
+ mov x7, #10
+ subs x6, x7, x4
+ subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash
+ csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full
+
+ cbz x7, Lopen_main_loop_rounds_short
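+
+ // When opening, Poly1305 runs over the ciphertext (x3 tracks the input pointer), so
+ // hashing is interleaved with keystream generation: iterations entering at
+ // Lopen_main_loop_rounds absorb two 16-byte blocks per double round, while
+ // Lopen_main_loop_rounds_short absorbs one, so the hash keeps pace with this
+ // 320-byte chunk of keystream.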
+
+.align 5
+Lopen_main_loop_rounds:
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+Lopen_main_loop_rounds_short:
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v8.4s
+ add v4.4s, v4.4s, v9.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+ rev32 v18.8h, v18.8h
+ rev32 v19.8h, v19.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ add v13.4s, v13.4s, v18.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ eor v8.16b, v8.16b, v13.16b
+ eor v9.16b, v9.16b, v14.16b
+
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+ ushr v7.4s, v8.4s, #20
+ sli v7.4s, v8.4s, #12
+ ushr v8.4s, v9.4s, #20
+ sli v8.4s, v9.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ add v3.4s, v3.4s, v7.4s
+ add v4.4s, v4.4s, v8.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+ tbl v18.16b, {v18.16b}, v26.16b
+ tbl v19.16b, {v19.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ add v13.4s, v13.4s, v18.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ eor v7.16b, v7.16b, v13.16b
+ eor v8.16b, v8.16b, v14.16b
+
+ ushr v9.4s, v8.4s, #25
+ sli v9.4s, v8.4s, #7
+ ushr v8.4s, v7.4s, #25
+ sli v8.4s, v7.4s, #7
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v9.16b, v9.16b, v9.16b, #4
+ ext v14.16b, v14.16b, v14.16b, #8
+ ext v19.16b, v19.16b, v19.16b, #12
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ add v0.4s, v0.4s, v6.4s
+ add v1.4s, v1.4s, v7.4s
+ add v2.4s, v2.4s, v8.4s
+ add v3.4s, v3.4s, v5.4s
+ add v4.4s, v4.4s, v9.4s
+
+ eor v18.16b, v18.16b, v0.16b
+ eor v15.16b, v15.16b, v1.16b
+ eor v16.16b, v16.16b, v2.16b
+ eor v17.16b, v17.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ rev32 v18.8h, v18.8h
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+ rev32 v19.8h, v19.8h
+
+ add v12.4s, v12.4s, v18.4s
+ add v13.4s, v13.4s, v15.4s
+ add v10.4s, v10.4s, v16.4s
+ add v11.4s, v11.4s, v17.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v6.16b, v6.16b, v12.16b
+ eor v7.16b, v7.16b, v13.16b
+ eor v8.16b, v8.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v9.16b, v9.16b, v14.16b
+
+ ushr v20.4s, v6.4s, #20
+ sli v20.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+ ushr v7.4s, v8.4s, #20
+ sli v7.4s, v8.4s, #12
+ ushr v8.4s, v5.4s, #20
+ sli v8.4s, v5.4s, #12
+ ushr v5.4s, v9.4s, #20
+ sli v5.4s, v9.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v8.4s
+ add v4.4s, v4.4s, v5.4s
+
+ eor v18.16b, v18.16b, v0.16b
+ eor v15.16b, v15.16b, v1.16b
+ eor v16.16b, v16.16b, v2.16b
+ eor v17.16b, v17.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ tbl v18.16b, {v18.16b}, v26.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+ tbl v19.16b, {v19.16b}, v26.16b
+
+ add v12.4s, v12.4s, v18.4s
+ add v13.4s, v13.4s, v15.4s
+ add v10.4s, v10.4s, v16.4s
+ add v11.4s, v11.4s, v17.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v20.16b, v20.16b, v12.16b
+ eor v6.16b, v6.16b, v13.16b
+ eor v7.16b, v7.16b, v10.16b
+ eor v8.16b, v8.16b, v11.16b
+ eor v5.16b, v5.16b, v14.16b
+
+ ushr v9.4s, v5.4s, #25
+ sli v9.4s, v5.4s, #7
+ ushr v5.4s, v8.4s, #25
+ sli v5.4s, v8.4s, #7
+ ushr v8.4s, v7.4s, #25
+ sli v8.4s, v7.4s, #7
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v20.4s, #25
+ sli v6.4s, v20.4s, #7
+
+ ext v9.16b, v9.16b, v9.16b, #12
+ ext v14.16b, v14.16b, v14.16b, #8
+ ext v19.16b, v19.16b, v19.16b, #4
+ subs x7, x7, #1
+ b.gt Lopen_main_loop_rounds
+ subs x6, x6, #1
+ b.ge Lopen_main_loop_rounds_short
+
+ eor v20.16b, v20.16b, v20.16b //zero
+ not v21.16b, v20.16b // -1
+ sub v21.4s, v25.4s, v21.4s // Add +1
+ ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+ add v19.4s, v19.4s, v20.4s
+
+ add v15.4s, v15.4s, v25.4s
+ mov x11, #5
+ dup v20.4s, w11
+ add v25.4s, v25.4s, v20.4s
+
+ zip1 v20.4s, v0.4s, v1.4s
+ zip2 v21.4s, v0.4s, v1.4s
+ zip1 v22.4s, v2.4s, v3.4s
+ zip2 v23.4s, v2.4s, v3.4s
+
+ zip1 v0.2d, v20.2d, v22.2d
+ zip2 v1.2d, v20.2d, v22.2d
+ zip1 v2.2d, v21.2d, v23.2d
+ zip2 v3.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v5.4s, v6.4s
+ zip2 v21.4s, v5.4s, v6.4s
+ zip1 v22.4s, v7.4s, v8.4s
+ zip2 v23.4s, v7.4s, v8.4s
+
+ zip1 v5.2d, v20.2d, v22.2d
+ zip2 v6.2d, v20.2d, v22.2d
+ zip1 v7.2d, v21.2d, v23.2d
+ zip2 v8.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v10.4s, v11.4s
+ zip2 v21.4s, v10.4s, v11.4s
+ zip1 v22.4s, v12.4s, v13.4s
+ zip2 v23.4s, v12.4s, v13.4s
+
+ zip1 v10.2d, v20.2d, v22.2d
+ zip2 v11.2d, v20.2d, v22.2d
+ zip1 v12.2d, v21.2d, v23.2d
+ zip2 v13.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v15.4s, v16.4s
+ zip2 v21.4s, v15.4s, v16.4s
+ zip1 v22.4s, v17.4s, v18.4s
+ zip2 v23.4s, v17.4s, v18.4s
+
+ zip1 v15.2d, v20.2d, v22.2d
+ zip2 v16.2d, v20.2d, v22.2d
+ zip1 v17.2d, v21.2d, v23.2d
+ zip2 v18.2d, v21.2d, v23.2d
+
+ add v0.4s, v0.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+ add v10.4s, v10.4s, v29.4s
+ add v15.4s, v15.4s, v30.4s
+
+ add v1.4s, v1.4s, v24.4s
+ add v6.4s, v6.4s, v28.4s
+ add v11.4s, v11.4s, v29.4s
+ add v16.4s, v16.4s, v30.4s
+
+ add v2.4s, v2.4s, v24.4s
+ add v7.4s, v7.4s, v28.4s
+ add v12.4s, v12.4s, v29.4s
+ add v17.4s, v17.4s, v30.4s
+
+ add v3.4s, v3.4s, v24.4s
+ add v8.4s, v8.4s, v28.4s
+ add v13.4s, v13.4s, v29.4s
+ add v18.4s, v18.4s, v30.4s
+
+ add v4.4s, v4.4s, v24.4s
+ add v9.4s, v9.4s, v28.4s
+ add v14.4s, v14.4s, v29.4s
+ add v19.4s, v19.4s, v30.4s
+
+ // We can always safely store 192 bytes
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v0.16b
+ eor v21.16b, v21.16b, v5.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v15.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v1.16b
+ eor v21.16b, v21.16b, v6.16b
+ eor v22.16b, v22.16b, v11.16b
+ eor v23.16b, v23.16b, v16.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v2.16b
+ eor v21.16b, v21.16b, v7.16b
+ eor v22.16b, v22.16b, v12.16b
+ eor v23.16b, v23.16b, v17.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #192
+
+ mov v0.16b, v3.16b
+ mov v5.16b, v8.16b
+ mov v10.16b, v13.16b
+ mov v15.16b, v18.16b
+
+ cmp x2, #64
+ b.lt Lopen_tail_64_store
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v3.16b
+ eor v21.16b, v21.16b, v8.16b
+ eor v22.16b, v22.16b, v13.16b
+ eor v23.16b, v23.16b, v18.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #64
+
+ mov v0.16b, v4.16b
+ mov v5.16b, v9.16b
+ mov v10.16b, v14.16b
+ mov v15.16b, v19.16b
+
+ cmp x2, #64
+ b.lt Lopen_tail_64_store
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v4.16b
+ eor v21.16b, v21.16b, v9.16b
+ eor v22.16b, v22.16b, v14.16b
+ eor v23.16b, v23.16b, v19.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #64
+ b Lopen_main_loop
+
+Lopen_tail:
+
+ cbz x2, Lopen_finalize
+
+ lsr x4, x2, #4 // How many whole blocks we have to hash
+
+ cmp x2, #64
+ b.le Lopen_tail_64
+ cmp x2, #128
+ b.le Lopen_tail_128
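+
+ // The tail below generates only as many extra ChaCha20 blocks as the remaining
+ // [1,192) bytes require (one, two, or three), and folds the leftover Poly1305
+ // hashing of whole ciphertext blocks into those rounds.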
+
+Lopen_tail_192:
+ // We need three more blocks
+ mov v0.16b, v24.16b
+ mov v1.16b, v24.16b
+ mov v2.16b, v24.16b
+ mov v5.16b, v28.16b
+ mov v6.16b, v28.16b
+ mov v7.16b, v28.16b
+ mov v10.16b, v29.16b
+ mov v11.16b, v29.16b
+ mov v12.16b, v29.16b
+ mov v15.16b, v30.16b
+ mov v16.16b, v30.16b
+ mov v17.16b, v30.16b
+ eor v23.16b, v23.16b, v23.16b
+ eor v21.16b, v21.16b, v21.16b
+ ins v23.s[0], v25.s[0]
+ ins v21.d[0], x15
+
+ add v22.4s, v23.4s, v21.4s
+ add v21.4s, v22.4s, v21.4s
+
+ add v15.4s, v15.4s, v21.4s
+ add v16.4s, v16.4s, v23.4s
+ add v17.4s, v17.4s, v22.4s
+
+ mov x7, #10
+ subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash
+ csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing
+ sub x4, x4, x7
+
+ cbz x7, Lopen_tail_192_rounds_no_hash
+
+Lopen_tail_192_rounds:
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+Lopen_tail_192_rounds_no_hash:
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v5.16b, v5.16b, v5.16b, #4
+ ext v6.16b, v6.16b, v6.16b, #4
+ ext v7.16b, v7.16b, v7.16b, #4
+
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v12.16b, v12.16b, v12.16b, #8
+
+ ext v15.16b, v15.16b, v15.16b, #12
+ ext v16.16b, v16.16b, v16.16b, #12
+ ext v17.16b, v17.16b, v17.16b, #12
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v5.16b, v5.16b, v5.16b, #12
+ ext v6.16b, v6.16b, v6.16b, #12
+ ext v7.16b, v7.16b, v7.16b, #12
+
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v12.16b, v12.16b, v12.16b, #8
+
+ ext v15.16b, v15.16b, v15.16b, #4
+ ext v16.16b, v16.16b, v16.16b, #4
+ ext v17.16b, v17.16b, v17.16b, #4
+ subs x7, x7, #1
+ b.gt Lopen_tail_192_rounds
+ subs x6, x6, #1
+ b.ge Lopen_tail_192_rounds_no_hash
+
+ // We hashed at most 160 bytes, so there may still be up to 32 bytes left
+Lopen_tail_192_hash:
+ cbz x4, Lopen_tail_192_hash_done
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ sub x4, x4, #1
+ b Lopen_tail_192_hash
+
+Lopen_tail_192_hash_done:
+
+ add v0.4s, v0.4s, v24.4s
+ add v1.4s, v1.4s, v24.4s
+ add v2.4s, v2.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+ add v6.4s, v6.4s, v28.4s
+ add v7.4s, v7.4s, v28.4s
+ add v10.4s, v10.4s, v29.4s
+ add v11.4s, v11.4s, v29.4s
+ add v12.4s, v12.4s, v29.4s
+ add v15.4s, v15.4s, v30.4s
+ add v16.4s, v16.4s, v30.4s
+ add v17.4s, v17.4s, v30.4s
+
+ add v15.4s, v15.4s, v21.4s
+ add v16.4s, v16.4s, v23.4s
+ add v17.4s, v17.4s, v22.4s
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+
+ eor v20.16b, v20.16b, v1.16b
+ eor v21.16b, v21.16b, v6.16b
+ eor v22.16b, v22.16b, v11.16b
+ eor v23.16b, v23.16b, v16.16b
+
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+
+ eor v20.16b, v20.16b, v2.16b
+ eor v21.16b, v21.16b, v7.16b
+ eor v22.16b, v22.16b, v12.16b
+ eor v23.16b, v23.16b, v17.16b
+
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #128
+ b Lopen_tail_64_store
+
+Lopen_tail_128:
+ // We need two more blocks
+ mov v0.16b, v24.16b
+ mov v1.16b, v24.16b
+ mov v5.16b, v28.16b
+ mov v6.16b, v28.16b
+ mov v10.16b, v29.16b
+ mov v11.16b, v29.16b
+ mov v15.16b, v30.16b
+ mov v16.16b, v30.16b
+ eor v23.16b, v23.16b, v23.16b
+ eor v22.16b, v22.16b, v22.16b
+ ins v23.s[0], v25.s[0]
+ ins v22.d[0], x15
+ add v22.4s, v22.4s, v23.4s
+
+ add v15.4s, v15.4s, v22.4s
+ add v16.4s, v16.4s, v23.4s
+
+ mov x6, #10
+ sub x6, x6, x4
+
+Lopen_tail_128_rounds:
+ add v0.4s, v0.4s, v5.4s
+ eor v15.16b, v15.16b, v0.16b
+ rev32 v15.8h, v15.8h
+
+ add v10.4s, v10.4s, v15.4s
+ eor v5.16b, v5.16b, v10.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ add v0.4s, v0.4s, v20.4s
+ eor v15.16b, v15.16b, v0.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ eor v20.16b, v20.16b, v10.16b
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+ ext v5.16b, v5.16b, v5.16b, #4
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v15.16b, v15.16b, v15.16b, #12
+ add v1.4s, v1.4s, v6.4s
+ eor v16.16b, v16.16b, v1.16b
+ rev32 v16.8h, v16.8h
+
+ add v11.4s, v11.4s, v16.4s
+ eor v6.16b, v6.16b, v11.16b
+ ushr v20.4s, v6.4s, #20
+ sli v20.4s, v6.4s, #12
+ add v1.4s, v1.4s, v20.4s
+ eor v16.16b, v16.16b, v1.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+
+ add v11.4s, v11.4s, v16.4s
+ eor v20.16b, v20.16b, v11.16b
+ ushr v6.4s, v20.4s, #25
+ sli v6.4s, v20.4s, #7
+ ext v6.16b, v6.16b, v6.16b, #4
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v16.16b, v16.16b, v16.16b, #12
+ add v0.4s, v0.4s, v5.4s
+ eor v15.16b, v15.16b, v0.16b
+ rev32 v15.8h, v15.8h
+
+ add v10.4s, v10.4s, v15.4s
+ eor v5.16b, v5.16b, v10.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ add v0.4s, v0.4s, v20.4s
+ eor v15.16b, v15.16b, v0.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ eor v20.16b, v20.16b, v10.16b
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+ ext v5.16b, v5.16b, v5.16b, #12
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v15.16b, v15.16b, v15.16b, #4
+ add v1.4s, v1.4s, v6.4s
+ eor v16.16b, v16.16b, v1.16b
+ rev32 v16.8h, v16.8h
+
+ add v11.4s, v11.4s, v16.4s
+ eor v6.16b, v6.16b, v11.16b
+ ushr v20.4s, v6.4s, #20
+ sli v20.4s, v6.4s, #12
+ add v1.4s, v1.4s, v20.4s
+ eor v16.16b, v16.16b, v1.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+
+ add v11.4s, v11.4s, v16.4s
+ eor v20.16b, v20.16b, v11.16b
+ ushr v6.4s, v20.4s, #25
+ sli v6.4s, v20.4s, #7
+ ext v6.16b, v6.16b, v6.16b, #12
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v16.16b, v16.16b, v16.16b, #4
+ subs x6, x6, #1
+ b.gt Lopen_tail_128_rounds
+ cbz x4, Lopen_tail_128_rounds_done
+ subs x4, x4, #1
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ b Lopen_tail_128_rounds
+
+Lopen_tail_128_rounds_done:
+ add v0.4s, v0.4s, v24.4s
+ add v1.4s, v1.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+ add v6.4s, v6.4s, v28.4s
+ add v10.4s, v10.4s, v29.4s
+ add v11.4s, v11.4s, v29.4s
+ add v15.4s, v15.4s, v30.4s
+ add v16.4s, v16.4s, v30.4s
+ add v15.4s, v15.4s, v22.4s
+ add v16.4s, v16.4s, v23.4s
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+
+ eor v20.16b, v20.16b, v1.16b
+ eor v21.16b, v21.16b, v6.16b
+ eor v22.16b, v22.16b, v11.16b
+ eor v23.16b, v23.16b, v16.16b
+
+ st1 {v20.16b - v23.16b}, [x0], #64
+ sub x2, x2, #64
+
+ b Lopen_tail_64_store
+
+Lopen_tail_64:
+ // We just need a single block
+ mov v0.16b, v24.16b
+ mov v5.16b, v28.16b
+ mov v10.16b, v29.16b
+ mov v15.16b, v30.16b
+ eor v23.16b, v23.16b, v23.16b
+ ins v23.s[0], v25.s[0]
+ add v15.4s, v15.4s, v23.4s
+
+ mov x6, #10
+ sub x6, x6, x4
+
+Lopen_tail_64_rounds:
+ add v0.4s, v0.4s, v5.4s
+ eor v15.16b, v15.16b, v0.16b
+ rev32 v15.8h, v15.8h
+
+ add v10.4s, v10.4s, v15.4s
+ eor v5.16b, v5.16b, v10.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ add v0.4s, v0.4s, v20.4s
+ eor v15.16b, v15.16b, v0.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ eor v20.16b, v20.16b, v10.16b
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+ ext v5.16b, v5.16b, v5.16b, #4
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v15.16b, v15.16b, v15.16b, #12
+ add v0.4s, v0.4s, v5.4s
+ eor v15.16b, v15.16b, v0.16b
+ rev32 v15.8h, v15.8h
+
+ add v10.4s, v10.4s, v15.4s
+ eor v5.16b, v5.16b, v10.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ add v0.4s, v0.4s, v20.4s
+ eor v15.16b, v15.16b, v0.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ eor v20.16b, v20.16b, v10.16b
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+ ext v5.16b, v5.16b, v5.16b, #12
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v15.16b, v15.16b, v15.16b, #4
+ subs x6, x6, #1
+ b.gt Lopen_tail_64_rounds
+ cbz x4, Lopen_tail_64_rounds_done
+ subs x4, x4, #1
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ b Lopen_tail_64_rounds
+
+Lopen_tail_64_rounds_done:
+ add v0.4s, v0.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+ add v10.4s, v10.4s, v29.4s
+ add v15.4s, v15.4s, v30.4s
+ add v15.4s, v15.4s, v23.4s
+
+Lopen_tail_64_store:
+ cmp x2, #16
+ b.lt Lopen_tail_16
+
+ ld1 {v20.16b}, [x1], #16
+ eor v20.16b, v20.16b, v0.16b
+ st1 {v20.16b}, [x0], #16
+ mov v0.16b, v5.16b
+ mov v5.16b, v10.16b
+ mov v10.16b, v15.16b
+ sub x2, x2, #16
+ b Lopen_tail_64_store
+
+Lopen_tail_16:
+ // Here we handle the last [0,16) bytes that require a padded block
+ cbz x2, Lopen_finalize
+
+ eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext
+ eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask
+ not v22.16b, v20.16b
+
+ add x7, x1, x2
+ mov x6, x2
+
+Lopen_tail_16_compose:
+ ext v20.16b, v20.16b, v20.16b, #15
+ ldrb w11, [x7, #-1]!
+ mov v20.b[0], w11
+ ext v21.16b, v22.16b, v21.16b, #15
+ subs x2, x2, #1
+ b.gt Lopen_tail_16_compose
+
+ and v20.16b, v20.16b, v21.16b
+ // Hash in the final padded block
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ eor v20.16b, v20.16b, v0.16b
+
+Lopen_tail_16_store:
+ umov w11, v20.b[0]
+ strb w11, [x0], #1
+ ext v20.16b, v20.16b, v20.16b, #1
+ subs x6, x6, #1
+ b.gt Lopen_tail_16_store
+
+Lopen_finalize:
+ mov x11, v31.d[0]
+ mov x12, v31.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ // Final reduction step
+ sub x12, xzr, x15
+ orr x13, xzr, #3
+ subs x11, x8, #-5
+ sbcs x12, x9, x12
+ sbcs x13, x10, x13
+ csel x8, x11, x8, cs
+ csel x9, x12, x9, cs
+ csel x10, x13, x10, cs
+ mov x11, v27.d[0]
+ mov x12, v27.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+
+ stp x8, x9, [x5]
+
+ ldp d8, d9, [sp, #16]
+ ldp d10, d11, [sp, #32]
+ ldp d12, d13, [sp, #48]
+ ldp d14, d15, [sp, #64]
+.cfi_restore b15
+.cfi_restore b14
+.cfi_restore b13
+.cfi_restore b12
+.cfi_restore b11
+.cfi_restore b10
+.cfi_restore b9
+.cfi_restore b8
+ ldp x29, x30, [sp], 80
+.cfi_restore w29
+.cfi_restore w30
+.cfi_def_cfa_offset 0
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+Lopen_128:
+ // On some architectures preparing 5 blocks for small buffers is wasteful
+ eor v25.16b, v25.16b, v25.16b
+ mov x11, #1
+ mov v25.s[0], w11
+ mov v0.16b, v24.16b
+ mov v1.16b, v24.16b
+ mov v2.16b, v24.16b
+ mov v5.16b, v28.16b
+ mov v6.16b, v28.16b
+ mov v7.16b, v28.16b
+ mov v10.16b, v29.16b
+ mov v11.16b, v29.16b
+ mov v12.16b, v29.16b
+ mov v17.16b, v30.16b
+ add v15.4s, v17.4s, v25.4s
+ add v16.4s, v15.4s, v25.4s
+
+ mov x6, #10
+
+Lopen_128_rounds:
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v5.16b, v5.16b, v5.16b, #4
+ ext v6.16b, v6.16b, v6.16b, #4
+ ext v7.16b, v7.16b, v7.16b, #4
+
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v12.16b, v12.16b, v12.16b, #8
+
+ ext v15.16b, v15.16b, v15.16b, #12
+ ext v16.16b, v16.16b, v16.16b, #12
+ ext v17.16b, v17.16b, v17.16b, #12
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v5.16b, v5.16b, v5.16b, #12
+ ext v6.16b, v6.16b, v6.16b, #12
+ ext v7.16b, v7.16b, v7.16b, #12
+
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v12.16b, v12.16b, v12.16b, #8
+
+ ext v15.16b, v15.16b, v15.16b, #4
+ ext v16.16b, v16.16b, v16.16b, #4
+ ext v17.16b, v17.16b, v17.16b, #4
+ subs x6, x6, #1
+ b.hi Lopen_128_rounds
+
+ add v0.4s, v0.4s, v24.4s
+ add v1.4s, v1.4s, v24.4s
+ add v2.4s, v2.4s, v24.4s
+
+ add v5.4s, v5.4s, v28.4s
+ add v6.4s, v6.4s, v28.4s
+ add v7.4s, v7.4s, v28.4s
+
+ add v10.4s, v10.4s, v29.4s
+ add v11.4s, v11.4s, v29.4s
+
+ add v30.4s, v30.4s, v25.4s
+ add v15.4s, v15.4s, v30.4s
+ add v30.4s, v30.4s, v25.4s
+ add v16.4s, v16.4s, v30.4s
+
+ and v2.16b, v2.16b, v27.16b
+ mov x16, v2.d[0] // Move the R key to GPRs
+ mov x17, v2.d[1]
+ mov v27.16b, v7.16b // Store the S key
+
+ bl Lpoly_hash_ad_internal
+
+Lopen_128_store:
+ cmp x2, #64
+ b.lt Lopen_128_store_64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ mov x11, v21.d[0]
+ mov x12, v21.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ mov x11, v22.d[0]
+ mov x12, v22.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ mov x11, v23.d[0]
+ mov x12, v23.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+
+ eor v20.16b, v20.16b, v0.16b
+ eor v21.16b, v21.16b, v5.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v15.16b
+
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #64
+
+ mov v0.16b, v1.16b
+ mov v5.16b, v6.16b
+ mov v10.16b, v11.16b
+ mov v15.16b, v16.16b
+
+Lopen_128_store_64:
+
+ lsr x4, x2, #4
+ mov x3, x1
+
+Lopen_128_hash_64:
+ cbz x4, Lopen_tail_64_store
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ sub x4, x4, #1
+ b Lopen_128_hash_64
+.cfi_endproc
+
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/crypto/chacha20_poly1305_armv8-linux.S b/gen/crypto/chacha20_poly1305_armv8-linux.S
new file mode 100644
index 0000000..7d2db8d
--- /dev/null
+++ b/gen/crypto/chacha20_poly1305_armv8-linux.S
@@ -0,0 +1,3009 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+.section .rodata
+
+.align 7
+.Lchacha20_consts:
+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.Linc:
+.long 1,2,3,4
+.Lrol8:
+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.Lclamp:
+.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
+
+.text
+
+.type .Lpoly_hash_ad_internal,%function
+.align 6
+.Lpoly_hash_ad_internal:
+.cfi_startproc
+ cbnz x4, .Lpoly_hash_intro
+ ret
+
+.Lpoly_hash_intro:
+ cmp x4, #16
+ b.lt .Lpoly_hash_ad_tail
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ sub x4, x4, #16
+ b .Lpoly_hash_ad_internal
+
+.Lpoly_hash_ad_tail:
+ cbz x4, .Lpoly_hash_ad_ret
+
+ eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD
+ sub x4, x4, #1
+
+.Lpoly_hash_tail_16_compose:
+ ext v20.16b, v20.16b, v20.16b, #15
+ ldrb w11, [x3, x4]
+ mov v20.b[0], w11
+ subs x4, x4, #1
+ b.ge .Lpoly_hash_tail_16_compose
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ csel x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+
+.Lpoly_hash_ad_ret:
+ ret
+.cfi_endproc
+.size .Lpoly_hash_ad_internal, .-.Lpoly_hash_ad_internal
+
+/////////////////////////////////
+//
+// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data);
+//
+.globl chacha20_poly1305_seal
+.hidden chacha20_poly1305_seal
+.type chacha20_poly1305_seal,%function
+.align 6
+chacha20_poly1305_seal:
+ AARCH64_SIGN_LINK_REGISTER
+.cfi_startproc
+ stp x29, x30, [sp, #-80]!
+.cfi_def_cfa_offset 80
+.cfi_offset w30, -72
+.cfi_offset w29, -80
+ mov x29, sp
+ // We probably could do .cfi_def_cfa w29, 80 at this point, but since
+ // we don't actually use the frame pointer like that, it's probably not
+ // worth bothering.
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+.cfi_offset b15, -8
+.cfi_offset b14, -16
+.cfi_offset b13, -24
+.cfi_offset b12, -32
+.cfi_offset b11, -40
+.cfi_offset b10, -48
+.cfi_offset b9, -56
+.cfi_offset b8, -64
+
+ adrp x11, .Lchacha20_consts
+ add x11, x11, :lo12:.Lchacha20_consts
+
+ ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
+ ld1 {v28.16b - v30.16b}, [x5]
+
+ mov x15, #1 // Prepare the Poly1305 state
+ mov x8, #0
+ mov x9, #0
+ mov x10, #0
+
+ ldr x12, [x5, #56] // The total cipher text length includes extra_in_len
+ add x12, x12, x2
+ mov v31.d[0], x4 // Store the input and aad lengths
+ mov v31.d[1], x12
+
+ cmp x2, #128
+ b.le .Lseal_128 // Optimization for smaller buffers
+
+ // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext,
+ // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically,
+ // the fifth block (A4-D4) horizontally.
+ ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
+ mov v4.16b, v24.16b
+
+ ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
+ mov v9.16b, v28.16b
+
+ ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
+ mov v14.16b, v29.16b
+
+ ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
+ add v15.4s, v15.4s, v25.4s
+ mov v19.16b, v30.16b
+
+ sub x5, x5, #32
+
+ mov x6, #10
+
+.align 5
+.Lseal_init_rounds:
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v8.4s
+ add v4.4s, v4.4s, v9.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+ rev32 v18.8h, v18.8h
+ rev32 v19.8h, v19.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ add v13.4s, v13.4s, v18.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ eor v8.16b, v8.16b, v13.16b
+ eor v9.16b, v9.16b, v14.16b
+
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+ ushr v7.4s, v8.4s, #20
+ sli v7.4s, v8.4s, #12
+ ushr v8.4s, v9.4s, #20
+ sli v8.4s, v9.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ add v3.4s, v3.4s, v7.4s
+ add v4.4s, v4.4s, v8.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+ tbl v18.16b, {v18.16b}, v26.16b
+ tbl v19.16b, {v19.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ add v13.4s, v13.4s, v18.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ eor v7.16b, v7.16b, v13.16b
+ eor v8.16b, v8.16b, v14.16b
+
+ ushr v9.4s, v8.4s, #25
+ sli v9.4s, v8.4s, #7
+ ushr v8.4s, v7.4s, #25
+ sli v8.4s, v7.4s, #7
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v9.16b, v9.16b, v9.16b, #4
+ ext v14.16b, v14.16b, v14.16b, #8
+ ext v19.16b, v19.16b, v19.16b, #12
+ add v0.4s, v0.4s, v6.4s
+ add v1.4s, v1.4s, v7.4s
+ add v2.4s, v2.4s, v8.4s
+ add v3.4s, v3.4s, v5.4s
+ add v4.4s, v4.4s, v9.4s
+
+ eor v18.16b, v18.16b, v0.16b
+ eor v15.16b, v15.16b, v1.16b
+ eor v16.16b, v16.16b, v2.16b
+ eor v17.16b, v17.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ rev32 v18.8h, v18.8h
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+ rev32 v19.8h, v19.8h
+
+ add v12.4s, v12.4s, v18.4s
+ add v13.4s, v13.4s, v15.4s
+ add v10.4s, v10.4s, v16.4s
+ add v11.4s, v11.4s, v17.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v6.16b, v6.16b, v12.16b
+ eor v7.16b, v7.16b, v13.16b
+ eor v8.16b, v8.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v9.16b, v9.16b, v14.16b
+
+ ushr v20.4s, v6.4s, #20
+ sli v20.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+ ushr v7.4s, v8.4s, #20
+ sli v7.4s, v8.4s, #12
+ ushr v8.4s, v5.4s, #20
+ sli v8.4s, v5.4s, #12
+ ushr v5.4s, v9.4s, #20
+ sli v5.4s, v9.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v8.4s
+ add v4.4s, v4.4s, v5.4s
+
+ eor v18.16b, v18.16b, v0.16b
+ eor v15.16b, v15.16b, v1.16b
+ eor v16.16b, v16.16b, v2.16b
+ eor v17.16b, v17.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ tbl v18.16b, {v18.16b}, v26.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+ tbl v19.16b, {v19.16b}, v26.16b
+
+ add v12.4s, v12.4s, v18.4s
+ add v13.4s, v13.4s, v15.4s
+ add v10.4s, v10.4s, v16.4s
+ add v11.4s, v11.4s, v17.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v20.16b, v20.16b, v12.16b
+ eor v6.16b, v6.16b, v13.16b
+ eor v7.16b, v7.16b, v10.16b
+ eor v8.16b, v8.16b, v11.16b
+ eor v5.16b, v5.16b, v14.16b
+
+ ushr v9.4s, v5.4s, #25
+ sli v9.4s, v5.4s, #7
+ ushr v5.4s, v8.4s, #25
+ sli v5.4s, v8.4s, #7
+ ushr v8.4s, v7.4s, #25
+ sli v8.4s, v7.4s, #7
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v20.4s, #25
+ sli v6.4s, v20.4s, #7
+
+ ext v9.16b, v9.16b, v9.16b, #12
+ ext v14.16b, v14.16b, v14.16b, #8
+ ext v19.16b, v19.16b, v19.16b, #4
+ subs x6, x6, #1
+ b.hi .Lseal_init_rounds
+
+ add v15.4s, v15.4s, v25.4s
+ mov x11, #4
+ dup v20.4s, w11
+ add v25.4s, v25.4s, v20.4s
+
+ zip1 v20.4s, v0.4s, v1.4s
+ zip2 v21.4s, v0.4s, v1.4s
+ zip1 v22.4s, v2.4s, v3.4s
+ zip2 v23.4s, v2.4s, v3.4s
+
+ zip1 v0.2d, v20.2d, v22.2d
+ zip2 v1.2d, v20.2d, v22.2d
+ zip1 v2.2d, v21.2d, v23.2d
+ zip2 v3.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v5.4s, v6.4s
+ zip2 v21.4s, v5.4s, v6.4s
+ zip1 v22.4s, v7.4s, v8.4s
+ zip2 v23.4s, v7.4s, v8.4s
+
+ zip1 v5.2d, v20.2d, v22.2d
+ zip2 v6.2d, v20.2d, v22.2d
+ zip1 v7.2d, v21.2d, v23.2d
+ zip2 v8.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v10.4s, v11.4s
+ zip2 v21.4s, v10.4s, v11.4s
+ zip1 v22.4s, v12.4s, v13.4s
+ zip2 v23.4s, v12.4s, v13.4s
+
+ zip1 v10.2d, v20.2d, v22.2d
+ zip2 v11.2d, v20.2d, v22.2d
+ zip1 v12.2d, v21.2d, v23.2d
+ zip2 v13.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v15.4s, v16.4s
+ zip2 v21.4s, v15.4s, v16.4s
+ zip1 v22.4s, v17.4s, v18.4s
+ zip2 v23.4s, v17.4s, v18.4s
+
+ zip1 v15.2d, v20.2d, v22.2d
+ zip2 v16.2d, v20.2d, v22.2d
+ zip1 v17.2d, v21.2d, v23.2d
+ zip2 v18.2d, v21.2d, v23.2d
+
+ add v4.4s, v4.4s, v24.4s
+ add v9.4s, v9.4s, v28.4s
+ and v4.16b, v4.16b, v27.16b
+
+ add v0.4s, v0.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+ add v10.4s, v10.4s, v29.4s
+ add v15.4s, v15.4s, v30.4s
+
+ add v1.4s, v1.4s, v24.4s
+ add v6.4s, v6.4s, v28.4s
+ add v11.4s, v11.4s, v29.4s
+ add v16.4s, v16.4s, v30.4s
+
+ add v2.4s, v2.4s, v24.4s
+ add v7.4s, v7.4s, v28.4s
+ add v12.4s, v12.4s, v29.4s
+ add v17.4s, v17.4s, v30.4s
+
+ add v3.4s, v3.4s, v24.4s
+ add v8.4s, v8.4s, v28.4s
+ add v13.4s, v13.4s, v29.4s
+ add v18.4s, v18.4s, v30.4s
+
+ mov x16, v4.d[0] // Move the R key to GPRs
+ mov x17, v4.d[1]
+ mov v27.16b, v9.16b // Store the S key
+
+ bl .Lpoly_hash_ad_internal
+
+ mov x3, x0
+ cmp x2, #256
+ b.le .Lseal_tail
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v0.16b
+ eor v21.16b, v21.16b, v5.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v15.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v1.16b
+ eor v21.16b, v21.16b, v6.16b
+ eor v22.16b, v22.16b, v11.16b
+ eor v23.16b, v23.16b, v16.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v2.16b
+ eor v21.16b, v21.16b, v7.16b
+ eor v22.16b, v22.16b, v12.16b
+ eor v23.16b, v23.16b, v17.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v3.16b
+ eor v21.16b, v21.16b, v8.16b
+ eor v22.16b, v22.16b, v13.16b
+ eor v23.16b, v23.16b, v18.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #256
+
+ mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds
+ mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256
+
+.Lseal_main_loop:
+ adrp x11, .Lchacha20_consts
+ add x11, x11, :lo12:.Lchacha20_consts
+
+ ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
+ mov v4.16b, v24.16b
+
+ ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
+ mov v9.16b, v28.16b
+
+ ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
+ mov v14.16b, v29.16b
+
+ ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
+ add v15.4s, v15.4s, v25.4s
+ mov v19.16b, v30.16b
+
+ eor v20.16b, v20.16b, v20.16b //zero
+ not v21.16b, v20.16b // -1
+ sub v21.4s, v25.4s, v21.4s // Add +1
+ ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+ add v19.4s, v19.4s, v20.4s
+
+ sub x5, x5, #32
+.align 5
+.Lseal_main_loop_rounds:
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v8.4s
+ add v4.4s, v4.4s, v9.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+ rev32 v18.8h, v18.8h
+ rev32 v19.8h, v19.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ add v13.4s, v13.4s, v18.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ eor v8.16b, v8.16b, v13.16b
+ eor v9.16b, v9.16b, v14.16b
+
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+ ushr v7.4s, v8.4s, #20
+ sli v7.4s, v8.4s, #12
+ ushr v8.4s, v9.4s, #20
+ sli v8.4s, v9.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ add v3.4s, v3.4s, v7.4s
+ add v4.4s, v4.4s, v8.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+ tbl v18.16b, {v18.16b}, v26.16b
+ tbl v19.16b, {v19.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ add v13.4s, v13.4s, v18.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ eor v7.16b, v7.16b, v13.16b
+ eor v8.16b, v8.16b, v14.16b
+
+ ushr v9.4s, v8.4s, #25
+ sli v9.4s, v8.4s, #7
+ ushr v8.4s, v7.4s, #25
+ sli v8.4s, v7.4s, #7
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v9.16b, v9.16b, v9.16b, #4
+ ext v14.16b, v14.16b, v14.16b, #8
+ ext v19.16b, v19.16b, v19.16b, #12
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ add v0.4s, v0.4s, v6.4s
+ add v1.4s, v1.4s, v7.4s
+ add v2.4s, v2.4s, v8.4s
+ add v3.4s, v3.4s, v5.4s
+ add v4.4s, v4.4s, v9.4s
+
+ eor v18.16b, v18.16b, v0.16b
+ eor v15.16b, v15.16b, v1.16b
+ eor v16.16b, v16.16b, v2.16b
+ eor v17.16b, v17.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ rev32 v18.8h, v18.8h
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+ rev32 v19.8h, v19.8h
+
+ add v12.4s, v12.4s, v18.4s
+ add v13.4s, v13.4s, v15.4s
+ add v10.4s, v10.4s, v16.4s
+ add v11.4s, v11.4s, v17.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v6.16b, v6.16b, v12.16b
+ eor v7.16b, v7.16b, v13.16b
+ eor v8.16b, v8.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v9.16b, v9.16b, v14.16b
+
+ ushr v20.4s, v6.4s, #20
+ sli v20.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+ ushr v7.4s, v8.4s, #20
+ sli v7.4s, v8.4s, #12
+ ushr v8.4s, v5.4s, #20
+ sli v8.4s, v5.4s, #12
+ ushr v5.4s, v9.4s, #20
+ sli v5.4s, v9.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v8.4s
+ add v4.4s, v4.4s, v5.4s
+
+ eor v18.16b, v18.16b, v0.16b
+ eor v15.16b, v15.16b, v1.16b
+ eor v16.16b, v16.16b, v2.16b
+ eor v17.16b, v17.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ tbl v18.16b, {v18.16b}, v26.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+ tbl v19.16b, {v19.16b}, v26.16b
+
+ add v12.4s, v12.4s, v18.4s
+ add v13.4s, v13.4s, v15.4s
+ add v10.4s, v10.4s, v16.4s
+ add v11.4s, v11.4s, v17.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v20.16b, v20.16b, v12.16b
+ eor v6.16b, v6.16b, v13.16b
+ eor v7.16b, v7.16b, v10.16b
+ eor v8.16b, v8.16b, v11.16b
+ eor v5.16b, v5.16b, v14.16b
+
+ ushr v9.4s, v5.4s, #25
+ sli v9.4s, v5.4s, #7
+ ushr v5.4s, v8.4s, #25
+ sli v5.4s, v8.4s, #7
+ ushr v8.4s, v7.4s, #25
+ sli v8.4s, v7.4s, #7
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v20.4s, #25
+ sli v6.4s, v20.4s, #7
+
+ ext v9.16b, v9.16b, v9.16b, #12
+ ext v14.16b, v14.16b, v14.16b, #8
+ ext v19.16b, v19.16b, v19.16b, #4
+ subs x6, x6, #1
+ b.ge .Lseal_main_loop_rounds
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ subs x7, x7, #1
+ b.gt .Lseal_main_loop_rounds
+
+ eor v20.16b, v20.16b, v20.16b //zero
+ not v21.16b, v20.16b // -1
+ sub v21.4s, v25.4s, v21.4s // Add +1
+ ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+ add v19.4s, v19.4s, v20.4s
+
+ add v15.4s, v15.4s, v25.4s
+ mov x11, #5
+ dup v20.4s, w11
+ add v25.4s, v25.4s, v20.4s
+
+ zip1 v20.4s, v0.4s, v1.4s
+ zip2 v21.4s, v0.4s, v1.4s
+ zip1 v22.4s, v2.4s, v3.4s
+ zip2 v23.4s, v2.4s, v3.4s
+
+ zip1 v0.2d, v20.2d, v22.2d
+ zip2 v1.2d, v20.2d, v22.2d
+ zip1 v2.2d, v21.2d, v23.2d
+ zip2 v3.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v5.4s, v6.4s
+ zip2 v21.4s, v5.4s, v6.4s
+ zip1 v22.4s, v7.4s, v8.4s
+ zip2 v23.4s, v7.4s, v8.4s
+
+ zip1 v5.2d, v20.2d, v22.2d
+ zip2 v6.2d, v20.2d, v22.2d
+ zip1 v7.2d, v21.2d, v23.2d
+ zip2 v8.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v10.4s, v11.4s
+ zip2 v21.4s, v10.4s, v11.4s
+ zip1 v22.4s, v12.4s, v13.4s
+ zip2 v23.4s, v12.4s, v13.4s
+
+ zip1 v10.2d, v20.2d, v22.2d
+ zip2 v11.2d, v20.2d, v22.2d
+ zip1 v12.2d, v21.2d, v23.2d
+ zip2 v13.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v15.4s, v16.4s
+ zip2 v21.4s, v15.4s, v16.4s
+ zip1 v22.4s, v17.4s, v18.4s
+ zip2 v23.4s, v17.4s, v18.4s
+
+ zip1 v15.2d, v20.2d, v22.2d
+ zip2 v16.2d, v20.2d, v22.2d
+ zip1 v17.2d, v21.2d, v23.2d
+ zip2 v18.2d, v21.2d, v23.2d
+
+ add v0.4s, v0.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+ add v10.4s, v10.4s, v29.4s
+ add v15.4s, v15.4s, v30.4s
+
+ add v1.4s, v1.4s, v24.4s
+ add v6.4s, v6.4s, v28.4s
+ add v11.4s, v11.4s, v29.4s
+ add v16.4s, v16.4s, v30.4s
+
+ add v2.4s, v2.4s, v24.4s
+ add v7.4s, v7.4s, v28.4s
+ add v12.4s, v12.4s, v29.4s
+ add v17.4s, v17.4s, v30.4s
+
+ add v3.4s, v3.4s, v24.4s
+ add v8.4s, v8.4s, v28.4s
+ add v13.4s, v13.4s, v29.4s
+ add v18.4s, v18.4s, v30.4s
+
+ add v4.4s, v4.4s, v24.4s
+ add v9.4s, v9.4s, v28.4s
+ add v14.4s, v14.4s, v29.4s
+ add v19.4s, v19.4s, v30.4s
+
+ cmp x2, #320
+ b.le .Lseal_tail
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v0.16b
+ eor v21.16b, v21.16b, v5.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v15.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v1.16b
+ eor v21.16b, v21.16b, v6.16b
+ eor v22.16b, v22.16b, v11.16b
+ eor v23.16b, v23.16b, v16.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v2.16b
+ eor v21.16b, v21.16b, v7.16b
+ eor v22.16b, v22.16b, v12.16b
+ eor v23.16b, v23.16b, v17.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v3.16b
+ eor v21.16b, v21.16b, v8.16b
+ eor v22.16b, v22.16b, v13.16b
+ eor v23.16b, v23.16b, v18.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v4.16b
+ eor v21.16b, v21.16b, v9.16b
+ eor v22.16b, v22.16b, v14.16b
+ eor v23.16b, v23.16b, v19.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #320
+
+ mov x6, #0
+ mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration
+
+ b .Lseal_main_loop
+
+.Lseal_tail:
+ // This part of the function handles the storage and authentication of the last [0,320) bytes
+ // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data.
+ cmp x2, #64
+ b.lt .Lseal_tail_64
+
+ // Store and authenticate 64B blocks per iteration
+ ld1 {v20.16b - v23.16b}, [x1], #64
+
+ eor v20.16b, v20.16b, v0.16b
+ eor v21.16b, v21.16b, v5.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v15.16b
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ mov x11, v21.d[0]
+ mov x12, v21.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ mov x11, v22.d[0]
+ mov x12, v22.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ mov x11, v23.d[0]
+ mov x12, v23.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ st1 {v20.16b - v23.16b}, [x0], #64
+ sub x2, x2, #64
+
+ // Shift the state left by 64 bytes for the next iteration of the loop
+ mov v0.16b, v1.16b
+ mov v5.16b, v6.16b
+ mov v10.16b, v11.16b
+ mov v15.16b, v16.16b
+
+ mov v1.16b, v2.16b
+ mov v6.16b, v7.16b
+ mov v11.16b, v12.16b
+ mov v16.16b, v17.16b
+
+ mov v2.16b, v3.16b
+ mov v7.16b, v8.16b
+ mov v12.16b, v13.16b
+ mov v17.16b, v18.16b
+
+ mov v3.16b, v4.16b
+ mov v8.16b, v9.16b
+ mov v13.16b, v14.16b
+ mov v18.16b, v19.16b
+
+ b .Lseal_tail
+
+.Lseal_tail_64:
+ ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr
+
+ // Here we handle the last [0,64) bytes of plaintext
+ cmp x2, #16
+ b.lt .Lseal_tail_16
+ // Each iteration encrypt and authenticate a 16B block
+ ld1 {v20.16b}, [x1], #16
+ eor v20.16b, v20.16b, v0.16b
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ st1 {v20.16b}, [x0], #16
+
+ sub x2, x2, #16
+
+ // Shift the state left by 16 bytes for the next iteration of the loop
+ mov v0.16b, v5.16b
+ mov v5.16b, v10.16b
+ mov v10.16b, v15.16b
+
+ b .Lseal_tail_64
+
+.Lseal_tail_16:
+ // Here we handle the last [0,16) bytes of ciphertext that require a padded block
+ cbz x2, .Lseal_hash_extra
+
+ eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in
+ eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes
+ not v22.16b, v20.16b
+
+ mov x6, x2
+ add x1, x1, x2
+
+ cbz x4, .Lseal_tail_16_compose // No extra data to pad with, zero padding
+
+ mov x7, #16 // We need to load some extra_in first for padding
+ sub x7, x7, x2
+ cmp x4, x7
+ csel x7, x4, x7, lt // .Load the minimum of extra_in_len and the amount needed to fill the register
+ mov x12, x7
+ add x3, x3, x7
+ sub x4, x4, x7
+
+.Lseal_tail16_compose_extra_in:
+ ext v20.16b, v20.16b, v20.16b, #15
+ ldrb w11, [x3, #-1]!
+ mov v20.b[0], w11
+ subs x7, x7, #1
+ b.gt .Lseal_tail16_compose_extra_in
+
+ add x3, x3, x12
+
+.Lseal_tail_16_compose:
+ ext v20.16b, v20.16b, v20.16b, #15
+ ldrb w11, [x1, #-1]!
+ mov v20.b[0], w11
+ ext v21.16b, v22.16b, v21.16b, #15
+ subs x2, x2, #1
+ b.gt .Lseal_tail_16_compose
+
+ and v0.16b, v0.16b, v21.16b
+ eor v20.16b, v20.16b, v0.16b
+ mov v21.16b, v20.16b
+
+.Lseal_tail_16_store:
+ umov w11, v20.b[0]
+ strb w11, [x0], #1
+ ext v20.16b, v20.16b, v20.16b, #1
+ subs x6, x6, #1
+ b.gt .Lseal_tail_16_store
+
+ // Hash in the final ct block concatenated with extra_in
+ mov x11, v21.d[0]
+ mov x12, v21.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+
+.Lseal_hash_extra:
+ cbz x4, .Lseal_finalize
+
+.Lseal_hash_extra_loop:
+ cmp x4, #16
+ b.lt .Lseal_hash_extra_tail
+ ld1 {v20.16b}, [x3], #16
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ sub x4, x4, #16
+ b .Lseal_hash_extra_loop
+
+.Lseal_hash_extra_tail:
+ cbz x4, .Lseal_finalize
+ eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext
+ add x3, x3, x4
+
+.Lseal_hash_extra_load:
+ ext v20.16b, v20.16b, v20.16b, #15
+ ldrb w11, [x3, #-1]!
+ mov v20.b[0], w11
+ subs x4, x4, #1
+ b.gt .Lseal_hash_extra_load
+
+ // Hash in the final padded extra_in block
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+
+.Lseal_finalize:
+ mov x11, v31.d[0]
+ mov x12, v31.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ // Final reduction step
+ sub x12, xzr, x15
+ orr x13, xzr, #3
+ subs x11, x8, #-5
+ sbcs x12, x9, x12
+ sbcs x13, x10, x13
+ csel x8, x11, x8, cs
+ csel x9, x12, x9, cs
+ csel x10, x13, x10, cs
+ mov x11, v27.d[0]
+ mov x12, v27.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+
+ stp x8, x9, [x5]
+
+ ldp d8, d9, [sp, #16]
+ ldp d10, d11, [sp, #32]
+ ldp d12, d13, [sp, #48]
+ ldp d14, d15, [sp, #64]
+.cfi_restore b15
+.cfi_restore b14
+.cfi_restore b13
+.cfi_restore b12
+.cfi_restore b11
+.cfi_restore b10
+.cfi_restore b9
+.cfi_restore b8
+ ldp x29, x30, [sp], 80
+.cfi_restore w29
+.cfi_restore w30
+.cfi_def_cfa_offset 0
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.Lseal_128:
+ // On some architectures preparing 5 blocks for small buffers is wasteful
+ eor v25.16b, v25.16b, v25.16b
+ mov x11, #1
+ mov v25.s[0], w11
+ mov v0.16b, v24.16b
+ mov v1.16b, v24.16b
+ mov v2.16b, v24.16b
+ mov v5.16b, v28.16b
+ mov v6.16b, v28.16b
+ mov v7.16b, v28.16b
+ mov v10.16b, v29.16b
+ mov v11.16b, v29.16b
+ mov v12.16b, v29.16b
+ mov v17.16b, v30.16b
+ add v15.4s, v17.4s, v25.4s
+ add v16.4s, v15.4s, v25.4s
+
+ mov x6, #10
+
+.Lseal_128_rounds:
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v5.16b, v5.16b, v5.16b, #4
+ ext v6.16b, v6.16b, v6.16b, #4
+ ext v7.16b, v7.16b, v7.16b, #4
+
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v12.16b, v12.16b, v12.16b, #8
+
+ ext v15.16b, v15.16b, v15.16b, #12
+ ext v16.16b, v16.16b, v16.16b, #12
+ ext v17.16b, v17.16b, v17.16b, #12
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v5.16b, v5.16b, v5.16b, #12
+ ext v6.16b, v6.16b, v6.16b, #12
+ ext v7.16b, v7.16b, v7.16b, #12
+
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v12.16b, v12.16b, v12.16b, #8
+
+ ext v15.16b, v15.16b, v15.16b, #4
+ ext v16.16b, v16.16b, v16.16b, #4
+ ext v17.16b, v17.16b, v17.16b, #4
+ subs x6, x6, #1
+ b.hi .Lseal_128_rounds
+
+ add v0.4s, v0.4s, v24.4s
+ add v1.4s, v1.4s, v24.4s
+ add v2.4s, v2.4s, v24.4s
+
+ add v5.4s, v5.4s, v28.4s
+ add v6.4s, v6.4s, v28.4s
+ add v7.4s, v7.4s, v28.4s
+
+ // Only the first 32 bytes of the third block (counter = 0) are needed,
+ // so skip updating v12 and v17.
+ add v10.4s, v10.4s, v29.4s
+ add v11.4s, v11.4s, v29.4s
+
+ add v30.4s, v30.4s, v25.4s
+ add v15.4s, v15.4s, v30.4s
+ add v30.4s, v30.4s, v25.4s
+ add v16.4s, v16.4s, v30.4s
+
+ and v2.16b, v2.16b, v27.16b
+ mov x16, v2.d[0] // Move the R key to GPRs
+ mov x17, v2.d[1]
+ mov v27.16b, v7.16b // Store the S key
+
+ bl .Lpoly_hash_ad_internal
+ b .Lseal_tail
+.cfi_endproc
+.size chacha20_poly1305_seal,.-chacha20_poly1305_seal
+
+/////////////////////////////////
+//
+// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data);
+//
+.globl chacha20_poly1305_open
+.hidden chacha20_poly1305_open
+.type chacha20_poly1305_open,%function
+.align 6
+chacha20_poly1305_open:
+ AARCH64_SIGN_LINK_REGISTER
+.cfi_startproc
+ stp x29, x30, [sp, #-80]!
+.cfi_def_cfa_offset 80
+.cfi_offset w30, -72
+.cfi_offset w29, -80
+ mov x29, sp
+ // We probably could do .cfi_def_cfa w29, 80 at this point, but since
+ // we don't actually use the frame pointer like that, it's probably not
+ // worth bothering.
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+.cfi_offset b15, -8
+.cfi_offset b14, -16
+.cfi_offset b13, -24
+.cfi_offset b12, -32
+.cfi_offset b11, -40
+.cfi_offset b10, -48
+.cfi_offset b9, -56
+.cfi_offset b8, -64
+
+ adrp x11, .Lchacha20_consts
+ add x11, x11, :lo12:.Lchacha20_consts
+
+ ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
+ ld1 {v28.16b - v30.16b}, [x5]
+
+ mov x15, #1 // Prepare the Poly1305 state
+ mov x8, #0
+ mov x9, #0
+ mov x10, #0
+
+ mov v31.d[0], x4 // Store the input and aad lengths
+ mov v31.d[1], x2
+
+ cmp x2, #128
+ b.le .Lopen_128 // Optimization for smaller buffers
+
+ // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys
+ mov v0.16b, v24.16b
+ mov v5.16b, v28.16b
+ mov v10.16b, v29.16b
+ mov v15.16b, v30.16b
+
+ mov x6, #10
+
+.align 5
+.Lopen_init_rounds:
+ add v0.4s, v0.4s, v5.4s
+ eor v15.16b, v15.16b, v0.16b
+ rev32 v15.8h, v15.8h
+
+ add v10.4s, v10.4s, v15.4s
+ eor v5.16b, v5.16b, v10.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ add v0.4s, v0.4s, v20.4s
+ eor v15.16b, v15.16b, v0.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ eor v20.16b, v20.16b, v10.16b
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+ ext v5.16b, v5.16b, v5.16b, #4
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v15.16b, v15.16b, v15.16b, #12
+ add v0.4s, v0.4s, v5.4s
+ eor v15.16b, v15.16b, v0.16b
+ rev32 v15.8h, v15.8h
+
+ add v10.4s, v10.4s, v15.4s
+ eor v5.16b, v5.16b, v10.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ add v0.4s, v0.4s, v20.4s
+ eor v15.16b, v15.16b, v0.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ eor v20.16b, v20.16b, v10.16b
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+ ext v5.16b, v5.16b, v5.16b, #12
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v15.16b, v15.16b, v15.16b, #4
+ subs x6, x6, #1
+ b.hi .Lopen_init_rounds
+
+ add v0.4s, v0.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+
+ and v0.16b, v0.16b, v27.16b
+ mov x16, v0.d[0] // Move the R key to GPRs
+ mov x17, v0.d[1]
+ mov v27.16b, v5.16b // Store the S key
+
+ bl .Lpoly_hash_ad_internal
+
+.Lopen_ad_done:
+ mov x3, x1
+
+// Each iteration of the loop hashes 320 bytes and prepares the stream for 320 bytes
+.Lopen_main_loop:
+
+ cmp x2, #192
+ b.lt .Lopen_tail
+
+ adrp x11, .Lchacha20_consts
+ add x11, x11, :lo12:.Lchacha20_consts
+
+ ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
+ mov v4.16b, v24.16b
+
+ ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
+ mov v9.16b, v28.16b
+
+ ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
+ mov v14.16b, v29.16b
+
+ ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
+ sub x5, x5, #32
+ add v15.4s, v15.4s, v25.4s
+ mov v19.16b, v30.16b
+
+ eor v20.16b, v20.16b, v20.16b //zero
+ not v21.16b, v20.16b // -1
+ sub v21.4s, v25.4s, v21.4s // Add +1
+ ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+ add v19.4s, v19.4s, v20.4s
+
+ lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12
+ sub x4, x4, #10
+
+ mov x7, #10
+	subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash
+ csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full
+
+ cbz x7, .Lopen_main_loop_rounds_short
+
+.align 5
+.Lopen_main_loop_rounds:
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+.Lopen_main_loop_rounds_short:
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v8.4s
+ add v4.4s, v4.4s, v9.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+ rev32 v18.8h, v18.8h
+ rev32 v19.8h, v19.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ add v13.4s, v13.4s, v18.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ eor v8.16b, v8.16b, v13.16b
+ eor v9.16b, v9.16b, v14.16b
+
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+ ushr v7.4s, v8.4s, #20
+ sli v7.4s, v8.4s, #12
+ ushr v8.4s, v9.4s, #20
+ sli v8.4s, v9.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ add v3.4s, v3.4s, v7.4s
+ add v4.4s, v4.4s, v8.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+ tbl v18.16b, {v18.16b}, v26.16b
+ tbl v19.16b, {v19.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ add v13.4s, v13.4s, v18.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ eor v7.16b, v7.16b, v13.16b
+ eor v8.16b, v8.16b, v14.16b
+
+ ushr v9.4s, v8.4s, #25
+ sli v9.4s, v8.4s, #7
+ ushr v8.4s, v7.4s, #25
+ sli v8.4s, v7.4s, #7
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v9.16b, v9.16b, v9.16b, #4
+ ext v14.16b, v14.16b, v14.16b, #8
+ ext v19.16b, v19.16b, v19.16b, #12
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ add v0.4s, v0.4s, v6.4s
+ add v1.4s, v1.4s, v7.4s
+ add v2.4s, v2.4s, v8.4s
+ add v3.4s, v3.4s, v5.4s
+ add v4.4s, v4.4s, v9.4s
+
+ eor v18.16b, v18.16b, v0.16b
+ eor v15.16b, v15.16b, v1.16b
+ eor v16.16b, v16.16b, v2.16b
+ eor v17.16b, v17.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ rev32 v18.8h, v18.8h
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+ rev32 v19.8h, v19.8h
+
+ add v12.4s, v12.4s, v18.4s
+ add v13.4s, v13.4s, v15.4s
+ add v10.4s, v10.4s, v16.4s
+ add v11.4s, v11.4s, v17.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v6.16b, v6.16b, v12.16b
+ eor v7.16b, v7.16b, v13.16b
+ eor v8.16b, v8.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v9.16b, v9.16b, v14.16b
+
+ ushr v20.4s, v6.4s, #20
+ sli v20.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+ ushr v7.4s, v8.4s, #20
+ sli v7.4s, v8.4s, #12
+ ushr v8.4s, v5.4s, #20
+ sli v8.4s, v5.4s, #12
+ ushr v5.4s, v9.4s, #20
+ sli v5.4s, v9.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v8.4s
+ add v4.4s, v4.4s, v5.4s
+
+ eor v18.16b, v18.16b, v0.16b
+ eor v15.16b, v15.16b, v1.16b
+ eor v16.16b, v16.16b, v2.16b
+ eor v17.16b, v17.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ tbl v18.16b, {v18.16b}, v26.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+ tbl v19.16b, {v19.16b}, v26.16b
+
+ add v12.4s, v12.4s, v18.4s
+ add v13.4s, v13.4s, v15.4s
+ add v10.4s, v10.4s, v16.4s
+ add v11.4s, v11.4s, v17.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v20.16b, v20.16b, v12.16b
+ eor v6.16b, v6.16b, v13.16b
+ eor v7.16b, v7.16b, v10.16b
+ eor v8.16b, v8.16b, v11.16b
+ eor v5.16b, v5.16b, v14.16b
+
+ ushr v9.4s, v5.4s, #25
+ sli v9.4s, v5.4s, #7
+ ushr v5.4s, v8.4s, #25
+ sli v5.4s, v8.4s, #7
+ ushr v8.4s, v7.4s, #25
+ sli v8.4s, v7.4s, #7
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v20.4s, #25
+ sli v6.4s, v20.4s, #7
+
+ ext v9.16b, v9.16b, v9.16b, #12
+ ext v14.16b, v14.16b, v14.16b, #8
+ ext v19.16b, v19.16b, v19.16b, #4
+ subs x7, x7, #1
+ b.gt .Lopen_main_loop_rounds
+ subs x6, x6, #1
+ b.ge .Lopen_main_loop_rounds_short
+
+ eor v20.16b, v20.16b, v20.16b //zero
+ not v21.16b, v20.16b // -1
+ sub v21.4s, v25.4s, v21.4s // Add +1
+ ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+ add v19.4s, v19.4s, v20.4s
+
+ add v15.4s, v15.4s, v25.4s
+ mov x11, #5
+ dup v20.4s, w11
+ add v25.4s, v25.4s, v20.4s
+
+ zip1 v20.4s, v0.4s, v1.4s
+ zip2 v21.4s, v0.4s, v1.4s
+ zip1 v22.4s, v2.4s, v3.4s
+ zip2 v23.4s, v2.4s, v3.4s
+
+ zip1 v0.2d, v20.2d, v22.2d
+ zip2 v1.2d, v20.2d, v22.2d
+ zip1 v2.2d, v21.2d, v23.2d
+ zip2 v3.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v5.4s, v6.4s
+ zip2 v21.4s, v5.4s, v6.4s
+ zip1 v22.4s, v7.4s, v8.4s
+ zip2 v23.4s, v7.4s, v8.4s
+
+ zip1 v5.2d, v20.2d, v22.2d
+ zip2 v6.2d, v20.2d, v22.2d
+ zip1 v7.2d, v21.2d, v23.2d
+ zip2 v8.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v10.4s, v11.4s
+ zip2 v21.4s, v10.4s, v11.4s
+ zip1 v22.4s, v12.4s, v13.4s
+ zip2 v23.4s, v12.4s, v13.4s
+
+ zip1 v10.2d, v20.2d, v22.2d
+ zip2 v11.2d, v20.2d, v22.2d
+ zip1 v12.2d, v21.2d, v23.2d
+ zip2 v13.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v15.4s, v16.4s
+ zip2 v21.4s, v15.4s, v16.4s
+ zip1 v22.4s, v17.4s, v18.4s
+ zip2 v23.4s, v17.4s, v18.4s
+
+ zip1 v15.2d, v20.2d, v22.2d
+ zip2 v16.2d, v20.2d, v22.2d
+ zip1 v17.2d, v21.2d, v23.2d
+ zip2 v18.2d, v21.2d, v23.2d
+
+ add v0.4s, v0.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+ add v10.4s, v10.4s, v29.4s
+ add v15.4s, v15.4s, v30.4s
+
+ add v1.4s, v1.4s, v24.4s
+ add v6.4s, v6.4s, v28.4s
+ add v11.4s, v11.4s, v29.4s
+ add v16.4s, v16.4s, v30.4s
+
+ add v2.4s, v2.4s, v24.4s
+ add v7.4s, v7.4s, v28.4s
+ add v12.4s, v12.4s, v29.4s
+ add v17.4s, v17.4s, v30.4s
+
+ add v3.4s, v3.4s, v24.4s
+ add v8.4s, v8.4s, v28.4s
+ add v13.4s, v13.4s, v29.4s
+ add v18.4s, v18.4s, v30.4s
+
+ add v4.4s, v4.4s, v24.4s
+ add v9.4s, v9.4s, v28.4s
+ add v14.4s, v14.4s, v29.4s
+ add v19.4s, v19.4s, v30.4s
+
+ // We can always safely store 192 bytes
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v0.16b
+ eor v21.16b, v21.16b, v5.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v15.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v1.16b
+ eor v21.16b, v21.16b, v6.16b
+ eor v22.16b, v22.16b, v11.16b
+ eor v23.16b, v23.16b, v16.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v2.16b
+ eor v21.16b, v21.16b, v7.16b
+ eor v22.16b, v22.16b, v12.16b
+ eor v23.16b, v23.16b, v17.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #192
+
+ mov v0.16b, v3.16b
+ mov v5.16b, v8.16b
+ mov v10.16b, v13.16b
+ mov v15.16b, v18.16b
+
+ cmp x2, #64
+ b.lt .Lopen_tail_64_store
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v3.16b
+ eor v21.16b, v21.16b, v8.16b
+ eor v22.16b, v22.16b, v13.16b
+ eor v23.16b, v23.16b, v18.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #64
+
+ mov v0.16b, v4.16b
+ mov v5.16b, v9.16b
+ mov v10.16b, v14.16b
+ mov v15.16b, v19.16b
+
+ cmp x2, #64
+ b.lt .Lopen_tail_64_store
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v4.16b
+ eor v21.16b, v21.16b, v9.16b
+ eor v22.16b, v22.16b, v14.16b
+ eor v23.16b, v23.16b, v19.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #64
+ b .Lopen_main_loop
+
+.Lopen_tail:
+
+ cbz x2, .Lopen_finalize
+
+ lsr x4, x2, #4 // How many whole blocks we have to hash
+
+ cmp x2, #64
+ b.le .Lopen_tail_64
+ cmp x2, #128
+ b.le .Lopen_tail_128
+
+.Lopen_tail_192:
+ // We need three more blocks
+ mov v0.16b, v24.16b
+ mov v1.16b, v24.16b
+ mov v2.16b, v24.16b
+ mov v5.16b, v28.16b
+ mov v6.16b, v28.16b
+ mov v7.16b, v28.16b
+ mov v10.16b, v29.16b
+ mov v11.16b, v29.16b
+ mov v12.16b, v29.16b
+ mov v15.16b, v30.16b
+ mov v16.16b, v30.16b
+ mov v17.16b, v30.16b
+ eor v23.16b, v23.16b, v23.16b
+ eor v21.16b, v21.16b, v21.16b
+ ins v23.s[0], v25.s[0]
+ ins v21.d[0], x15
+
+ add v22.4s, v23.4s, v21.4s
+ add v21.4s, v22.4s, v21.4s
+
+ add v15.4s, v15.4s, v21.4s
+ add v16.4s, v16.4s, v23.4s
+ add v17.4s, v17.4s, v22.4s
+
+ mov x7, #10
+ subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash
+ csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing
+ sub x4, x4, x7
+
+ cbz x7, .Lopen_tail_192_rounds_no_hash
+
+.Lopen_tail_192_rounds:
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+.Lopen_tail_192_rounds_no_hash:
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v5.16b, v5.16b, v5.16b, #4
+ ext v6.16b, v6.16b, v6.16b, #4
+ ext v7.16b, v7.16b, v7.16b, #4
+
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v12.16b, v12.16b, v12.16b, #8
+
+ ext v15.16b, v15.16b, v15.16b, #12
+ ext v16.16b, v16.16b, v16.16b, #12
+ ext v17.16b, v17.16b, v17.16b, #12
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v5.16b, v5.16b, v5.16b, #12
+ ext v6.16b, v6.16b, v6.16b, #12
+ ext v7.16b, v7.16b, v7.16b, #12
+
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v12.16b, v12.16b, v12.16b, #8
+
+ ext v15.16b, v15.16b, v15.16b, #4
+ ext v16.16b, v16.16b, v16.16b, #4
+ ext v17.16b, v17.16b, v17.16b, #4
+ subs x7, x7, #1
+ b.gt .Lopen_tail_192_rounds
+ subs x6, x6, #1
+ b.ge .Lopen_tail_192_rounds_no_hash
+
+ // We hashed 160 bytes at most, may still have 32 bytes left
+.Lopen_tail_192_hash:
+ cbz x4, .Lopen_tail_192_hash_done
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ sub x4, x4, #1
+ b .Lopen_tail_192_hash
+
+.Lopen_tail_192_hash_done:
+
+ add v0.4s, v0.4s, v24.4s
+ add v1.4s, v1.4s, v24.4s
+ add v2.4s, v2.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+ add v6.4s, v6.4s, v28.4s
+ add v7.4s, v7.4s, v28.4s
+ add v10.4s, v10.4s, v29.4s
+ add v11.4s, v11.4s, v29.4s
+ add v12.4s, v12.4s, v29.4s
+ add v15.4s, v15.4s, v30.4s
+ add v16.4s, v16.4s, v30.4s
+ add v17.4s, v17.4s, v30.4s
+
+ add v15.4s, v15.4s, v21.4s
+ add v16.4s, v16.4s, v23.4s
+ add v17.4s, v17.4s, v22.4s
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+
+ eor v20.16b, v20.16b, v1.16b
+ eor v21.16b, v21.16b, v6.16b
+ eor v22.16b, v22.16b, v11.16b
+ eor v23.16b, v23.16b, v16.16b
+
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+
+ eor v20.16b, v20.16b, v2.16b
+ eor v21.16b, v21.16b, v7.16b
+ eor v22.16b, v22.16b, v12.16b
+ eor v23.16b, v23.16b, v17.16b
+
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #128
+ b .Lopen_tail_64_store
+
+.Lopen_tail_128:
+ // We need two more blocks
+ mov v0.16b, v24.16b
+ mov v1.16b, v24.16b
+ mov v5.16b, v28.16b
+ mov v6.16b, v28.16b
+ mov v10.16b, v29.16b
+ mov v11.16b, v29.16b
+ mov v15.16b, v30.16b
+ mov v16.16b, v30.16b
+ eor v23.16b, v23.16b, v23.16b
+ eor v22.16b, v22.16b, v22.16b
+ ins v23.s[0], v25.s[0]
+ ins v22.d[0], x15
+ add v22.4s, v22.4s, v23.4s
+
+ add v15.4s, v15.4s, v22.4s
+ add v16.4s, v16.4s, v23.4s
+
+ mov x6, #10
+ sub x6, x6, x4
+
+.Lopen_tail_128_rounds:
+ add v0.4s, v0.4s, v5.4s
+ eor v15.16b, v15.16b, v0.16b
+ rev32 v15.8h, v15.8h
+
+ add v10.4s, v10.4s, v15.4s
+ eor v5.16b, v5.16b, v10.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ add v0.4s, v0.4s, v20.4s
+ eor v15.16b, v15.16b, v0.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ eor v20.16b, v20.16b, v10.16b
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+ ext v5.16b, v5.16b, v5.16b, #4
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v15.16b, v15.16b, v15.16b, #12
+ add v1.4s, v1.4s, v6.4s
+ eor v16.16b, v16.16b, v1.16b
+ rev32 v16.8h, v16.8h
+
+ add v11.4s, v11.4s, v16.4s
+ eor v6.16b, v6.16b, v11.16b
+ ushr v20.4s, v6.4s, #20
+ sli v20.4s, v6.4s, #12
+ add v1.4s, v1.4s, v20.4s
+ eor v16.16b, v16.16b, v1.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+
+ add v11.4s, v11.4s, v16.4s
+ eor v20.16b, v20.16b, v11.16b
+ ushr v6.4s, v20.4s, #25
+ sli v6.4s, v20.4s, #7
+ ext v6.16b, v6.16b, v6.16b, #4
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v16.16b, v16.16b, v16.16b, #12
+ add v0.4s, v0.4s, v5.4s
+ eor v15.16b, v15.16b, v0.16b
+ rev32 v15.8h, v15.8h
+
+ add v10.4s, v10.4s, v15.4s
+ eor v5.16b, v5.16b, v10.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ add v0.4s, v0.4s, v20.4s
+ eor v15.16b, v15.16b, v0.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ eor v20.16b, v20.16b, v10.16b
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+ ext v5.16b, v5.16b, v5.16b, #12
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v15.16b, v15.16b, v15.16b, #4
+ add v1.4s, v1.4s, v6.4s
+ eor v16.16b, v16.16b, v1.16b
+ rev32 v16.8h, v16.8h
+
+ add v11.4s, v11.4s, v16.4s
+ eor v6.16b, v6.16b, v11.16b
+ ushr v20.4s, v6.4s, #20
+ sli v20.4s, v6.4s, #12
+ add v1.4s, v1.4s, v20.4s
+ eor v16.16b, v16.16b, v1.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+
+ add v11.4s, v11.4s, v16.4s
+ eor v20.16b, v20.16b, v11.16b
+ ushr v6.4s, v20.4s, #25
+ sli v6.4s, v20.4s, #7
+ ext v6.16b, v6.16b, v6.16b, #12
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v16.16b, v16.16b, v16.16b, #4
+ subs x6, x6, #1
+ b.gt .Lopen_tail_128_rounds
+ cbz x4, .Lopen_tail_128_rounds_done
+ subs x4, x4, #1
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ b .Lopen_tail_128_rounds
+
+.Lopen_tail_128_rounds_done:
+ add v0.4s, v0.4s, v24.4s
+ add v1.4s, v1.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+ add v6.4s, v6.4s, v28.4s
+ add v10.4s, v10.4s, v29.4s
+ add v11.4s, v11.4s, v29.4s
+ add v15.4s, v15.4s, v30.4s
+ add v16.4s, v16.4s, v30.4s
+ add v15.4s, v15.4s, v22.4s
+ add v16.4s, v16.4s, v23.4s
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+
+ eor v20.16b, v20.16b, v1.16b
+ eor v21.16b, v21.16b, v6.16b
+ eor v22.16b, v22.16b, v11.16b
+ eor v23.16b, v23.16b, v16.16b
+
+ st1 {v20.16b - v23.16b}, [x0], #64
+ sub x2, x2, #64
+
+ b .Lopen_tail_64_store
+
+.Lopen_tail_64:
+ // We just need a single block
+ mov v0.16b, v24.16b
+ mov v5.16b, v28.16b
+ mov v10.16b, v29.16b
+ mov v15.16b, v30.16b
+ eor v23.16b, v23.16b, v23.16b
+ ins v23.s[0], v25.s[0]
+ add v15.4s, v15.4s, v23.4s
+
+ mov x6, #10
+ sub x6, x6, x4
+
+.Lopen_tail_64_rounds:
+ add v0.4s, v0.4s, v5.4s
+ eor v15.16b, v15.16b, v0.16b
+ rev32 v15.8h, v15.8h
+
+ add v10.4s, v10.4s, v15.4s
+ eor v5.16b, v5.16b, v10.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ add v0.4s, v0.4s, v20.4s
+ eor v15.16b, v15.16b, v0.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ eor v20.16b, v20.16b, v10.16b
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+ ext v5.16b, v5.16b, v5.16b, #4
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v15.16b, v15.16b, v15.16b, #12
+ add v0.4s, v0.4s, v5.4s
+ eor v15.16b, v15.16b, v0.16b
+ rev32 v15.8h, v15.8h
+
+ add v10.4s, v10.4s, v15.4s
+ eor v5.16b, v5.16b, v10.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ add v0.4s, v0.4s, v20.4s
+ eor v15.16b, v15.16b, v0.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ eor v20.16b, v20.16b, v10.16b
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+ ext v5.16b, v5.16b, v5.16b, #12
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v15.16b, v15.16b, v15.16b, #4
+ subs x6, x6, #1
+ b.gt .Lopen_tail_64_rounds
+ cbz x4, .Lopen_tail_64_rounds_done
+ subs x4, x4, #1
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ b .Lopen_tail_64_rounds
+
+.Lopen_tail_64_rounds_done:
+ add v0.4s, v0.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+ add v10.4s, v10.4s, v29.4s
+ add v15.4s, v15.4s, v30.4s
+ add v15.4s, v15.4s, v23.4s
+
+.Lopen_tail_64_store:
+ cmp x2, #16
+ b.lt .Lopen_tail_16
+
+ ld1 {v20.16b}, [x1], #16
+ eor v20.16b, v20.16b, v0.16b
+ st1 {v20.16b}, [x0], #16
+ mov v0.16b, v5.16b
+ mov v5.16b, v10.16b
+ mov v10.16b, v15.16b
+ sub x2, x2, #16
+ b .Lopen_tail_64_store
+
+.Lopen_tail_16:
+ // Here we handle the last [0,16) bytes that require a padded block
+ cbz x2, .Lopen_finalize
+
+ eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext
+ eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask
+ not v22.16b, v20.16b
+
+ add x7, x1, x2
+ mov x6, x2
+
+.Lopen_tail_16_compose:
+ ext v20.16b, v20.16b, v20.16b, #15
+ ldrb w11, [x7, #-1]!
+ mov v20.b[0], w11
+ ext v21.16b, v22.16b, v21.16b, #15
+ subs x2, x2, #1
+ b.gt .Lopen_tail_16_compose
+
+ and v20.16b, v20.16b, v21.16b
+ // Hash in the final padded block
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ eor v20.16b, v20.16b, v0.16b
+
+.Lopen_tail_16_store:
+ umov w11, v20.b[0]
+ strb w11, [x0], #1
+ ext v20.16b, v20.16b, v20.16b, #1
+ subs x6, x6, #1
+ b.gt .Lopen_tail_16_store
+
+.Lopen_finalize:
+ mov x11, v31.d[0]
+ mov x12, v31.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ // Final reduction step
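+	// Conditionally subtract p = 2^130 - 5 (limbs 2^64-5, 2^64-1, 3): if the
+	// subtraction produces no borrow (carry set), acc >= p and acc - p is kept.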
+ sub x12, xzr, x15
+ orr x13, xzr, #3
+ subs x11, x8, #-5
+ sbcs x12, x9, x12
+ sbcs x13, x10, x13
+ csel x8, x11, x8, cs
+ csel x9, x12, x9, cs
+ csel x10, x13, x10, cs
+ mov x11, v27.d[0]
+ mov x12, v27.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+
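+	// Store the 128-bit tag (acc + s) back through the data pointer.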
+ stp x8, x9, [x5]
+
+ ldp d8, d9, [sp, #16]
+ ldp d10, d11, [sp, #32]
+ ldp d12, d13, [sp, #48]
+ ldp d14, d15, [sp, #64]
+.cfi_restore b15
+.cfi_restore b14
+.cfi_restore b13
+.cfi_restore b12
+.cfi_restore b11
+.cfi_restore b10
+.cfi_restore b9
+.cfi_restore b8
+ ldp x29, x30, [sp], 80
+.cfi_restore w29
+.cfi_restore w30
+.cfi_def_cfa_offset 0
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.Lopen_128:
+ // On some architectures preparing 5 blocks for small buffers is wasteful
+ eor v25.16b, v25.16b, v25.16b
+ mov x11, #1
+ mov v25.s[0], w11
+ mov v0.16b, v24.16b
+ mov v1.16b, v24.16b
+ mov v2.16b, v24.16b
+ mov v5.16b, v28.16b
+ mov v6.16b, v28.16b
+ mov v7.16b, v28.16b
+ mov v10.16b, v29.16b
+ mov v11.16b, v29.16b
+ mov v12.16b, v29.16b
+ mov v17.16b, v30.16b
+ add v15.4s, v17.4s, v25.4s
+ add v16.4s, v15.4s, v25.4s
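+	// v25 is now {1,0,0,0}: block 2 keeps the original counter (its output is
+	// clamped below for the Poly1305 R/S keys), blocks 0 and 1 get counter+1/+2.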
+
+ mov x6, #10
+
+.Lopen_128_rounds:
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v5.16b, v5.16b, v5.16b, #4
+ ext v6.16b, v6.16b, v6.16b, #4
+ ext v7.16b, v7.16b, v7.16b, #4
+
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v12.16b, v12.16b, v12.16b, #8
+
+ ext v15.16b, v15.16b, v15.16b, #12
+ ext v16.16b, v16.16b, v16.16b, #12
+ ext v17.16b, v17.16b, v17.16b, #12
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v5.16b, v5.16b, v5.16b, #12
+ ext v6.16b, v6.16b, v6.16b, #12
+ ext v7.16b, v7.16b, v7.16b, #12
+
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v12.16b, v12.16b, v12.16b, #8
+
+ ext v15.16b, v15.16b, v15.16b, #4
+ ext v16.16b, v16.16b, v16.16b, #4
+ ext v17.16b, v17.16b, v17.16b, #4
+ subs x6, x6, #1
+ b.hi .Lopen_128_rounds
+
+ add v0.4s, v0.4s, v24.4s
+ add v1.4s, v1.4s, v24.4s
+ add v2.4s, v2.4s, v24.4s
+
+ add v5.4s, v5.4s, v28.4s
+ add v6.4s, v6.4s, v28.4s
+ add v7.4s, v7.4s, v28.4s
+
+ add v10.4s, v10.4s, v29.4s
+ add v11.4s, v11.4s, v29.4s
+
+ add v30.4s, v30.4s, v25.4s
+ add v15.4s, v15.4s, v30.4s
+ add v30.4s, v30.4s, v25.4s
+ add v16.4s, v16.4s, v30.4s
+
+ and v2.16b, v2.16b, v27.16b
+ mov x16, v2.d[0] // Move the R key to GPRs
+ mov x17, v2.d[1]
+ mov v27.16b, v7.16b // Store the S key
+
+ bl .Lpoly_hash_ad_internal
+
+.Lopen_128_store:
+ cmp x2, #64
+ b.lt .Lopen_128_store_64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ mov x11, v21.d[0]
+ mov x12, v21.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ mov x11, v22.d[0]
+ mov x12, v22.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ mov x11, v23.d[0]
+ mov x12, v23.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+
+ eor v20.16b, v20.16b, v0.16b
+ eor v21.16b, v21.16b, v5.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v15.16b
+
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #64
+
+ mov v0.16b, v1.16b
+ mov v5.16b, v6.16b
+ mov v10.16b, v11.16b
+ mov v15.16b, v16.16b
+
+.Lopen_128_store_64:
+
+ lsr x4, x2, #4
+ mov x3, x1
+
+.Lopen_128_hash_64:
+ cbz x4, .Lopen_tail_64_store
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ sub x4, x4, #1
+ b .Lopen_128_hash_64
+.cfi_endproc
+.size chacha20_poly1305_open,.-chacha20_poly1305_open
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/crypto/chacha20_poly1305_armv8-win.S b/gen/crypto/chacha20_poly1305_armv8-win.S
new file mode 100644
index 0000000..3314f2c
--- /dev/null
+++ b/gen/crypto/chacha20_poly1305_armv8-win.S
@@ -0,0 +1,3015 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+.section .rodata
+
+.align 7
+Lchacha20_consts:
+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+Linc:
+.long 1,2,3,4
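+// Byte-shuffle table for tbl that rotates each 32-bit lane left by 8 bits.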
+Lrol8:
+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
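+// Poly1305 "r" clamping mask.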
+Lclamp:
+.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
+
+.text
+
+.def Lpoly_hash_ad_internal
+ .type 32
+.endef
+.align 6
+Lpoly_hash_ad_internal:
+.cfi_startproc
+ cbnz x4, Lpoly_hash_intro
+ ret
+
+Lpoly_hash_intro:
+ cmp x4, #16
+ b.lt Lpoly_hash_ad_tail
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ sub x4, x4, #16
+ b Lpoly_hash_ad_internal
+
+Lpoly_hash_ad_tail:
+ cbz x4, Lpoly_hash_ad_ret
+
+ eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD
+ sub x4, x4, #1
+
+Lpoly_hash_tail_16_compose:
+ ext v20.16b, v20.16b, v20.16b, #15
+ ldrb w11, [x3, x4]
+ mov v20.b[0], w11
+ subs x4, x4, #1
+ b.ge Lpoly_hash_tail_16_compose
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+
+Lpoly_hash_ad_ret:
+ ret
+.cfi_endproc
+
+
+/////////////////////////////////
+//
+// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data);
+//
+.globl chacha20_poly1305_seal
+
+.def chacha20_poly1305_seal
+ .type 32
+.endef
+.align 6
+chacha20_poly1305_seal:
+ AARCH64_SIGN_LINK_REGISTER
+.cfi_startproc
+ stp x29, x30, [sp, #-80]!
+.cfi_def_cfa_offset 80
+.cfi_offset w30, -72
+.cfi_offset w29, -80
+ mov x29, sp
+ // We probably could do .cfi_def_cfa w29, 80 at this point, but since
+ // we don't actually use the frame pointer like that, it's probably not
+ // worth bothering.
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+.cfi_offset b15, -8
+.cfi_offset b14, -16
+.cfi_offset b13, -24
+.cfi_offset b12, -32
+.cfi_offset b11, -40
+.cfi_offset b10, -48
+.cfi_offset b9, -56
+.cfi_offset b8, -64
+
+ adrp x11, Lchacha20_consts
+ add x11, x11, :lo12:Lchacha20_consts
+
+ ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
+ ld1 {v28.16b - v30.16b}, [x5]
+
+ mov x15, #1 // Prepare the Poly1305 state
+ mov x8, #0
+ mov x9, #0
+ mov x10, #0
+
+ ldr x12, [x5, #56] // The total cipher text length includes extra_in_len
+ add x12, x12, x2
+ mov v31.d[0], x4 // Store the input and aad lengths
+ mov v31.d[1], x12
+
+ cmp x2, #128
+ b.le Lseal_128 // Optimization for smaller buffers
+
+ // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext,
+ // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically,
+ // the fifth block (A4-D4) horizontally.
+ ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
+ mov v4.16b, v24.16b
+
+ ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
+ mov v9.16b, v28.16b
+
+ ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
+ mov v14.16b, v29.16b
+
+ ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
+ add v15.4s, v15.4s, v25.4s
+ mov v19.16b, v30.16b
+
+ sub x5, x5, #32
+
+ mov x6, #10
+
+.align 5
+Lseal_init_rounds:
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v8.4s
+ add v4.4s, v4.4s, v9.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+ rev32 v18.8h, v18.8h
+ rev32 v19.8h, v19.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ add v13.4s, v13.4s, v18.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ eor v8.16b, v8.16b, v13.16b
+ eor v9.16b, v9.16b, v14.16b
+
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+ ushr v7.4s, v8.4s, #20
+ sli v7.4s, v8.4s, #12
+ ushr v8.4s, v9.4s, #20
+ sli v8.4s, v9.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ add v3.4s, v3.4s, v7.4s
+ add v4.4s, v4.4s, v8.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+ tbl v18.16b, {v18.16b}, v26.16b
+ tbl v19.16b, {v19.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ add v13.4s, v13.4s, v18.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ eor v7.16b, v7.16b, v13.16b
+ eor v8.16b, v8.16b, v14.16b
+
+ ushr v9.4s, v8.4s, #25
+ sli v9.4s, v8.4s, #7
+ ushr v8.4s, v7.4s, #25
+ sli v8.4s, v7.4s, #7
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v9.16b, v9.16b, v9.16b, #4
+ ext v14.16b, v14.16b, v14.16b, #8
+ ext v19.16b, v19.16b, v19.16b, #12
+ add v0.4s, v0.4s, v6.4s
+ add v1.4s, v1.4s, v7.4s
+ add v2.4s, v2.4s, v8.4s
+ add v3.4s, v3.4s, v5.4s
+ add v4.4s, v4.4s, v9.4s
+
+ eor v18.16b, v18.16b, v0.16b
+ eor v15.16b, v15.16b, v1.16b
+ eor v16.16b, v16.16b, v2.16b
+ eor v17.16b, v17.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ rev32 v18.8h, v18.8h
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+ rev32 v19.8h, v19.8h
+
+ add v12.4s, v12.4s, v18.4s
+ add v13.4s, v13.4s, v15.4s
+ add v10.4s, v10.4s, v16.4s
+ add v11.4s, v11.4s, v17.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v6.16b, v6.16b, v12.16b
+ eor v7.16b, v7.16b, v13.16b
+ eor v8.16b, v8.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v9.16b, v9.16b, v14.16b
+
+ ushr v20.4s, v6.4s, #20
+ sli v20.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+ ushr v7.4s, v8.4s, #20
+ sli v7.4s, v8.4s, #12
+ ushr v8.4s, v5.4s, #20
+ sli v8.4s, v5.4s, #12
+ ushr v5.4s, v9.4s, #20
+ sli v5.4s, v9.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v8.4s
+ add v4.4s, v4.4s, v5.4s
+
+ eor v18.16b, v18.16b, v0.16b
+ eor v15.16b, v15.16b, v1.16b
+ eor v16.16b, v16.16b, v2.16b
+ eor v17.16b, v17.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ tbl v18.16b, {v18.16b}, v26.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+ tbl v19.16b, {v19.16b}, v26.16b
+
+ add v12.4s, v12.4s, v18.4s
+ add v13.4s, v13.4s, v15.4s
+ add v10.4s, v10.4s, v16.4s
+ add v11.4s, v11.4s, v17.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v20.16b, v20.16b, v12.16b
+ eor v6.16b, v6.16b, v13.16b
+ eor v7.16b, v7.16b, v10.16b
+ eor v8.16b, v8.16b, v11.16b
+ eor v5.16b, v5.16b, v14.16b
+
+ ushr v9.4s, v5.4s, #25
+ sli v9.4s, v5.4s, #7
+ ushr v5.4s, v8.4s, #25
+ sli v5.4s, v8.4s, #7
+ ushr v8.4s, v7.4s, #25
+ sli v8.4s, v7.4s, #7
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v20.4s, #25
+ sli v6.4s, v20.4s, #7
+
+ ext v9.16b, v9.16b, v9.16b, #12
+ ext v14.16b, v14.16b, v14.16b, #8
+ ext v19.16b, v19.16b, v19.16b, #4
+ subs x6, x6, #1
+ b.hi Lseal_init_rounds
+
+ add v15.4s, v15.4s, v25.4s
+ mov x11, #4
+ dup v20.4s, w11
+ add v25.4s, v25.4s, v20.4s
+
+ zip1 v20.4s, v0.4s, v1.4s
+ zip2 v21.4s, v0.4s, v1.4s
+ zip1 v22.4s, v2.4s, v3.4s
+ zip2 v23.4s, v2.4s, v3.4s
+
+ zip1 v0.2d, v20.2d, v22.2d
+ zip2 v1.2d, v20.2d, v22.2d
+ zip1 v2.2d, v21.2d, v23.2d
+ zip2 v3.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v5.4s, v6.4s
+ zip2 v21.4s, v5.4s, v6.4s
+ zip1 v22.4s, v7.4s, v8.4s
+ zip2 v23.4s, v7.4s, v8.4s
+
+ zip1 v5.2d, v20.2d, v22.2d
+ zip2 v6.2d, v20.2d, v22.2d
+ zip1 v7.2d, v21.2d, v23.2d
+ zip2 v8.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v10.4s, v11.4s
+ zip2 v21.4s, v10.4s, v11.4s
+ zip1 v22.4s, v12.4s, v13.4s
+ zip2 v23.4s, v12.4s, v13.4s
+
+ zip1 v10.2d, v20.2d, v22.2d
+ zip2 v11.2d, v20.2d, v22.2d
+ zip1 v12.2d, v21.2d, v23.2d
+ zip2 v13.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v15.4s, v16.4s
+ zip2 v21.4s, v15.4s, v16.4s
+ zip1 v22.4s, v17.4s, v18.4s
+ zip2 v23.4s, v17.4s, v18.4s
+
+ zip1 v15.2d, v20.2d, v22.2d
+ zip2 v16.2d, v20.2d, v22.2d
+ zip1 v17.2d, v21.2d, v23.2d
+ zip2 v18.2d, v21.2d, v23.2d
+
+ add v4.4s, v4.4s, v24.4s
+ add v9.4s, v9.4s, v28.4s
+ and v4.16b, v4.16b, v27.16b
+
+ add v0.4s, v0.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+ add v10.4s, v10.4s, v29.4s
+ add v15.4s, v15.4s, v30.4s
+
+ add v1.4s, v1.4s, v24.4s
+ add v6.4s, v6.4s, v28.4s
+ add v11.4s, v11.4s, v29.4s
+ add v16.4s, v16.4s, v30.4s
+
+ add v2.4s, v2.4s, v24.4s
+ add v7.4s, v7.4s, v28.4s
+ add v12.4s, v12.4s, v29.4s
+ add v17.4s, v17.4s, v30.4s
+
+ add v3.4s, v3.4s, v24.4s
+ add v8.4s, v8.4s, v28.4s
+ add v13.4s, v13.4s, v29.4s
+ add v18.4s, v18.4s, v30.4s
+
+ mov x16, v4.d[0] // Move the R key to GPRs
+ mov x17, v4.d[1]
+ mov v27.16b, v9.16b // Store the S key
+
+ bl Lpoly_hash_ad_internal
+
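+	// In seal, Poly1305 is computed over the ciphertext just written, so hash from the output buffer.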
+ mov x3, x0
+ cmp x2, #256
+ b.le Lseal_tail
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v0.16b
+ eor v21.16b, v21.16b, v5.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v15.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v1.16b
+ eor v21.16b, v21.16b, v6.16b
+ eor v22.16b, v22.16b, v11.16b
+ eor v23.16b, v23.16b, v16.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v2.16b
+ eor v21.16b, v21.16b, v7.16b
+ eor v22.16b, v22.16b, v12.16b
+ eor v23.16b, v23.16b, v17.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v3.16b
+ eor v21.16b, v21.16b, v8.16b
+ eor v22.16b, v22.16b, v13.16b
+ eor v23.16b, v23.16b, v18.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #256
+
+ mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds
+ mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256
+
+Lseal_main_loop:
+ adrp x11, Lchacha20_consts
+ add x11, x11, :lo12:Lchacha20_consts
+
+ ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
+ mov v4.16b, v24.16b
+
+ ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
+ mov v9.16b, v28.16b
+
+ ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
+ mov v14.16b, v29.16b
+
+ ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
+ add v15.4s, v15.4s, v25.4s
+ mov v19.16b, v30.16b
+
+ eor v20.16b, v20.16b, v20.16b //zero
+ not v21.16b, v20.16b // -1
+ sub v21.4s, v25.4s, v21.4s // Add +1
+ ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+ add v19.4s, v19.4s, v20.4s
+
+ sub x5, x5, #32
+.align 5
+Lseal_main_loop_rounds:
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v8.4s
+ add v4.4s, v4.4s, v9.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+ rev32 v18.8h, v18.8h
+ rev32 v19.8h, v19.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ add v13.4s, v13.4s, v18.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ eor v8.16b, v8.16b, v13.16b
+ eor v9.16b, v9.16b, v14.16b
+
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+ ushr v7.4s, v8.4s, #20
+ sli v7.4s, v8.4s, #12
+ ushr v8.4s, v9.4s, #20
+ sli v8.4s, v9.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ add v3.4s, v3.4s, v7.4s
+ add v4.4s, v4.4s, v8.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+ tbl v18.16b, {v18.16b}, v26.16b
+ tbl v19.16b, {v19.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ add v13.4s, v13.4s, v18.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ eor v7.16b, v7.16b, v13.16b
+ eor v8.16b, v8.16b, v14.16b
+
+ ushr v9.4s, v8.4s, #25
+ sli v9.4s, v8.4s, #7
+ ushr v8.4s, v7.4s, #25
+ sli v8.4s, v7.4s, #7
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v9.16b, v9.16b, v9.16b, #4
+ ext v14.16b, v14.16b, v14.16b, #8
+ ext v19.16b, v19.16b, v19.16b, #12
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ add v0.4s, v0.4s, v6.4s
+ add v1.4s, v1.4s, v7.4s
+ add v2.4s, v2.4s, v8.4s
+ add v3.4s, v3.4s, v5.4s
+ add v4.4s, v4.4s, v9.4s
+
+ eor v18.16b, v18.16b, v0.16b
+ eor v15.16b, v15.16b, v1.16b
+ eor v16.16b, v16.16b, v2.16b
+ eor v17.16b, v17.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ rev32 v18.8h, v18.8h
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+ rev32 v19.8h, v19.8h
+
+ add v12.4s, v12.4s, v18.4s
+ add v13.4s, v13.4s, v15.4s
+ add v10.4s, v10.4s, v16.4s
+ add v11.4s, v11.4s, v17.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v6.16b, v6.16b, v12.16b
+ eor v7.16b, v7.16b, v13.16b
+ eor v8.16b, v8.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v9.16b, v9.16b, v14.16b
+
+ ushr v20.4s, v6.4s, #20
+ sli v20.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+ ushr v7.4s, v8.4s, #20
+ sli v7.4s, v8.4s, #12
+ ushr v8.4s, v5.4s, #20
+ sli v8.4s, v5.4s, #12
+ ushr v5.4s, v9.4s, #20
+ sli v5.4s, v9.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v8.4s
+ add v4.4s, v4.4s, v5.4s
+
+ eor v18.16b, v18.16b, v0.16b
+ eor v15.16b, v15.16b, v1.16b
+ eor v16.16b, v16.16b, v2.16b
+ eor v17.16b, v17.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ tbl v18.16b, {v18.16b}, v26.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+ tbl v19.16b, {v19.16b}, v26.16b
+
+ add v12.4s, v12.4s, v18.4s
+ add v13.4s, v13.4s, v15.4s
+ add v10.4s, v10.4s, v16.4s
+ add v11.4s, v11.4s, v17.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v20.16b, v20.16b, v12.16b
+ eor v6.16b, v6.16b, v13.16b
+ eor v7.16b, v7.16b, v10.16b
+ eor v8.16b, v8.16b, v11.16b
+ eor v5.16b, v5.16b, v14.16b
+
+ ushr v9.4s, v5.4s, #25
+ sli v9.4s, v5.4s, #7
+ ushr v5.4s, v8.4s, #25
+ sli v5.4s, v8.4s, #7
+ ushr v8.4s, v7.4s, #25
+ sli v8.4s, v7.4s, #7
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v20.4s, #25
+ sli v6.4s, v20.4s, #7
+
+ ext v9.16b, v9.16b, v9.16b, #12
+ ext v14.16b, v14.16b, v14.16b, #8
+ ext v19.16b, v19.16b, v19.16b, #4
+ subs x6, x6, #1
+ b.ge Lseal_main_loop_rounds
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ subs x7, x7, #1
+ b.gt Lseal_main_loop_rounds
+
+ eor v20.16b, v20.16b, v20.16b //zero
+ not v21.16b, v20.16b // -1
+ sub v21.4s, v25.4s, v21.4s // Add +1
+ ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+ add v19.4s, v19.4s, v20.4s
+
+ add v15.4s, v15.4s, v25.4s
+ mov x11, #5
+ dup v20.4s, w11
+ add v25.4s, v25.4s, v20.4s
+
+ zip1 v20.4s, v0.4s, v1.4s
+ zip2 v21.4s, v0.4s, v1.4s
+ zip1 v22.4s, v2.4s, v3.4s
+ zip2 v23.4s, v2.4s, v3.4s
+
+ zip1 v0.2d, v20.2d, v22.2d
+ zip2 v1.2d, v20.2d, v22.2d
+ zip1 v2.2d, v21.2d, v23.2d
+ zip2 v3.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v5.4s, v6.4s
+ zip2 v21.4s, v5.4s, v6.4s
+ zip1 v22.4s, v7.4s, v8.4s
+ zip2 v23.4s, v7.4s, v8.4s
+
+ zip1 v5.2d, v20.2d, v22.2d
+ zip2 v6.2d, v20.2d, v22.2d
+ zip1 v7.2d, v21.2d, v23.2d
+ zip2 v8.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v10.4s, v11.4s
+ zip2 v21.4s, v10.4s, v11.4s
+ zip1 v22.4s, v12.4s, v13.4s
+ zip2 v23.4s, v12.4s, v13.4s
+
+ zip1 v10.2d, v20.2d, v22.2d
+ zip2 v11.2d, v20.2d, v22.2d
+ zip1 v12.2d, v21.2d, v23.2d
+ zip2 v13.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v15.4s, v16.4s
+ zip2 v21.4s, v15.4s, v16.4s
+ zip1 v22.4s, v17.4s, v18.4s
+ zip2 v23.4s, v17.4s, v18.4s
+
+ zip1 v15.2d, v20.2d, v22.2d
+ zip2 v16.2d, v20.2d, v22.2d
+ zip1 v17.2d, v21.2d, v23.2d
+ zip2 v18.2d, v21.2d, v23.2d
+
+ add v0.4s, v0.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+ add v10.4s, v10.4s, v29.4s
+ add v15.4s, v15.4s, v30.4s
+
+ add v1.4s, v1.4s, v24.4s
+ add v6.4s, v6.4s, v28.4s
+ add v11.4s, v11.4s, v29.4s
+ add v16.4s, v16.4s, v30.4s
+
+ add v2.4s, v2.4s, v24.4s
+ add v7.4s, v7.4s, v28.4s
+ add v12.4s, v12.4s, v29.4s
+ add v17.4s, v17.4s, v30.4s
+
+ add v3.4s, v3.4s, v24.4s
+ add v8.4s, v8.4s, v28.4s
+ add v13.4s, v13.4s, v29.4s
+ add v18.4s, v18.4s, v30.4s
+
+ add v4.4s, v4.4s, v24.4s
+ add v9.4s, v9.4s, v28.4s
+ add v14.4s, v14.4s, v29.4s
+ add v19.4s, v19.4s, v30.4s
+
+ cmp x2, #320
+ b.le Lseal_tail
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v0.16b
+ eor v21.16b, v21.16b, v5.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v15.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v1.16b
+ eor v21.16b, v21.16b, v6.16b
+ eor v22.16b, v22.16b, v11.16b
+ eor v23.16b, v23.16b, v16.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v2.16b
+ eor v21.16b, v21.16b, v7.16b
+ eor v22.16b, v22.16b, v12.16b
+ eor v23.16b, v23.16b, v17.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v3.16b
+ eor v21.16b, v21.16b, v8.16b
+ eor v22.16b, v22.16b, v13.16b
+ eor v23.16b, v23.16b, v18.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v4.16b
+ eor v21.16b, v21.16b, v9.16b
+ eor v22.16b, v22.16b, v14.16b
+ eor v23.16b, v23.16b, v19.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #320
+
+ mov x6, #0
+ mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration
+
+ b Lseal_main_loop
+
+Lseal_tail:
+ // This part of the function handles the storage and authentication of the last [0,320) bytes
+ // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data.
+ cmp x2, #64
+ b.lt Lseal_tail_64
+
+ // Store and authenticate 64B blocks per iteration
+ ld1 {v20.16b - v23.16b}, [x1], #64
+
+ eor v20.16b, v20.16b, v0.16b
+ eor v21.16b, v21.16b, v5.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v15.16b
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ mov x11, v21.d[0]
+ mov x12, v21.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ mov x11, v22.d[0]
+ mov x12, v22.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ mov x11, v23.d[0]
+ mov x12, v23.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ st1 {v20.16b - v23.16b}, [x0], #64
+ sub x2, x2, #64
+
+ // Shift the state left by 64 bytes for the next iteration of the loop
+ mov v0.16b, v1.16b
+ mov v5.16b, v6.16b
+ mov v10.16b, v11.16b
+ mov v15.16b, v16.16b
+
+ mov v1.16b, v2.16b
+ mov v6.16b, v7.16b
+ mov v11.16b, v12.16b
+ mov v16.16b, v17.16b
+
+ mov v2.16b, v3.16b
+ mov v7.16b, v8.16b
+ mov v12.16b, v13.16b
+ mov v17.16b, v18.16b
+
+ mov v3.16b, v4.16b
+ mov v8.16b, v9.16b
+ mov v13.16b, v14.16b
+ mov v18.16b, v19.16b
+
+ b Lseal_tail
+
+Lseal_tail_64:
+ ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr
+
+ // Here we handle the last [0,64) bytes of plaintext
+ cmp x2, #16
+ b.lt Lseal_tail_16
+	// Each iteration encrypts and authenticates a 16B block
+ ld1 {v20.16b}, [x1], #16
+ eor v20.16b, v20.16b, v0.16b
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ st1 {v20.16b}, [x0], #16
+
+ sub x2, x2, #16
+
+ // Shift the state left by 16 bytes for the next iteration of the loop
+ mov v0.16b, v5.16b
+ mov v5.16b, v10.16b
+ mov v10.16b, v15.16b
+
+ b Lseal_tail_64
+
+Lseal_tail_16:
+ // Here we handle the last [0,16) bytes of ciphertext that require a padded block
+ cbz x2, Lseal_hash_extra
+
+ eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in
+ eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes
+ not v22.16b, v20.16b
+
+ mov x6, x2
+ add x1, x1, x2
+
+ cbz x4, Lseal_tail_16_compose // No extra data to pad with, zero padding
+
+ mov x7, #16 // We need to load some extra_in first for padding
+ sub x7, x7, x2
+ cmp x4, x7
+ csel x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register
+ mov x12, x7
+ add x3, x3, x7
+ sub x4, x4, x7
+
+Lseal_tail16_compose_extra_in:
+ ext v20.16b, v20.16b, v20.16b, #15
+ ldrb w11, [x3, #-1]!
+ mov v20.b[0], w11
+ subs x7, x7, #1
+ b.gt Lseal_tail16_compose_extra_in
+
+ add x3, x3, x12
+
+Lseal_tail_16_compose:
+ ext v20.16b, v20.16b, v20.16b, #15
+ ldrb w11, [x1, #-1]!
+ mov v20.b[0], w11
+ ext v21.16b, v22.16b, v21.16b, #15
+ subs x2, x2, #1
+ b.gt Lseal_tail_16_compose
+
+ and v0.16b, v0.16b, v21.16b
+ eor v20.16b, v20.16b, v0.16b
+ mov v21.16b, v20.16b
+
+Lseal_tail_16_store:
+ umov w11, v20.b[0]
+ strb w11, [x0], #1
+ ext v20.16b, v20.16b, v20.16b, #1
+ subs x6, x6, #1
+ b.gt Lseal_tail_16_store
+
+ // Hash in the final ct block concatenated with extra_in
+ mov x11, v21.d[0]
+ mov x12, v21.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+
+Lseal_hash_extra:
+ cbz x4, Lseal_finalize
+
+Lseal_hash_extra_loop:
+ cmp x4, #16
+ b.lt Lseal_hash_extra_tail
+ ld1 {v20.16b}, [x3], #16
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ sub x4, x4, #16
+ b Lseal_hash_extra_loop
+
+Lseal_hash_extra_tail:
+ cbz x4, Lseal_finalize
+ eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext
+ add x3, x3, x4
+
+Lseal_hash_extra_load:
+ ext v20.16b, v20.16b, v20.16b, #15
+ ldrb w11, [x3, #-1]!
+ mov v20.b[0], w11
+ subs x4, x4, #1
+ b.gt Lseal_hash_extra_load
+
+	// Hash in the final padded extra_in block
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+
+Lseal_finalize:
+ mov x11, v31.d[0]
+ mov x12, v31.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ // Final reduction step
+ sub x12, xzr, x15
+ orr x13, xzr, #3
+ subs x11, x8, #-5
+ sbcs x12, x9, x12
+ sbcs x13, x10, x13
+ csel x8, x11, x8, cs
+ csel x9, x12, x9, cs
+ csel x10, x13, x10, cs
+ mov x11, v27.d[0]
+ mov x12, v27.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+
+ stp x8, x9, [x5]
+
+ ldp d8, d9, [sp, #16]
+ ldp d10, d11, [sp, #32]
+ ldp d12, d13, [sp, #48]
+ ldp d14, d15, [sp, #64]
+.cfi_restore b15
+.cfi_restore b14
+.cfi_restore b13
+.cfi_restore b12
+.cfi_restore b11
+.cfi_restore b10
+.cfi_restore b9
+.cfi_restore b8
+ ldp x29, x30, [sp], 80
+.cfi_restore w29
+.cfi_restore w30
+.cfi_def_cfa_offset 0
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+Lseal_128:
+ // On some architectures preparing 5 blocks for small buffers is wasteful
+ eor v25.16b, v25.16b, v25.16b
+ mov x11, #1
+ mov v25.s[0], w11
+ mov v0.16b, v24.16b
+ mov v1.16b, v24.16b
+ mov v2.16b, v24.16b
+ mov v5.16b, v28.16b
+ mov v6.16b, v28.16b
+ mov v7.16b, v28.16b
+ mov v10.16b, v29.16b
+ mov v11.16b, v29.16b
+ mov v12.16b, v29.16b
+ mov v17.16b, v30.16b
+ add v15.4s, v17.4s, v25.4s
+ add v16.4s, v15.4s, v25.4s
+
+ mov x6, #10
+
+Lseal_128_rounds:
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v5.16b, v5.16b, v5.16b, #4
+ ext v6.16b, v6.16b, v6.16b, #4
+ ext v7.16b, v7.16b, v7.16b, #4
+
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v12.16b, v12.16b, v12.16b, #8
+
+ ext v15.16b, v15.16b, v15.16b, #12
+ ext v16.16b, v16.16b, v16.16b, #12
+ ext v17.16b, v17.16b, v17.16b, #12
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v5.16b, v5.16b, v5.16b, #12
+ ext v6.16b, v6.16b, v6.16b, #12
+ ext v7.16b, v7.16b, v7.16b, #12
+
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v12.16b, v12.16b, v12.16b, #8
+
+ ext v15.16b, v15.16b, v15.16b, #4
+ ext v16.16b, v16.16b, v16.16b, #4
+ ext v17.16b, v17.16b, v17.16b, #4
+ subs x6, x6, #1
+ b.hi Lseal_128_rounds
+
+ add v0.4s, v0.4s, v24.4s
+ add v1.4s, v1.4s, v24.4s
+ add v2.4s, v2.4s, v24.4s
+
+ add v5.4s, v5.4s, v28.4s
+ add v6.4s, v6.4s, v28.4s
+ add v7.4s, v7.4s, v28.4s
+
+ // Only the first 32 bytes of the third block (counter = 0) are needed,
+ // so skip updating v12 and v17.
+ add v10.4s, v10.4s, v29.4s
+ add v11.4s, v11.4s, v29.4s
+
+ add v30.4s, v30.4s, v25.4s
+ add v15.4s, v15.4s, v30.4s
+ add v30.4s, v30.4s, v25.4s
+ add v16.4s, v16.4s, v30.4s
+
+ and v2.16b, v2.16b, v27.16b
+ mov x16, v2.d[0] // Move the R key to GPRs
+ mov x17, v2.d[1]
+ mov v27.16b, v7.16b // Store the S key
+
+ bl Lpoly_hash_ad_internal
+ b Lseal_tail
+.cfi_endproc
+
+
+/////////////////////////////////
+//
+// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data);
+//
+.globl chacha20_poly1305_open
+
+.def chacha20_poly1305_open
+ .type 32
+.endef
+.align 6
+chacha20_poly1305_open:
+ AARCH64_SIGN_LINK_REGISTER
+.cfi_startproc
+ stp x29, x30, [sp, #-80]!
+.cfi_def_cfa_offset 80
+.cfi_offset w30, -72
+.cfi_offset w29, -80
+ mov x29, sp
+ // We probably could do .cfi_def_cfa w29, 80 at this point, but since
+ // we don't actually use the frame pointer like that, it's probably not
+ // worth bothering.
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+.cfi_offset b15, -8
+.cfi_offset b14, -16
+.cfi_offset b13, -24
+.cfi_offset b12, -32
+.cfi_offset b11, -40
+.cfi_offset b10, -48
+.cfi_offset b9, -56
+.cfi_offset b8, -64
+
+ adrp x11, Lchacha20_consts
+ add x11, x11, :lo12:Lchacha20_consts
+
+ ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
+ ld1 {v28.16b - v30.16b}, [x5]
+
+ mov x15, #1 // Prepare the Poly1305 state
+ mov x8, #0
+ mov x9, #0
+ mov x10, #0
+
+ mov v31.d[0], x4 // Store the input and aad lengths
+ mov v31.d[1], x2
+
+ cmp x2, #128
+ b.le Lopen_128 // Optimization for smaller buffers
+
+ // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys
+ mov v0.16b, v24.16b
+ mov v5.16b, v28.16b
+ mov v10.16b, v29.16b
+ mov v15.16b, v30.16b
+
+ mov x6, #10
+
+.align 5
+Lopen_init_rounds:
+ add v0.4s, v0.4s, v5.4s
+ eor v15.16b, v15.16b, v0.16b
+ rev32 v15.8h, v15.8h
+
+ add v10.4s, v10.4s, v15.4s
+ eor v5.16b, v5.16b, v10.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ add v0.4s, v0.4s, v20.4s
+ eor v15.16b, v15.16b, v0.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ eor v20.16b, v20.16b, v10.16b
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+ ext v5.16b, v5.16b, v5.16b, #4
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v15.16b, v15.16b, v15.16b, #12
+ add v0.4s, v0.4s, v5.4s
+ eor v15.16b, v15.16b, v0.16b
+ rev32 v15.8h, v15.8h
+
+ add v10.4s, v10.4s, v15.4s
+ eor v5.16b, v5.16b, v10.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ add v0.4s, v0.4s, v20.4s
+ eor v15.16b, v15.16b, v0.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ eor v20.16b, v20.16b, v10.16b
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+ ext v5.16b, v5.16b, v5.16b, #12
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v15.16b, v15.16b, v15.16b, #4
+ subs x6, x6, #1
+ b.hi Lopen_init_rounds
+
+ add v0.4s, v0.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+
+ and v0.16b, v0.16b, v27.16b
+ mov x16, v0.d[0] // Move the R key to GPRs
+ mov x17, v0.d[1]
+ mov v27.16b, v5.16b // Store the S key
+
+ bl Lpoly_hash_ad_internal
+
+Lopen_ad_done:
+ mov x3, x1
+
+// Each iteration of the loop hashes 320 bytes and prepares a 320-byte stream
+Lopen_main_loop:
+
+ cmp x2, #192
+ b.lt Lopen_tail
+
+ adrp x11, Lchacha20_consts
+ add x11, x11, :lo12:Lchacha20_consts
+
+ ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
+ mov v4.16b, v24.16b
+
+ ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
+ mov v9.16b, v28.16b
+
+ ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
+ mov v14.16b, v29.16b
+
+ ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
+ sub x5, x5, #32
+ add v15.4s, v15.4s, v25.4s
+ mov v19.16b, v30.16b
+
+ eor v20.16b, v20.16b, v20.16b //zero
+ not v21.16b, v20.16b // -1
+ sub v21.4s, v25.4s, v21.4s // Add +1
+ ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+ add v19.4s, v19.4s, v20.4s
+
+ lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12
+ sub x4, x4, #10
+
+ mov x7, #10
+ subs x6, x7, x4
+ subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash
+ csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full
+
+ cbz x7, Lopen_main_loop_rounds_short
+
+.align 5
+Lopen_main_loop_rounds:
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+Lopen_main_loop_rounds_short:
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v8.4s
+ add v4.4s, v4.4s, v9.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+ rev32 v18.8h, v18.8h
+ rev32 v19.8h, v19.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ add v13.4s, v13.4s, v18.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ eor v8.16b, v8.16b, v13.16b
+ eor v9.16b, v9.16b, v14.16b
+
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+ ushr v7.4s, v8.4s, #20
+ sli v7.4s, v8.4s, #12
+ ushr v8.4s, v9.4s, #20
+ sli v8.4s, v9.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ add v3.4s, v3.4s, v7.4s
+ add v4.4s, v4.4s, v8.4s
+
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+ tbl v18.16b, {v18.16b}, v26.16b
+ tbl v19.16b, {v19.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ add v13.4s, v13.4s, v18.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ eor v7.16b, v7.16b, v13.16b
+ eor v8.16b, v8.16b, v14.16b
+
+ ushr v9.4s, v8.4s, #25
+ sli v9.4s, v8.4s, #7
+ ushr v8.4s, v7.4s, #25
+ sli v8.4s, v7.4s, #7
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v9.16b, v9.16b, v9.16b, #4
+ ext v14.16b, v14.16b, v14.16b, #8
+ ext v19.16b, v19.16b, v19.16b, #12
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ add v0.4s, v0.4s, v6.4s
+ add v1.4s, v1.4s, v7.4s
+ add v2.4s, v2.4s, v8.4s
+ add v3.4s, v3.4s, v5.4s
+ add v4.4s, v4.4s, v9.4s
+
+ eor v18.16b, v18.16b, v0.16b
+ eor v15.16b, v15.16b, v1.16b
+ eor v16.16b, v16.16b, v2.16b
+ eor v17.16b, v17.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ rev32 v18.8h, v18.8h
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+ rev32 v19.8h, v19.8h
+
+ add v12.4s, v12.4s, v18.4s
+ add v13.4s, v13.4s, v15.4s
+ add v10.4s, v10.4s, v16.4s
+ add v11.4s, v11.4s, v17.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v6.16b, v6.16b, v12.16b
+ eor v7.16b, v7.16b, v13.16b
+ eor v8.16b, v8.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v9.16b, v9.16b, v14.16b
+
+ ushr v20.4s, v6.4s, #20
+ sli v20.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+ ushr v7.4s, v8.4s, #20
+ sli v7.4s, v8.4s, #12
+ ushr v8.4s, v5.4s, #20
+ sli v8.4s, v5.4s, #12
+ ushr v5.4s, v9.4s, #20
+ sli v5.4s, v9.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ add v3.4s, v3.4s, v8.4s
+ add v4.4s, v4.4s, v5.4s
+
+ eor v18.16b, v18.16b, v0.16b
+ eor v15.16b, v15.16b, v1.16b
+ eor v16.16b, v16.16b, v2.16b
+ eor v17.16b, v17.16b, v3.16b
+ eor v19.16b, v19.16b, v4.16b
+
+ tbl v18.16b, {v18.16b}, v26.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+ tbl v19.16b, {v19.16b}, v26.16b
+
+ add v12.4s, v12.4s, v18.4s
+ add v13.4s, v13.4s, v15.4s
+ add v10.4s, v10.4s, v16.4s
+ add v11.4s, v11.4s, v17.4s
+ add v14.4s, v14.4s, v19.4s
+
+ eor v20.16b, v20.16b, v12.16b
+ eor v6.16b, v6.16b, v13.16b
+ eor v7.16b, v7.16b, v10.16b
+ eor v8.16b, v8.16b, v11.16b
+ eor v5.16b, v5.16b, v14.16b
+
+ ushr v9.4s, v5.4s, #25
+ sli v9.4s, v5.4s, #7
+ ushr v5.4s, v8.4s, #25
+ sli v5.4s, v8.4s, #7
+ ushr v8.4s, v7.4s, #25
+ sli v8.4s, v7.4s, #7
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v20.4s, #25
+ sli v6.4s, v20.4s, #7
+
+ ext v9.16b, v9.16b, v9.16b, #12
+ ext v14.16b, v14.16b, v14.16b, #8
+ ext v19.16b, v19.16b, v19.16b, #4
+ subs x7, x7, #1
+ b.gt Lopen_main_loop_rounds
+ subs x6, x6, #1
+ b.ge Lopen_main_loop_rounds_short
+
+ eor v20.16b, v20.16b, v20.16b //zero
+ not v21.16b, v20.16b // -1
+ sub v21.4s, v25.4s, v21.4s // Add +1
+ ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
+ add v19.4s, v19.4s, v20.4s
+
+ add v15.4s, v15.4s, v25.4s
+ mov x11, #5
+ dup v20.4s, w11
+ add v25.4s, v25.4s, v20.4s
+
+ zip1 v20.4s, v0.4s, v1.4s
+ zip2 v21.4s, v0.4s, v1.4s
+ zip1 v22.4s, v2.4s, v3.4s
+ zip2 v23.4s, v2.4s, v3.4s
+
+ zip1 v0.2d, v20.2d, v22.2d
+ zip2 v1.2d, v20.2d, v22.2d
+ zip1 v2.2d, v21.2d, v23.2d
+ zip2 v3.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v5.4s, v6.4s
+ zip2 v21.4s, v5.4s, v6.4s
+ zip1 v22.4s, v7.4s, v8.4s
+ zip2 v23.4s, v7.4s, v8.4s
+
+ zip1 v5.2d, v20.2d, v22.2d
+ zip2 v6.2d, v20.2d, v22.2d
+ zip1 v7.2d, v21.2d, v23.2d
+ zip2 v8.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v10.4s, v11.4s
+ zip2 v21.4s, v10.4s, v11.4s
+ zip1 v22.4s, v12.4s, v13.4s
+ zip2 v23.4s, v12.4s, v13.4s
+
+ zip1 v10.2d, v20.2d, v22.2d
+ zip2 v11.2d, v20.2d, v22.2d
+ zip1 v12.2d, v21.2d, v23.2d
+ zip2 v13.2d, v21.2d, v23.2d
+
+ zip1 v20.4s, v15.4s, v16.4s
+ zip2 v21.4s, v15.4s, v16.4s
+ zip1 v22.4s, v17.4s, v18.4s
+ zip2 v23.4s, v17.4s, v18.4s
+
+ zip1 v15.2d, v20.2d, v22.2d
+ zip2 v16.2d, v20.2d, v22.2d
+ zip1 v17.2d, v21.2d, v23.2d
+ zip2 v18.2d, v21.2d, v23.2d
+
+ add v0.4s, v0.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+ add v10.4s, v10.4s, v29.4s
+ add v15.4s, v15.4s, v30.4s
+
+ add v1.4s, v1.4s, v24.4s
+ add v6.4s, v6.4s, v28.4s
+ add v11.4s, v11.4s, v29.4s
+ add v16.4s, v16.4s, v30.4s
+
+ add v2.4s, v2.4s, v24.4s
+ add v7.4s, v7.4s, v28.4s
+ add v12.4s, v12.4s, v29.4s
+ add v17.4s, v17.4s, v30.4s
+
+ add v3.4s, v3.4s, v24.4s
+ add v8.4s, v8.4s, v28.4s
+ add v13.4s, v13.4s, v29.4s
+ add v18.4s, v18.4s, v30.4s
+
+ add v4.4s, v4.4s, v24.4s
+ add v9.4s, v9.4s, v28.4s
+ add v14.4s, v14.4s, v29.4s
+ add v19.4s, v19.4s, v30.4s
+
+ // We can always safely store 192 bytes
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v0.16b
+ eor v21.16b, v21.16b, v5.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v15.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v1.16b
+ eor v21.16b, v21.16b, v6.16b
+ eor v22.16b, v22.16b, v11.16b
+ eor v23.16b, v23.16b, v16.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v2.16b
+ eor v21.16b, v21.16b, v7.16b
+ eor v22.16b, v22.16b, v12.16b
+ eor v23.16b, v23.16b, v17.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #192
+
+ mov v0.16b, v3.16b
+ mov v5.16b, v8.16b
+ mov v10.16b, v13.16b
+ mov v15.16b, v18.16b
+
+ cmp x2, #64
+ b.lt Lopen_tail_64_store
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v3.16b
+ eor v21.16b, v21.16b, v8.16b
+ eor v22.16b, v22.16b, v13.16b
+ eor v23.16b, v23.16b, v18.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #64
+
+ mov v0.16b, v4.16b
+ mov v5.16b, v9.16b
+ mov v10.16b, v14.16b
+ mov v15.16b, v19.16b
+
+ cmp x2, #64
+ b.lt Lopen_tail_64_store
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+ eor v20.16b, v20.16b, v4.16b
+ eor v21.16b, v21.16b, v9.16b
+ eor v22.16b, v22.16b, v14.16b
+ eor v23.16b, v23.16b, v19.16b
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #64
+ b Lopen_main_loop
+
+Lopen_tail:
+
+ cbz x2, Lopen_finalize
+
+ lsr x4, x2, #4 // How many whole blocks we have to hash
+
+ cmp x2, #64
+ b.le Lopen_tail_64
+ cmp x2, #128
+ b.le Lopen_tail_128
+
+Lopen_tail_192:
+ // We need three more blocks
+ mov v0.16b, v24.16b
+ mov v1.16b, v24.16b
+ mov v2.16b, v24.16b
+ mov v5.16b, v28.16b
+ mov v6.16b, v28.16b
+ mov v7.16b, v28.16b
+ mov v10.16b, v29.16b
+ mov v11.16b, v29.16b
+ mov v12.16b, v29.16b
+ mov v15.16b, v30.16b
+ mov v16.16b, v30.16b
+ mov v17.16b, v30.16b
+ eor v23.16b, v23.16b, v23.16b
+ eor v21.16b, v21.16b, v21.16b
+ ins v23.s[0], v25.s[0]
+ ins v21.d[0], x15
+
+ add v22.4s, v23.4s, v21.4s
+ add v21.4s, v22.4s, v21.4s
+
+ add v15.4s, v15.4s, v21.4s
+ add v16.4s, v16.4s, v23.4s
+ add v17.4s, v17.4s, v22.4s
+
+ mov x7, #10
+ subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash
+ csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing
+ sub x4, x4, x7
+
+ cbz x7, Lopen_tail_192_rounds_no_hash
+
+Lopen_tail_192_rounds:
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+Lopen_tail_192_rounds_no_hash:
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v5.16b, v5.16b, v5.16b, #4
+ ext v6.16b, v6.16b, v6.16b, #4
+ ext v7.16b, v7.16b, v7.16b, #4
+
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v12.16b, v12.16b, v12.16b, #8
+
+ ext v15.16b, v15.16b, v15.16b, #12
+ ext v16.16b, v16.16b, v16.16b, #12
+ ext v17.16b, v17.16b, v17.16b, #12
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v5.16b, v5.16b, v5.16b, #12
+ ext v6.16b, v6.16b, v6.16b, #12
+ ext v7.16b, v7.16b, v7.16b, #12
+
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v12.16b, v12.16b, v12.16b, #8
+
+ ext v15.16b, v15.16b, v15.16b, #4
+ ext v16.16b, v16.16b, v16.16b, #4
+ ext v17.16b, v17.16b, v17.16b, #4
+ subs x7, x7, #1
+ b.gt Lopen_tail_192_rounds
+ subs x6, x6, #1
+ b.ge Lopen_tail_192_rounds_no_hash
+
+ // We hashed 160 bytes at most, may still have 32 bytes left
+Lopen_tail_192_hash:
+ cbz x4, Lopen_tail_192_hash_done
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ sub x4, x4, #1
+ b Lopen_tail_192_hash
+
+Lopen_tail_192_hash_done:
+
+ add v0.4s, v0.4s, v24.4s
+ add v1.4s, v1.4s, v24.4s
+ add v2.4s, v2.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+ add v6.4s, v6.4s, v28.4s
+ add v7.4s, v7.4s, v28.4s
+ add v10.4s, v10.4s, v29.4s
+ add v11.4s, v11.4s, v29.4s
+ add v12.4s, v12.4s, v29.4s
+ add v15.4s, v15.4s, v30.4s
+ add v16.4s, v16.4s, v30.4s
+ add v17.4s, v17.4s, v30.4s
+
+ add v15.4s, v15.4s, v21.4s
+ add v16.4s, v16.4s, v23.4s
+ add v17.4s, v17.4s, v22.4s
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+
+ eor v20.16b, v20.16b, v1.16b
+ eor v21.16b, v21.16b, v6.16b
+ eor v22.16b, v22.16b, v11.16b
+ eor v23.16b, v23.16b, v16.16b
+
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+
+ eor v20.16b, v20.16b, v2.16b
+ eor v21.16b, v21.16b, v7.16b
+ eor v22.16b, v22.16b, v12.16b
+ eor v23.16b, v23.16b, v17.16b
+
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #128
+ b Lopen_tail_64_store
+
+Lopen_tail_128:
+ // We need two more blocks
+ mov v0.16b, v24.16b
+ mov v1.16b, v24.16b
+ mov v5.16b, v28.16b
+ mov v6.16b, v28.16b
+ mov v10.16b, v29.16b
+ mov v11.16b, v29.16b
+ mov v15.16b, v30.16b
+ mov v16.16b, v30.16b
+ eor v23.16b, v23.16b, v23.16b
+ eor v22.16b, v22.16b, v22.16b
+ ins v23.s[0], v25.s[0]
+ ins v22.d[0], x15
+ add v22.4s, v22.4s, v23.4s
+
+ add v15.4s, v15.4s, v22.4s
+ add v16.4s, v16.4s, v23.4s
+
+ mov x6, #10
+ sub x6, x6, x4
+
+Lopen_tail_128_rounds:
+ add v0.4s, v0.4s, v5.4s
+ eor v15.16b, v15.16b, v0.16b
+ rev32 v15.8h, v15.8h
+
+ add v10.4s, v10.4s, v15.4s
+ eor v5.16b, v5.16b, v10.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ add v0.4s, v0.4s, v20.4s
+ eor v15.16b, v15.16b, v0.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ eor v20.16b, v20.16b, v10.16b
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+ ext v5.16b, v5.16b, v5.16b, #4
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v15.16b, v15.16b, v15.16b, #12
+ add v1.4s, v1.4s, v6.4s
+ eor v16.16b, v16.16b, v1.16b
+ rev32 v16.8h, v16.8h
+
+ add v11.4s, v11.4s, v16.4s
+ eor v6.16b, v6.16b, v11.16b
+ ushr v20.4s, v6.4s, #20
+ sli v20.4s, v6.4s, #12
+ add v1.4s, v1.4s, v20.4s
+ eor v16.16b, v16.16b, v1.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+
+ add v11.4s, v11.4s, v16.4s
+ eor v20.16b, v20.16b, v11.16b
+ ushr v6.4s, v20.4s, #25
+ sli v6.4s, v20.4s, #7
+ ext v6.16b, v6.16b, v6.16b, #4
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v16.16b, v16.16b, v16.16b, #12
+ add v0.4s, v0.4s, v5.4s
+ eor v15.16b, v15.16b, v0.16b
+ rev32 v15.8h, v15.8h
+
+ add v10.4s, v10.4s, v15.4s
+ eor v5.16b, v5.16b, v10.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ add v0.4s, v0.4s, v20.4s
+ eor v15.16b, v15.16b, v0.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ eor v20.16b, v20.16b, v10.16b
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+ ext v5.16b, v5.16b, v5.16b, #12
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v15.16b, v15.16b, v15.16b, #4
+ add v1.4s, v1.4s, v6.4s
+ eor v16.16b, v16.16b, v1.16b
+ rev32 v16.8h, v16.8h
+
+ add v11.4s, v11.4s, v16.4s
+ eor v6.16b, v6.16b, v11.16b
+ ushr v20.4s, v6.4s, #20
+ sli v20.4s, v6.4s, #12
+ add v1.4s, v1.4s, v20.4s
+ eor v16.16b, v16.16b, v1.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+
+ add v11.4s, v11.4s, v16.4s
+ eor v20.16b, v20.16b, v11.16b
+ ushr v6.4s, v20.4s, #25
+ sli v6.4s, v20.4s, #7
+ ext v6.16b, v6.16b, v6.16b, #12
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v16.16b, v16.16b, v16.16b, #4
+ subs x6, x6, #1
+ b.gt Lopen_tail_128_rounds
+ cbz x4, Lopen_tail_128_rounds_done
+ subs x4, x4, #1
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ b Lopen_tail_128_rounds
+
+Lopen_tail_128_rounds_done:
+ add v0.4s, v0.4s, v24.4s
+ add v1.4s, v1.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+ add v6.4s, v6.4s, v28.4s
+ add v10.4s, v10.4s, v29.4s
+ add v11.4s, v11.4s, v29.4s
+ add v15.4s, v15.4s, v30.4s
+ add v16.4s, v16.4s, v30.4s
+ add v15.4s, v15.4s, v22.4s
+ add v16.4s, v16.4s, v23.4s
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+
+ eor v20.16b, v20.16b, v1.16b
+ eor v21.16b, v21.16b, v6.16b
+ eor v22.16b, v22.16b, v11.16b
+ eor v23.16b, v23.16b, v16.16b
+
+ st1 {v20.16b - v23.16b}, [x0], #64
+ sub x2, x2, #64
+
+ b Lopen_tail_64_store
+
+Lopen_tail_64:
+ // We just need a single block
+ mov v0.16b, v24.16b
+ mov v5.16b, v28.16b
+ mov v10.16b, v29.16b
+ mov v15.16b, v30.16b
+ eor v23.16b, v23.16b, v23.16b
+ ins v23.s[0], v25.s[0]
+ add v15.4s, v15.4s, v23.4s
+
+ mov x6, #10
+ sub x6, x6, x4
+
+Lopen_tail_64_rounds:
+ add v0.4s, v0.4s, v5.4s
+ eor v15.16b, v15.16b, v0.16b
+ rev32 v15.8h, v15.8h
+
+ add v10.4s, v10.4s, v15.4s
+ eor v5.16b, v5.16b, v10.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ add v0.4s, v0.4s, v20.4s
+ eor v15.16b, v15.16b, v0.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ eor v20.16b, v20.16b, v10.16b
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+ ext v5.16b, v5.16b, v5.16b, #4
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v15.16b, v15.16b, v15.16b, #12
+ add v0.4s, v0.4s, v5.4s
+ eor v15.16b, v15.16b, v0.16b
+ rev32 v15.8h, v15.8h
+
+ add v10.4s, v10.4s, v15.4s
+ eor v5.16b, v5.16b, v10.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ add v0.4s, v0.4s, v20.4s
+ eor v15.16b, v15.16b, v0.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ eor v20.16b, v20.16b, v10.16b
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+ ext v5.16b, v5.16b, v5.16b, #12
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v15.16b, v15.16b, v15.16b, #4
+ subs x6, x6, #1
+ b.gt Lopen_tail_64_rounds
+ cbz x4, Lopen_tail_64_rounds_done
+ subs x4, x4, #1
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ b Lopen_tail_64_rounds
+
+Lopen_tail_64_rounds_done:
+ add v0.4s, v0.4s, v24.4s
+ add v5.4s, v5.4s, v28.4s
+ add v10.4s, v10.4s, v29.4s
+ add v15.4s, v15.4s, v30.4s
+ add v15.4s, v15.4s, v23.4s
+
+Lopen_tail_64_store:
+ cmp x2, #16
+ b.lt Lopen_tail_16
+
+ ld1 {v20.16b}, [x1], #16
+ eor v20.16b, v20.16b, v0.16b
+ st1 {v20.16b}, [x0], #16
+ mov v0.16b, v5.16b
+ mov v5.16b, v10.16b
+ mov v10.16b, v15.16b
+ sub x2, x2, #16
+ b Lopen_tail_64_store
+
+Lopen_tail_16:
+ // Here we handle the last [0,16) bytes that require a padded block
+ cbz x2, Lopen_finalize
+
+ eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext
+ eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask
+ not v22.16b, v20.16b
+
+ add x7, x1, x2
+ mov x6, x2
+
+Lopen_tail_16_compose:
+ ext v20.16b, v20.16b, v20.16b, #15
+ ldrb w11, [x7, #-1]!
+ mov v20.b[0], w11
+ ext v21.16b, v22.16b, v21.16b, #15
+ subs x2, x2, #1
+ b.gt Lopen_tail_16_compose
+
+ and v20.16b, v20.16b, v21.16b
+ // Hash in the final padded block
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ eor v20.16b, v20.16b, v0.16b
+
+Lopen_tail_16_store:
+ umov w11, v20.b[0]
+ strb w11, [x0], #1
+ ext v20.16b, v20.16b, v20.16b, #1
+ subs x6, x6, #1
+ b.gt Lopen_tail_16_store
+
+Lopen_finalize:
+ mov x11, v31.d[0]
+ mov x12, v31.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ // Final reduction step
+ sub x12, xzr, x15
+ orr x13, xzr, #3
+ subs x11, x8, #-5
+ sbcs x12, x9, x12
+ sbcs x13, x10, x13
+ csel x8, x11, x8, cs
+ csel x9, x12, x9, cs
+ csel x10, x13, x10, cs
+ mov x11, v27.d[0]
+ mov x12, v27.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+
+ stp x8, x9, [x5]
+
+ ldp d8, d9, [sp, #16]
+ ldp d10, d11, [sp, #32]
+ ldp d12, d13, [sp, #48]
+ ldp d14, d15, [sp, #64]
+.cfi_restore b15
+.cfi_restore b14
+.cfi_restore b13
+.cfi_restore b12
+.cfi_restore b11
+.cfi_restore b10
+.cfi_restore b9
+.cfi_restore b8
+ ldp x29, x30, [sp], 80
+.cfi_restore w29
+.cfi_restore w30
+.cfi_def_cfa_offset 0
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+Lopen_128:
+ // On some architectures preparing 5 blocks for small buffers is wasteful
+ eor v25.16b, v25.16b, v25.16b
+ mov x11, #1
+ mov v25.s[0], w11
+ mov v0.16b, v24.16b
+ mov v1.16b, v24.16b
+ mov v2.16b, v24.16b
+ mov v5.16b, v28.16b
+ mov v6.16b, v28.16b
+ mov v7.16b, v28.16b
+ mov v10.16b, v29.16b
+ mov v11.16b, v29.16b
+ mov v12.16b, v29.16b
+ mov v17.16b, v30.16b
+ add v15.4s, v17.4s, v25.4s
+ add v16.4s, v15.4s, v25.4s
+
+ mov x6, #10
+
+Lopen_128_rounds:
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v5.16b, v5.16b, v5.16b, #4
+ ext v6.16b, v6.16b, v6.16b, #4
+ ext v7.16b, v7.16b, v7.16b, #4
+
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v12.16b, v12.16b, v12.16b, #8
+
+ ext v15.16b, v15.16b, v15.16b, #12
+ ext v16.16b, v16.16b, v16.16b, #12
+ ext v17.16b, v17.16b, v17.16b, #12
+ add v0.4s, v0.4s, v5.4s
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ rev32 v15.8h, v15.8h
+ rev32 v16.8h, v16.8h
+ rev32 v17.8h, v17.8h
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v5.16b, v5.16b, v10.16b
+ eor v6.16b, v6.16b, v11.16b
+ eor v7.16b, v7.16b, v12.16b
+ ushr v20.4s, v5.4s, #20
+ sli v20.4s, v5.4s, #12
+ ushr v5.4s, v6.4s, #20
+ sli v5.4s, v6.4s, #12
+ ushr v6.4s, v7.4s, #20
+ sli v6.4s, v7.4s, #12
+
+ add v0.4s, v0.4s, v20.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v15.16b, v15.16b, v0.16b
+ eor v16.16b, v16.16b, v1.16b
+ eor v17.16b, v17.16b, v2.16b
+ tbl v15.16b, {v15.16b}, v26.16b
+ tbl v16.16b, {v16.16b}, v26.16b
+ tbl v17.16b, {v17.16b}, v26.16b
+
+ add v10.4s, v10.4s, v15.4s
+ add v11.4s, v11.4s, v16.4s
+ add v12.4s, v12.4s, v17.4s
+ eor v20.16b, v20.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ eor v6.16b, v6.16b, v12.16b
+ ushr v7.4s, v6.4s, #25
+ sli v7.4s, v6.4s, #7
+ ushr v6.4s, v5.4s, #25
+ sli v6.4s, v5.4s, #7
+ ushr v5.4s, v20.4s, #25
+ sli v5.4s, v20.4s, #7
+
+ ext v5.16b, v5.16b, v5.16b, #12
+ ext v6.16b, v6.16b, v6.16b, #12
+ ext v7.16b, v7.16b, v7.16b, #12
+
+ ext v10.16b, v10.16b, v10.16b, #8
+ ext v11.16b, v11.16b, v11.16b, #8
+ ext v12.16b, v12.16b, v12.16b, #8
+
+ ext v15.16b, v15.16b, v15.16b, #4
+ ext v16.16b, v16.16b, v16.16b, #4
+ ext v17.16b, v17.16b, v17.16b, #4
+ subs x6, x6, #1
+ b.hi Lopen_128_rounds
+
+ add v0.4s, v0.4s, v24.4s
+ add v1.4s, v1.4s, v24.4s
+ add v2.4s, v2.4s, v24.4s
+
+ add v5.4s, v5.4s, v28.4s
+ add v6.4s, v6.4s, v28.4s
+ add v7.4s, v7.4s, v28.4s
+
+ add v10.4s, v10.4s, v29.4s
+ add v11.4s, v11.4s, v29.4s
+
+ add v30.4s, v30.4s, v25.4s
+ add v15.4s, v15.4s, v30.4s
+ add v30.4s, v30.4s, v25.4s
+ add v16.4s, v16.4s, v30.4s
+
+ and v2.16b, v2.16b, v27.16b
+ mov x16, v2.d[0] // Move the R key to GPRs
+ mov x17, v2.d[1]
+ mov v27.16b, v7.16b // Store the S key
+
+ bl Lpoly_hash_ad_internal
+
+Lopen_128_store:
+ cmp x2, #64
+ b.lt Lopen_128_store_64
+
+ ld1 {v20.16b - v23.16b}, [x1], #64
+
+ mov x11, v20.d[0]
+ mov x12, v20.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ mov x11, v21.d[0]
+ mov x12, v21.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ mov x11, v22.d[0]
+ mov x12, v22.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ mov x11, v23.d[0]
+ mov x12, v23.d[1]
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+
+ eor v20.16b, v20.16b, v0.16b
+ eor v21.16b, v21.16b, v5.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v15.16b
+
+ st1 {v20.16b - v23.16b}, [x0], #64
+
+ sub x2, x2, #64
+
+ mov v0.16b, v1.16b
+ mov v5.16b, v6.16b
+ mov v10.16b, v11.16b
+ mov v15.16b, v16.16b
+
+Lopen_128_store_64:
+
+ lsr x4, x2, #4
+ mov x3, x1
+
+Lopen_128_hash_64:
+ cbz x4, Lopen_tail_64_store
+ ldp x11, x12, [x3], 16
+ adds x8, x8, x11
+ adcs x9, x9, x12
+ adc x10, x10, x15
+ mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
+ umulh x12, x8, x16
+ mul x13, x9, x16
+ umulh x14, x9, x16
+ adds x12, x12, x13
+ mul x13, x10, x16
+ adc x13, x13, x14
+ mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
+ umulh x8, x8, x17
+ adds x12, x12, x14
+ mul x14, x9, x17
+ umulh x9, x9, x17
+ adcs x14, x14, x8
+ mul x10, x10, x17
+ adc x10, x10, x9
+ adds x13, x13, x14
+ adc x14, x10, xzr
+ and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
+ and x8, x13, #-4
+ extr x13, x14, x13, #2
+ adds x8, x8, x11
+ lsr x11, x14, #2
+ adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
+ adds x8, x8, x13
+ adcs x9, x9, x12
+ adc x10, x10, xzr // At this point acc2 has the value of 4 at most
+ sub x4, x4, #1
+ b Lopen_128_hash_64
+.cfi_endproc
+
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/crypto/chacha20_poly1305_x86_64-apple.S b/gen/crypto/chacha20_poly1305_x86_64-apple.S
new file mode 100644
index 0000000..e4a7202
--- /dev/null
+++ b/gen/crypto/chacha20_poly1305_x86_64-apple.S
@@ -0,0 +1,8875 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text
+
+
+chacha20_poly1305_constants:
+
+.section __DATA,__const
+.p2align 6
+L$chacha20_consts:
+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+L$rol8:
+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+L$rol16:
+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+L$avx2_init:
+.long 0,0,0,0
+L$sse_inc:
+.long 1,0,0,0
+L$avx2_inc:
+.long 2,0,0,0,2,0,0,0
+L$clamp:
+.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
+.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
+.p2align 4
+L$and_masks:
+.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.text
+
+
+.p2align 6
+poly_hash_ad_internal:
+
+
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %r12,%r12
+ cmpq $13,%r8
+ jne L$hash_ad_loop
+L$poly_fast_tls_ad:
+
+ movq (%rcx),%r10
+ movq 5(%rcx),%r11
+ shrq $24,%r11
+ movq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ ret
+L$hash_ad_loop:
+
+ cmpq $16,%r8
+ jb L$hash_ad_tail
+ addq 0+0(%rcx),%r10
+ adcq 8+0(%rcx),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rcx),%rcx
+ subq $16,%r8
+ jmp L$hash_ad_loop
+L$hash_ad_tail:
+ cmpq $0,%r8
+ je L$hash_ad_done
+
+ xorq %r13,%r13
+ xorq %r14,%r14
+ xorq %r15,%r15
+ addq %r8,%rcx
+L$hash_ad_tail_loop:
+ shldq $8,%r13,%r14
+ shlq $8,%r13
+ movzbq -1(%rcx),%r15
+ xorq %r15,%r13
+ decq %rcx
+ decq %r8
+ jne L$hash_ad_tail_loop
+
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+L$hash_ad_done:
+ ret
+
+
+
+.globl _chacha20_poly1305_open
+.private_extern _chacha20_poly1305_open
+
+.p2align 6
+_chacha20_poly1305_open:
+
+_CET_ENDBR
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+
+
+ pushq %r9
+
+ subq $288 + 0 + 32,%rsp
+
+
+ leaq 32(%rsp),%rbp
+ andq $-32,%rbp
+
+ movq %rdx,%rbx
+ movq %r8,0+0+32(%rbp)
+ movq %rbx,8+0+32(%rbp)
+
+ movl _OPENSSL_ia32cap_P+8(%rip),%eax
+ andl $288,%eax
+ xorl $288,%eax
+ jz chacha20_poly1305_open_avx2
+
+ cmpq $128,%rbx
+ jbe L$open_sse_128
+
+ movdqa L$chacha20_consts(%rip),%xmm0
+ movdqu 0(%r9),%xmm4
+ movdqu 16(%r9),%xmm8
+ movdqu 32(%r9),%xmm12
+
+ movdqa %xmm12,%xmm7
+
+ movdqa %xmm4,0+48(%rbp)
+ movdqa %xmm8,0+64(%rbp)
+ movdqa %xmm12,0+96(%rbp)
+ movq $10,%r10
+L$open_sse_init_rounds:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+
+ decq %r10
+ jne L$open_sse_init_rounds
+
+ paddd L$chacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+
+ pand L$clamp(%rip),%xmm0
+ movdqa %xmm0,0+0(%rbp)
+ movdqa %xmm4,0+16(%rbp)
+
+ movq %r8,%r8
+ call poly_hash_ad_internal
+L$open_sse_main_loop:
+ cmpq $256,%rbx
+ jb L$open_sse_tail
+
+ movdqa L$chacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa %xmm0,%xmm2
+ movdqa %xmm4,%xmm6
+ movdqa %xmm8,%xmm10
+ movdqa %xmm0,%xmm3
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm11
+ movdqa 0+96(%rbp),%xmm15
+ paddd L$sse_inc(%rip),%xmm15
+ movdqa %xmm15,%xmm14
+ paddd L$sse_inc(%rip),%xmm14
+ movdqa %xmm14,%xmm13
+ paddd L$sse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd L$sse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+ movdqa %xmm14,0+128(%rbp)
+ movdqa %xmm15,0+144(%rbp)
+
+
+
+ movq $4,%rcx
+ movq %rsi,%r8
+L$open_sse_main_loop_rounds:
+ movdqa %xmm8,0+80(%rbp)
+ movdqa L$rol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+
+ leaq 16(%r8),%r8
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movdqa L$rol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 0+80(%rbp),%xmm8
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+.byte 102,15,58,15,255,4
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,12
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ movdqa %xmm8,0+80(%rbp)
+ movdqa L$rol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa L$rol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 0+80(%rbp),%xmm8
+.byte 102,15,58,15,255,12
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,4
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+
+ decq %rcx
+ jge L$open_sse_main_loop_rounds
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%r8),%r8
+ cmpq $-6,%rcx
+ jg L$open_sse_main_loop_rounds
+ paddd L$chacha20_consts(%rip),%xmm3
+ paddd 0+48(%rbp),%xmm7
+ paddd 0+64(%rbp),%xmm11
+ paddd 0+144(%rbp),%xmm15
+ paddd L$chacha20_consts(%rip),%xmm2
+ paddd 0+48(%rbp),%xmm6
+ paddd 0+64(%rbp),%xmm10
+ paddd 0+128(%rbp),%xmm14
+ paddd L$chacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd L$chacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+ movdqa %xmm12,0+80(%rbp)
+ movdqu 0 + 0(%rsi),%xmm12
+ pxor %xmm3,%xmm12
+ movdqu %xmm12,0 + 0(%rdi)
+ movdqu 16 + 0(%rsi),%xmm12
+ pxor %xmm7,%xmm12
+ movdqu %xmm12,16 + 0(%rdi)
+ movdqu 32 + 0(%rsi),%xmm12
+ pxor %xmm11,%xmm12
+ movdqu %xmm12,32 + 0(%rdi)
+ movdqu 48 + 0(%rsi),%xmm12
+ pxor %xmm15,%xmm12
+ movdqu %xmm12,48 + 0(%rdi)
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 64(%rdi)
+ movdqu %xmm6,16 + 64(%rdi)
+ movdqu %xmm10,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+ movdqu 0 + 128(%rsi),%xmm3
+ movdqu 16 + 128(%rsi),%xmm7
+ movdqu 32 + 128(%rsi),%xmm11
+ movdqu 48 + 128(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 128(%rdi)
+ movdqu %xmm5,16 + 128(%rdi)
+ movdqu %xmm9,32 + 128(%rdi)
+ movdqu %xmm15,48 + 128(%rdi)
+ movdqu 0 + 192(%rsi),%xmm3
+ movdqu 16 + 192(%rsi),%xmm7
+ movdqu 32 + 192(%rsi),%xmm11
+ movdqu 48 + 192(%rsi),%xmm15
+ pxor %xmm3,%xmm0
+ pxor %xmm7,%xmm4
+ pxor %xmm11,%xmm8
+ pxor 0+80(%rbp),%xmm15
+ movdqu %xmm0,0 + 192(%rdi)
+ movdqu %xmm4,16 + 192(%rdi)
+ movdqu %xmm8,32 + 192(%rdi)
+ movdqu %xmm15,48 + 192(%rdi)
+
+ leaq 256(%rsi),%rsi
+ leaq 256(%rdi),%rdi
+ subq $256,%rbx
+ jmp L$open_sse_main_loop
+L$open_sse_tail:
+
+ testq %rbx,%rbx
+ jz L$open_sse_finalize
+ cmpq $192,%rbx
+ ja L$open_sse_tail_256
+ cmpq $128,%rbx
+ ja L$open_sse_tail_192
+ cmpq $64,%rbx
+ ja L$open_sse_tail_128
+ movdqa L$chacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa 0+96(%rbp),%xmm12
+ paddd L$sse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+
+ xorq %r8,%r8
+ movq %rbx,%rcx
+ cmpq $16,%rcx
+ jb L$open_sse_tail_64_rounds
+L$open_sse_tail_64_rounds_and_x1hash:
+ addq 0+0(%rsi,%r8,1),%r10
+ adcq 8+0(%rsi,%r8,1),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ subq $16,%rcx
+L$open_sse_tail_64_rounds:
+ addq $16,%r8
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+
+ cmpq $16,%rcx
+ jae L$open_sse_tail_64_rounds_and_x1hash
+ cmpq $160,%r8
+ jne L$open_sse_tail_64_rounds
+ paddd L$chacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+
+ jmp L$open_sse_tail_64_dec_loop
+
+L$open_sse_tail_128:
+ movdqa L$chacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa 0+96(%rbp),%xmm13
+ paddd L$sse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd L$sse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+
+ movq %rbx,%rcx
+ andq $-16,%rcx
+ xorq %r8,%r8
+L$open_sse_tail_128_rounds_and_x1hash:
+ addq 0+0(%rsi,%r8,1),%r10
+ adcq 8+0(%rsi,%r8,1),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+L$open_sse_tail_128_rounds:
+ addq $16,%r8
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+
+ cmpq %rcx,%r8
+ jb L$open_sse_tail_128_rounds_and_x1hash
+ cmpq $160,%r8
+ jne L$open_sse_tail_128_rounds
+ paddd L$chacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd L$chacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+ movdqu 0 + 0(%rsi),%xmm3
+ movdqu 16 + 0(%rsi),%xmm7
+ movdqu 32 + 0(%rsi),%xmm11
+ movdqu 48 + 0(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 0(%rdi)
+ movdqu %xmm5,16 + 0(%rdi)
+ movdqu %xmm9,32 + 0(%rdi)
+ movdqu %xmm15,48 + 0(%rdi)
+
+ subq $64,%rbx
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+ jmp L$open_sse_tail_64_dec_loop
+
+L$open_sse_tail_192:
+ movdqa L$chacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa %xmm0,%xmm2
+ movdqa %xmm4,%xmm6
+ movdqa %xmm8,%xmm10
+ movdqa 0+96(%rbp),%xmm14
+ paddd L$sse_inc(%rip),%xmm14
+ movdqa %xmm14,%xmm13
+ paddd L$sse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd L$sse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+ movdqa %xmm14,0+128(%rbp)
+
+ movq %rbx,%rcx
+ movq $160,%r8
+ cmpq $160,%rcx
+ cmovgq %r8,%rcx
+ andq $-16,%rcx
+ xorq %r8,%r8
+L$open_sse_tail_192_rounds_and_x1hash:
+ addq 0+0(%rsi,%r8,1),%r10
+ adcq 8+0(%rsi,%r8,1),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+L$open_sse_tail_192_rounds:
+ addq $16,%r8
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb L$rol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb L$rol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb L$rol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb L$rol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+
+ cmpq %rcx,%r8
+ jb L$open_sse_tail_192_rounds_and_x1hash
+ cmpq $160,%r8
+ jne L$open_sse_tail_192_rounds
+ cmpq $176,%rbx
+ jb L$open_sse_tail_192_finish
+ addq 0+160(%rsi),%r10
+ adcq 8+160(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ cmpq $192,%rbx
+ jb L$open_sse_tail_192_finish
+ addq 0+176(%rsi),%r10
+ adcq 8+176(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+L$open_sse_tail_192_finish:
+ paddd L$chacha20_consts(%rip),%xmm2
+ paddd 0+48(%rbp),%xmm6
+ paddd 0+64(%rbp),%xmm10
+ paddd 0+128(%rbp),%xmm14
+ paddd L$chacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd L$chacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+ movdqu 0 + 0(%rsi),%xmm3
+ movdqu 16 + 0(%rsi),%xmm7
+ movdqu 32 + 0(%rsi),%xmm11
+ movdqu 48 + 0(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 0(%rdi)
+ movdqu %xmm6,16 + 0(%rdi)
+ movdqu %xmm10,32 + 0(%rdi)
+ movdqu %xmm15,48 + 0(%rdi)
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 64(%rdi)
+ movdqu %xmm5,16 + 64(%rdi)
+ movdqu %xmm9,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+
+ subq $128,%rbx
+ leaq 128(%rsi),%rsi
+ leaq 128(%rdi),%rdi
+ jmp L$open_sse_tail_64_dec_loop
+
+L$open_sse_tail_256:
+ movdqa L$chacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa %xmm0,%xmm2
+ movdqa %xmm4,%xmm6
+ movdqa %xmm8,%xmm10
+ movdqa %xmm0,%xmm3
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm11
+ movdqa 0+96(%rbp),%xmm15
+ paddd L$sse_inc(%rip),%xmm15
+ movdqa %xmm15,%xmm14
+ paddd L$sse_inc(%rip),%xmm14
+ movdqa %xmm14,%xmm13
+ paddd L$sse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd L$sse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+ movdqa %xmm14,0+128(%rbp)
+ movdqa %xmm15,0+144(%rbp)
+
+ xorq %r8,%r8
+L$open_sse_tail_256_rounds_and_x1hash:
+ addq 0+0(%rsi,%r8,1),%r10
+ adcq 8+0(%rsi,%r8,1),%r11
+ adcq $1,%r12
+ movdqa %xmm11,0+80(%rbp)
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm4
+ pxor %xmm11,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm4
+ pxor %xmm11,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm5
+ pxor %xmm11,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm5
+ pxor %xmm11,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb L$rol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm6
+ pxor %xmm11,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb L$rol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm6
+ pxor %xmm11,%xmm6
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+ movdqa 0+80(%rbp),%xmm11
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movdqa %xmm9,0+80(%rbp)
+ paddd %xmm7,%xmm3
+ pxor %xmm3,%xmm15
+ pshufb L$rol16(%rip),%xmm15
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm9
+ pslld $12,%xmm9
+ psrld $20,%xmm7
+ pxor %xmm9,%xmm7
+ paddd %xmm7,%xmm3
+ pxor %xmm3,%xmm15
+ pshufb L$rol8(%rip),%xmm15
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm9
+ pslld $7,%xmm9
+ psrld $25,%xmm7
+ pxor %xmm9,%xmm7
+.byte 102,15,58,15,255,4
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,12
+ movdqa 0+80(%rbp),%xmm9
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ movdqa %xmm11,0+80(%rbp)
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm4
+ pxor %xmm11,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm4
+ pxor %xmm11,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm5
+ pxor %xmm11,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm5
+ pxor %xmm11,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb L$rol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm6
+ pxor %xmm11,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb L$rol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm6
+ pxor %xmm11,%xmm6
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+ movdqa 0+80(%rbp),%xmm11
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ movdqa %xmm9,0+80(%rbp)
+ paddd %xmm7,%xmm3
+ pxor %xmm3,%xmm15
+ pshufb L$rol16(%rip),%xmm15
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm9
+ pslld $12,%xmm9
+ psrld $20,%xmm7
+ pxor %xmm9,%xmm7
+ paddd %xmm7,%xmm3
+ pxor %xmm3,%xmm15
+ pshufb L$rol8(%rip),%xmm15
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm9
+ pslld $7,%xmm9
+ psrld $25,%xmm7
+ pxor %xmm9,%xmm7
+.byte 102,15,58,15,255,12
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,4
+ movdqa 0+80(%rbp),%xmm9
+
+ addq $16,%r8
+ cmpq $160,%r8
+ jb L$open_sse_tail_256_rounds_and_x1hash
+
+ movq %rbx,%rcx
+ andq $-16,%rcx
+L$open_sse_tail_256_hash:
+ addq 0+0(%rsi,%r8,1),%r10
+ adcq 8+0(%rsi,%r8,1),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ addq $16,%r8
+ cmpq %rcx,%r8
+ jb L$open_sse_tail_256_hash
+ paddd L$chacha20_consts(%rip),%xmm3
+ paddd 0+48(%rbp),%xmm7
+ paddd 0+64(%rbp),%xmm11
+ paddd 0+144(%rbp),%xmm15
+ paddd L$chacha20_consts(%rip),%xmm2
+ paddd 0+48(%rbp),%xmm6
+ paddd 0+64(%rbp),%xmm10
+ paddd 0+128(%rbp),%xmm14
+ paddd L$chacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd L$chacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+ movdqa %xmm12,0+80(%rbp)
+ movdqu 0 + 0(%rsi),%xmm12
+ pxor %xmm3,%xmm12
+ movdqu %xmm12,0 + 0(%rdi)
+ movdqu 16 + 0(%rsi),%xmm12
+ pxor %xmm7,%xmm12
+ movdqu %xmm12,16 + 0(%rdi)
+ movdqu 32 + 0(%rsi),%xmm12
+ pxor %xmm11,%xmm12
+ movdqu %xmm12,32 + 0(%rdi)
+ movdqu 48 + 0(%rsi),%xmm12
+ pxor %xmm15,%xmm12
+ movdqu %xmm12,48 + 0(%rdi)
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 64(%rdi)
+ movdqu %xmm6,16 + 64(%rdi)
+ movdqu %xmm10,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+ movdqu 0 + 128(%rsi),%xmm3
+ movdqu 16 + 128(%rsi),%xmm7
+ movdqu 32 + 128(%rsi),%xmm11
+ movdqu 48 + 128(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 128(%rdi)
+ movdqu %xmm5,16 + 128(%rdi)
+ movdqu %xmm9,32 + 128(%rdi)
+ movdqu %xmm15,48 + 128(%rdi)
+
+ movdqa 0+80(%rbp),%xmm12
+ subq $192,%rbx
+ leaq 192(%rsi),%rsi
+ leaq 192(%rdi),%rdi
+
+
+L$open_sse_tail_64_dec_loop:
+ cmpq $16,%rbx
+ jb L$open_sse_tail_16_init
+ subq $16,%rbx
+ movdqu (%rsi),%xmm3
+ pxor %xmm3,%xmm0
+ movdqu %xmm0,(%rdi)
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+ movdqa %xmm4,%xmm0
+ movdqa %xmm8,%xmm4
+ movdqa %xmm12,%xmm8
+ jmp L$open_sse_tail_64_dec_loop
+L$open_sse_tail_16_init:
+ movdqa %xmm0,%xmm1
+
+
+L$open_sse_tail_16:
+ testq %rbx,%rbx
+ jz L$open_sse_finalize
+
+
+
+ pxor %xmm3,%xmm3
+ leaq -1(%rsi,%rbx,1),%rsi
+ movq %rbx,%r8
+L$open_sse_tail_16_compose:
+ pslldq $1,%xmm3
+ pinsrb $0,(%rsi),%xmm3
+ subq $1,%rsi
+ subq $1,%r8
+ jnz L$open_sse_tail_16_compose
+
+.byte 102,73,15,126,221
+ pextrq $1,%xmm3,%r14
+
+ pxor %xmm1,%xmm3
+
+
+L$open_sse_tail_16_extract:
+ pextrb $0,%xmm3,(%rdi)
+ psrldq $1,%xmm3
+ addq $1,%rdi
+ subq $1,%rbx
+ jne L$open_sse_tail_16_extract
+
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+L$open_sse_finalize:
+ addq 0+0+32(%rbp),%r10
+ adcq 8+0+32(%rbp),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+ movq %r10,%r13
+ movq %r11,%r14
+ movq %r12,%r15
+ subq $-5,%r10
+ sbbq $-1,%r11
+ sbbq $3,%r12
+ cmovcq %r13,%r10
+ cmovcq %r14,%r11
+ cmovcq %r15,%r12
+
+ addq 0+0+16(%rbp),%r10
+ adcq 8+0+16(%rbp),%r11
+
+
+ addq $288 + 0 + 32,%rsp
+
+
+ popq %r9
+
+ movq %r10,(%r9)
+ movq %r11,8(%r9)
+ popq %r15
+
+ popq %r14
+
+ popq %r13
+
+ popq %r12
+
+ popq %rbx
+
+ popq %rbp
+
+ ret
+
+L$open_sse_128:
+
+ movdqu L$chacha20_consts(%rip),%xmm0
+ movdqa %xmm0,%xmm1
+ movdqa %xmm0,%xmm2
+ movdqu 0(%r9),%xmm4
+ movdqa %xmm4,%xmm5
+ movdqa %xmm4,%xmm6
+ movdqu 16(%r9),%xmm8
+ movdqa %xmm8,%xmm9
+ movdqa %xmm8,%xmm10
+ movdqu 32(%r9),%xmm12
+ movdqa %xmm12,%xmm13
+ paddd L$sse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm14
+ paddd L$sse_inc(%rip),%xmm14
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm11
+ movdqa %xmm13,%xmm15
+ movq $10,%r10
+
+L$open_sse_128_rounds:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb L$rol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb L$rol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb L$rol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb L$rol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+
+ decq %r10
+ jnz L$open_sse_128_rounds
+ paddd L$chacha20_consts(%rip),%xmm0
+ paddd L$chacha20_consts(%rip),%xmm1
+ paddd L$chacha20_consts(%rip),%xmm2
+ paddd %xmm7,%xmm4
+ paddd %xmm7,%xmm5
+ paddd %xmm7,%xmm6
+ paddd %xmm11,%xmm9
+ paddd %xmm11,%xmm10
+ paddd %xmm15,%xmm13
+ paddd L$sse_inc(%rip),%xmm15
+ paddd %xmm15,%xmm14
+
+ pand L$clamp(%rip),%xmm0
+ movdqa %xmm0,0+0(%rbp)
+ movdqa %xmm4,0+16(%rbp)
+
+ movq %r8,%r8
+ call poly_hash_ad_internal
+L$open_sse_128_xor_hash:
+ cmpq $16,%rbx
+ jb L$open_sse_tail_16
+ subq $16,%rbx
+ addq 0+0(%rsi),%r10
+ adcq 8+0(%rsi),%r11
+ adcq $1,%r12
+
+
+ movdqu 0(%rsi),%xmm3
+ pxor %xmm3,%xmm1
+ movdqu %xmm1,0(%rdi)
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+ movdqa %xmm5,%xmm1
+ movdqa %xmm9,%xmm5
+ movdqa %xmm13,%xmm9
+ movdqa %xmm2,%xmm13
+ movdqa %xmm6,%xmm2
+ movdqa %xmm10,%xmm6
+ movdqa %xmm14,%xmm10
+ jmp L$open_sse_128_xor_hash
+
+
+
+
+
+
+
+
+
+.globl _chacha20_poly1305_seal
+.private_extern _chacha20_poly1305_seal
+
+.p2align 6
+_chacha20_poly1305_seal:
+
+_CET_ENDBR
+ pushq %rbp
+
+ pushq %rbx
+
+ pushq %r12
+
+ pushq %r13
+
+ pushq %r14
+
+ pushq %r15
+
+
+
+ pushq %r9
+
+ subq $288 + 0 + 32,%rsp
+
+ leaq 32(%rsp),%rbp
+ andq $-32,%rbp
+
+ movq 56(%r9),%rbx
+ addq %rdx,%rbx
+ movq %r8,0+0+32(%rbp)
+ movq %rbx,8+0+32(%rbp)
+ movq %rdx,%rbx
+
+ movl _OPENSSL_ia32cap_P+8(%rip),%eax
+ andl $288,%eax
+ xorl $288,%eax
+ jz chacha20_poly1305_seal_avx2
+
+ cmpq $128,%rbx
+ jbe L$seal_sse_128
+
+ movdqa L$chacha20_consts(%rip),%xmm0
+ movdqu 0(%r9),%xmm4
+ movdqu 16(%r9),%xmm8
+ movdqu 32(%r9),%xmm12
+
+ movdqa %xmm0,%xmm1
+ movdqa %xmm0,%xmm2
+ movdqa %xmm0,%xmm3
+ movdqa %xmm4,%xmm5
+ movdqa %xmm4,%xmm6
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm9
+ movdqa %xmm8,%xmm10
+ movdqa %xmm8,%xmm11
+ movdqa %xmm12,%xmm15
+ paddd L$sse_inc(%rip),%xmm12
+ movdqa %xmm12,%xmm14
+ paddd L$sse_inc(%rip),%xmm12
+ movdqa %xmm12,%xmm13
+ paddd L$sse_inc(%rip),%xmm12
+
+ movdqa %xmm4,0+48(%rbp)
+ movdqa %xmm8,0+64(%rbp)
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+ movdqa %xmm14,0+128(%rbp)
+ movdqa %xmm15,0+144(%rbp)
+ movq $10,%r10
+L$seal_sse_init_rounds:
+ movdqa %xmm8,0+80(%rbp)
+ movdqa L$rol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa L$rol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 0+80(%rbp),%xmm8
+.byte 102,15,58,15,255,4
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,12
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ movdqa %xmm8,0+80(%rbp)
+ movdqa L$rol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa L$rol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 0+80(%rbp),%xmm8
+.byte 102,15,58,15,255,12
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,4
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+
+ decq %r10
+ jnz L$seal_sse_init_rounds
+ paddd L$chacha20_consts(%rip),%xmm3
+ paddd 0+48(%rbp),%xmm7
+ paddd 0+64(%rbp),%xmm11
+ paddd 0+144(%rbp),%xmm15
+ paddd L$chacha20_consts(%rip),%xmm2
+ paddd 0+48(%rbp),%xmm6
+ paddd 0+64(%rbp),%xmm10
+ paddd 0+128(%rbp),%xmm14
+ paddd L$chacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd L$chacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+
+
+ pand L$clamp(%rip),%xmm3
+ movdqa %xmm3,0+0(%rbp)
+ movdqa %xmm7,0+16(%rbp)
+
+ movq %r8,%r8
+ call poly_hash_ad_internal
+ movdqu 0 + 0(%rsi),%xmm3
+ movdqu 16 + 0(%rsi),%xmm7
+ movdqu 32 + 0(%rsi),%xmm11
+ movdqu 48 + 0(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 0(%rdi)
+ movdqu %xmm6,16 + 0(%rdi)
+ movdqu %xmm10,32 + 0(%rdi)
+ movdqu %xmm15,48 + 0(%rdi)
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 64(%rdi)
+ movdqu %xmm5,16 + 64(%rdi)
+ movdqu %xmm9,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+
+ cmpq $192,%rbx
+ ja L$seal_sse_main_init
+ movq $128,%rcx
+ subq $128,%rbx
+ leaq 128(%rsi),%rsi
+ jmp L$seal_sse_128_tail_hash
+L$seal_sse_main_init:
+ movdqu 0 + 128(%rsi),%xmm3
+ movdqu 16 + 128(%rsi),%xmm7
+ movdqu 32 + 128(%rsi),%xmm11
+ movdqu 48 + 128(%rsi),%xmm15
+ pxor %xmm3,%xmm0
+ pxor %xmm7,%xmm4
+ pxor %xmm11,%xmm8
+ pxor %xmm12,%xmm15
+ movdqu %xmm0,0 + 128(%rdi)
+ movdqu %xmm4,16 + 128(%rdi)
+ movdqu %xmm8,32 + 128(%rdi)
+ movdqu %xmm15,48 + 128(%rdi)
+
+ movq $192,%rcx
+ subq $192,%rbx
+ leaq 192(%rsi),%rsi
+ movq $2,%rcx
+ movq $8,%r8
+ cmpq $64,%rbx
+ jbe L$seal_sse_tail_64
+ cmpq $128,%rbx
+ jbe L$seal_sse_tail_128
+ cmpq $192,%rbx
+ jbe L$seal_sse_tail_192
+
+L$seal_sse_main_loop:
+ movdqa L$chacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa %xmm0,%xmm2
+ movdqa %xmm4,%xmm6
+ movdqa %xmm8,%xmm10
+ movdqa %xmm0,%xmm3
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm11
+ movdqa 0+96(%rbp),%xmm15
+ paddd L$sse_inc(%rip),%xmm15
+ movdqa %xmm15,%xmm14
+ paddd L$sse_inc(%rip),%xmm14
+ movdqa %xmm14,%xmm13
+ paddd L$sse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd L$sse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+ movdqa %xmm14,0+128(%rbp)
+ movdqa %xmm15,0+144(%rbp)
+
+.p2align 5
+L$seal_sse_main_rounds:
+ movdqa %xmm8,0+80(%rbp)
+ movdqa L$rol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movdqa L$rol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 0+80(%rbp),%xmm8
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+.byte 102,15,58,15,255,4
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,12
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ movdqa %xmm8,0+80(%rbp)
+ movdqa L$rol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa L$rol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 0+80(%rbp),%xmm8
+.byte 102,15,58,15,255,12
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,4
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+
+ leaq 16(%rdi),%rdi
+ decq %r8
+ jge L$seal_sse_main_rounds
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+ decq %rcx
+ jg L$seal_sse_main_rounds
+ paddd L$chacha20_consts(%rip),%xmm3
+ paddd 0+48(%rbp),%xmm7
+ paddd 0+64(%rbp),%xmm11
+ paddd 0+144(%rbp),%xmm15
+ paddd L$chacha20_consts(%rip),%xmm2
+ paddd 0+48(%rbp),%xmm6
+ paddd 0+64(%rbp),%xmm10
+ paddd 0+128(%rbp),%xmm14
+ paddd L$chacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd L$chacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+
+ movdqa %xmm14,0+80(%rbp)
+ movdqa %xmm14,0+80(%rbp)
+ movdqu 0 + 0(%rsi),%xmm14
+ pxor %xmm3,%xmm14
+ movdqu %xmm14,0 + 0(%rdi)
+ movdqu 16 + 0(%rsi),%xmm14
+ pxor %xmm7,%xmm14
+ movdqu %xmm14,16 + 0(%rdi)
+ movdqu 32 + 0(%rsi),%xmm14
+ pxor %xmm11,%xmm14
+ movdqu %xmm14,32 + 0(%rdi)
+ movdqu 48 + 0(%rsi),%xmm14
+ pxor %xmm15,%xmm14
+ movdqu %xmm14,48 + 0(%rdi)
+
+ movdqa 0+80(%rbp),%xmm14
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 64(%rdi)
+ movdqu %xmm6,16 + 64(%rdi)
+ movdqu %xmm10,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+ movdqu 0 + 128(%rsi),%xmm3
+ movdqu 16 + 128(%rsi),%xmm7
+ movdqu 32 + 128(%rsi),%xmm11
+ movdqu 48 + 128(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 128(%rdi)
+ movdqu %xmm5,16 + 128(%rdi)
+ movdqu %xmm9,32 + 128(%rdi)
+ movdqu %xmm15,48 + 128(%rdi)
+
+ cmpq $256,%rbx
+ ja L$seal_sse_main_loop_xor
+
+ movq $192,%rcx
+ subq $192,%rbx
+ leaq 192(%rsi),%rsi
+ jmp L$seal_sse_128_tail_hash
+L$seal_sse_main_loop_xor:
+ movdqu 0 + 192(%rsi),%xmm3
+ movdqu 16 + 192(%rsi),%xmm7
+ movdqu 32 + 192(%rsi),%xmm11
+ movdqu 48 + 192(%rsi),%xmm15
+ pxor %xmm3,%xmm0
+ pxor %xmm7,%xmm4
+ pxor %xmm11,%xmm8
+ pxor %xmm12,%xmm15
+ movdqu %xmm0,0 + 192(%rdi)
+ movdqu %xmm4,16 + 192(%rdi)
+ movdqu %xmm8,32 + 192(%rdi)
+ movdqu %xmm15,48 + 192(%rdi)
+
+ leaq 256(%rsi),%rsi
+ subq $256,%rbx
+ movq $6,%rcx
+ movq $4,%r8
+ cmpq $192,%rbx
+ jg L$seal_sse_main_loop
+ movq %rbx,%rcx
+ testq %rbx,%rbx
+ je L$seal_sse_128_tail_hash
+ movq $6,%rcx
+ cmpq $128,%rbx
+ ja L$seal_sse_tail_192
+ cmpq $64,%rbx
+ ja L$seal_sse_tail_128
+
+L$seal_sse_tail_64:
+ movdqa L$chacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa 0+96(%rbp),%xmm12
+ paddd L$sse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+
+L$seal_sse_tail_64_rounds_and_x2hash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+L$seal_sse_tail_64_rounds_and_x1hash:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+ decq %rcx
+ jg L$seal_sse_tail_64_rounds_and_x2hash
+ decq %r8
+ jge L$seal_sse_tail_64_rounds_and_x1hash
+ paddd L$chacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+
+ jmp L$seal_sse_128_tail_xor
+
+L$seal_sse_tail_128:
+ movdqa L$chacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa 0+96(%rbp),%xmm13
+ paddd L$sse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd L$sse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+
+L$seal_sse_tail_128_rounds_and_x2hash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+L$seal_sse_tail_128_rounds_and_x1hash:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+
+ leaq 16(%rdi),%rdi
+ decq %rcx
+ jg L$seal_sse_tail_128_rounds_and_x2hash
+ decq %r8
+ jge L$seal_sse_tail_128_rounds_and_x1hash
+ paddd L$chacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd L$chacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+ movdqu 0 + 0(%rsi),%xmm3
+ movdqu 16 + 0(%rsi),%xmm7
+ movdqu 32 + 0(%rsi),%xmm11
+ movdqu 48 + 0(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 0(%rdi)
+ movdqu %xmm5,16 + 0(%rdi)
+ movdqu %xmm9,32 + 0(%rdi)
+ movdqu %xmm15,48 + 0(%rdi)
+
+ movq $64,%rcx
+ subq $64,%rbx
+ leaq 64(%rsi),%rsi
+ jmp L$seal_sse_128_tail_hash
+
+L$seal_sse_tail_192:
+ movdqa L$chacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa %xmm0,%xmm2
+ movdqa %xmm4,%xmm6
+ movdqa %xmm8,%xmm10
+ movdqa 0+96(%rbp),%xmm14
+ paddd L$sse_inc(%rip),%xmm14
+ movdqa %xmm14,%xmm13
+ paddd L$sse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd L$sse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+ movdqa %xmm14,0+128(%rbp)
+
+L$seal_sse_tail_192_rounds_and_x2hash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+L$seal_sse_tail_192_rounds_and_x1hash:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb L$rol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb L$rol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb L$rol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb L$rol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+
+ leaq 16(%rdi),%rdi
+ decq %rcx
+ jg L$seal_sse_tail_192_rounds_and_x2hash
+ decq %r8
+ jge L$seal_sse_tail_192_rounds_and_x1hash
+ paddd L$chacha20_consts(%rip),%xmm2
+ paddd 0+48(%rbp),%xmm6
+ paddd 0+64(%rbp),%xmm10
+ paddd 0+128(%rbp),%xmm14
+ paddd L$chacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd L$chacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+ movdqu 0 + 0(%rsi),%xmm3
+ movdqu 16 + 0(%rsi),%xmm7
+ movdqu 32 + 0(%rsi),%xmm11
+ movdqu 48 + 0(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 0(%rdi)
+ movdqu %xmm6,16 + 0(%rdi)
+ movdqu %xmm10,32 + 0(%rdi)
+ movdqu %xmm15,48 + 0(%rdi)
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 64(%rdi)
+ movdqu %xmm5,16 + 64(%rdi)
+ movdqu %xmm9,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+
+ movq $128,%rcx
+ subq $128,%rbx
+ leaq 128(%rsi),%rsi
+
+L$seal_sse_128_tail_hash:
+ cmpq $16,%rcx
+ jb L$seal_sse_128_tail_xor
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ subq $16,%rcx
+ leaq 16(%rdi),%rdi
+ jmp L$seal_sse_128_tail_hash
+
+L$seal_sse_128_tail_xor:
+ cmpq $16,%rbx
+ jb L$seal_sse_tail_16
+ subq $16,%rbx
+
+ movdqu 0(%rsi),%xmm3
+ pxor %xmm3,%xmm0
+ movdqu %xmm0,0(%rdi)
+
+ addq 0(%rdi),%r10
+ adcq 8(%rdi),%r11
+ adcq $1,%r12
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm8,%xmm4
+ movdqa %xmm12,%xmm8
+ movdqa %xmm1,%xmm12
+ movdqa %xmm5,%xmm1
+ movdqa %xmm9,%xmm5
+ movdqa %xmm13,%xmm9
+ jmp L$seal_sse_128_tail_xor
+
+L$seal_sse_tail_16:
+ testq %rbx,%rbx
+ jz L$process_blocks_of_extra_in
+
+ movq %rbx,%r8
+ movq %rbx,%rcx
+ leaq -1(%rsi,%rbx,1),%rsi
+ pxor %xmm15,%xmm15
+L$seal_sse_tail_16_compose:
+ pslldq $1,%xmm15
+ pinsrb $0,(%rsi),%xmm15
+ leaq -1(%rsi),%rsi
+ decq %rcx
+ jne L$seal_sse_tail_16_compose
+
+
+ pxor %xmm0,%xmm15
+
+
+ movq %rbx,%rcx
+ movdqu %xmm15,%xmm0
+L$seal_sse_tail_16_extract:
+ pextrb $0,%xmm0,(%rdi)
+ psrldq $1,%xmm0
+ addq $1,%rdi
+ subq $1,%rcx
+ jnz L$seal_sse_tail_16_extract
+
+
+
+
+
+
+
+
+ movq 288 + 0 + 32(%rsp),%r9
+ movq 56(%r9),%r14
+ movq 48(%r9),%r13
+ testq %r14,%r14
+ jz L$process_partial_block
+
+ movq $16,%r15
+ subq %rbx,%r15
+ cmpq %r15,%r14
+
+ jge L$load_extra_in
+ movq %r14,%r15
+
+L$load_extra_in:
+
+
+ leaq -1(%r13,%r15,1),%rsi
+
+
+ addq %r15,%r13
+ subq %r15,%r14
+ movq %r13,48(%r9)
+ movq %r14,56(%r9)
+
+
+
+ addq %r15,%r8
+
+
+ pxor %xmm11,%xmm11
+L$load_extra_load_loop:
+ pslldq $1,%xmm11
+ pinsrb $0,(%rsi),%xmm11
+ leaq -1(%rsi),%rsi
+ subq $1,%r15
+ jnz L$load_extra_load_loop
+
+
+
+
+ movq %rbx,%r15
+
+L$load_extra_shift_loop:
+ pslldq $1,%xmm11
+ subq $1,%r15
+ jnz L$load_extra_shift_loop
+
+
+
+
+ leaq L$and_masks(%rip),%r15
+ shlq $4,%rbx
+ pand -16(%r15,%rbx,1),%xmm15
+
+
+ por %xmm11,%xmm15
+
+
+
+.byte 102,77,15,126,253
+ pextrq $1,%xmm15,%r14
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+L$process_blocks_of_extra_in:
+
+ movq 288+32+0 (%rsp),%r9
+ movq 48(%r9),%rsi
+ movq 56(%r9),%r8
+ movq %r8,%rcx
+ shrq $4,%r8
+
+L$process_extra_hash_loop:
+ jz process_extra_in_trailer
+ addq 0+0(%rsi),%r10
+ adcq 8+0(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rsi),%rsi
+ subq $1,%r8
+ jmp L$process_extra_hash_loop
+process_extra_in_trailer:
+ andq $15,%rcx
+ movq %rcx,%rbx
+ jz L$do_length_block
+ leaq -1(%rsi,%rcx,1),%rsi
+
+L$process_extra_in_trailer_load:
+ pslldq $1,%xmm15
+ pinsrb $0,(%rsi),%xmm15
+ leaq -1(%rsi),%rsi
+ subq $1,%rcx
+ jnz L$process_extra_in_trailer_load
+
+L$process_partial_block:
+
+ leaq L$and_masks(%rip),%r15
+ shlq $4,%rbx
+ pand -16(%r15,%rbx,1),%xmm15
+.byte 102,77,15,126,253
+ pextrq $1,%xmm15,%r14
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+L$do_length_block:
+ addq 0+0+32(%rbp),%r10
+ adcq 8+0+32(%rbp),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+ movq %r10,%r13
+ movq %r11,%r14
+ movq %r12,%r15
+ subq $-5,%r10
+ sbbq $-1,%r11
+ sbbq $3,%r12
+ cmovcq %r13,%r10
+ cmovcq %r14,%r11
+ cmovcq %r15,%r12
+
+ addq 0+0+16(%rbp),%r10
+ adcq 8+0+16(%rbp),%r11
+
+
+ addq $288 + 0 + 32,%rsp
+
+
+ popq %r9
+
+ movq %r10,(%r9)
+ movq %r11,8(%r9)
+ popq %r15
+
+ popq %r14
+
+ popq %r13
+
+ popq %r12
+
+ popq %rbx
+
+ popq %rbp
+
+ ret
+
+L$seal_sse_128:
+
+ movdqu L$chacha20_consts(%rip),%xmm0
+ movdqa %xmm0,%xmm1
+ movdqa %xmm0,%xmm2
+ movdqu 0(%r9),%xmm4
+ movdqa %xmm4,%xmm5
+ movdqa %xmm4,%xmm6
+ movdqu 16(%r9),%xmm8
+ movdqa %xmm8,%xmm9
+ movdqa %xmm8,%xmm10
+ movdqu 32(%r9),%xmm14
+ movdqa %xmm14,%xmm12
+ paddd L$sse_inc(%rip),%xmm12
+ movdqa %xmm12,%xmm13
+ paddd L$sse_inc(%rip),%xmm13
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm11
+ movdqa %xmm12,%xmm15
+ movq $10,%r10
+
+L$seal_sse_128_rounds:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb L$rol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb L$rol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb L$rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb L$rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb L$rol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb L$rol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+
+ decq %r10
+ jnz L$seal_sse_128_rounds
+ paddd L$chacha20_consts(%rip),%xmm0
+ paddd L$chacha20_consts(%rip),%xmm1
+ paddd L$chacha20_consts(%rip),%xmm2
+ paddd %xmm7,%xmm4
+ paddd %xmm7,%xmm5
+ paddd %xmm7,%xmm6
+ paddd %xmm11,%xmm8
+ paddd %xmm11,%xmm9
+ paddd %xmm15,%xmm12
+ paddd L$sse_inc(%rip),%xmm15
+ paddd %xmm15,%xmm13
+
+ pand L$clamp(%rip),%xmm2
+ movdqa %xmm2,0+0(%rbp)
+ movdqa %xmm6,0+16(%rbp)
+
+ movq %r8,%r8
+ call poly_hash_ad_internal
+ jmp L$seal_sse_128_tail_xor
+
+
+
+
+
+.p2align 6
+chacha20_poly1305_open_avx2:
+
+
+
+
+
+
+
+
+
+
+
+
+ vzeroupper
+ vmovdqa L$chacha20_consts(%rip),%ymm0
+ vbroadcasti128 0(%r9),%ymm4
+ vbroadcasti128 16(%r9),%ymm8
+ vbroadcasti128 32(%r9),%ymm12
+ vpaddd L$avx2_init(%rip),%ymm12,%ymm12
+ cmpq $192,%rbx
+ jbe L$open_avx2_192
+ cmpq $320,%rbx
+ jbe L$open_avx2_320
+
+ vmovdqa %ymm4,0+64(%rbp)
+ vmovdqa %ymm8,0+96(%rbp)
+ vmovdqa %ymm12,0+160(%rbp)
+ movq $10,%r10
+L$open_avx2_init_rounds:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+
+ decq %r10
+ jne L$open_avx2_init_rounds
+ vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand L$clamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0+0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+
+ movq %r8,%r8
+ call poly_hash_ad_internal
+
+ xorq %rcx,%rcx
+L$open_avx2_init_hash:
+ addq 0+0(%rsi,%rcx,1),%r10
+ adcq 8+0(%rsi,%rcx,1),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ addq $16,%rcx
+ cmpq $64,%rcx
+ jne L$open_avx2_init_hash
+
+ vpxor 0(%rsi),%ymm0,%ymm0
+ vpxor 32(%rsi),%ymm4,%ymm4
+
+ vmovdqu %ymm0,0(%rdi)
+ vmovdqu %ymm4,32(%rdi)
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+ subq $64,%rbx
+L$open_avx2_main_loop:
+
+ cmpq $512,%rbx
+ jb L$open_avx2_main_loop_done
+ vmovdqa L$chacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa L$avx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm15
+ vpaddd %ymm15,%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm15,0+256(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm12,0+160(%rbp)
+
+ xorq %rcx,%rcx
+L$open_avx2_main_loop_rounds:
+ addq 0+0(%rsi,%rcx,1),%r10
+ adcq 8+0(%rsi,%rcx,1),%r11
+ adcq $1,%r12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa L$rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa L$rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ addq 0+16(%rsi,%rcx,1),%r10
+ adcq 8+16(%rsi,%rcx,1),%r11
+ adcq $1,%r12
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa L$rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ addq 0+32(%rsi,%rcx,1),%r10
+ adcq 8+32(%rsi,%rcx,1),%r11
+ adcq $1,%r12
+
+ leaq 48(%rcx),%rcx
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa L$rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+ cmpq $60*8,%rcx
+ jne L$open_avx2_main_loop_rounds
+ vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 0+64(%rbp),%ymm7,%ymm7
+ vpaddd 0+96(%rbp),%ymm11,%ymm11
+ vpaddd 0+256(%rbp),%ymm15,%ymm15
+ vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+
+ vmovdqa %ymm0,0+128(%rbp)
+ addq 0+60*8(%rsi),%r10
+ adcq 8+60*8(%rsi),%r11
+ adcq $1,%r12
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
+ vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vpxor 0+0(%rsi),%ymm0,%ymm0
+ vpxor 32+0(%rsi),%ymm3,%ymm3
+ vpxor 64+0(%rsi),%ymm7,%ymm7
+ vpxor 96+0(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm0,0+0(%rdi)
+ vmovdqu %ymm3,32+0(%rdi)
+ vmovdqu %ymm7,64+0(%rdi)
+ vmovdqu %ymm11,96+0(%rdi)
+
+ vmovdqa 0+128(%rbp),%ymm0
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm2,%ymm2
+ vpxor 64+128(%rsi),%ymm6,%ymm6
+ vpxor 96+128(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm2,32+128(%rdi)
+ vmovdqu %ymm6,64+128(%rdi)
+ vmovdqu %ymm10,96+128(%rdi)
+ addq 0+60*8+16(%rsi),%r10
+ adcq 8+60*8+16(%rsi),%r11
+ adcq $1,%r12
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+256(%rsi),%ymm3,%ymm3
+ vpxor 32+256(%rsi),%ymm1,%ymm1
+ vpxor 64+256(%rsi),%ymm5,%ymm5
+ vpxor 96+256(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+256(%rdi)
+ vmovdqu %ymm1,32+256(%rdi)
+ vmovdqu %ymm5,64+256(%rdi)
+ vmovdqu %ymm9,96+256(%rdi)
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm4
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm8
+ vpxor 0+384(%rsi),%ymm3,%ymm3
+ vpxor 32+384(%rsi),%ymm0,%ymm0
+ vpxor 64+384(%rsi),%ymm4,%ymm4
+ vpxor 96+384(%rsi),%ymm8,%ymm8
+ vmovdqu %ymm3,0+384(%rdi)
+ vmovdqu %ymm0,32+384(%rdi)
+ vmovdqu %ymm4,64+384(%rdi)
+ vmovdqu %ymm8,96+384(%rdi)
+
+ leaq 512(%rsi),%rsi
+ leaq 512(%rdi),%rdi
+ subq $512,%rbx
+ jmp L$open_avx2_main_loop
+L$open_avx2_main_loop_done:
+ testq %rbx,%rbx
+ vzeroupper
+ je L$open_sse_finalize
+
+ cmpq $384,%rbx
+ ja L$open_avx2_tail_512
+ cmpq $256,%rbx
+ ja L$open_avx2_tail_384
+ cmpq $128,%rbx
+ ja L$open_avx2_tail_256
+ vmovdqa L$chacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa L$avx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+
+ xorq %r8,%r8
+ movq %rbx,%rcx
+ andq $-16,%rcx
+ testq %rcx,%rcx
+ je L$open_avx2_tail_128_rounds
+L$open_avx2_tail_128_rounds_and_x1hash:
+ addq 0+0(%rsi,%r8,1),%r10
+ adcq 8+0(%rsi,%r8,1),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+L$open_avx2_tail_128_rounds:
+ addq $16,%r8
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+
+ cmpq %rcx,%r8
+ jb L$open_avx2_tail_128_rounds_and_x1hash
+ cmpq $160,%r8
+ jne L$open_avx2_tail_128_rounds
+ vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ jmp L$open_avx2_tail_128_xor
+
+L$open_avx2_tail_256:
+ vmovdqa L$chacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa L$avx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+
+ movq %rbx,0+128(%rbp)
+ movq %rbx,%rcx
+ subq $128,%rcx
+ shrq $4,%rcx
+ movq $10,%r8
+ cmpq $10,%rcx
+ cmovgq %r8,%rcx
+ movq %rsi,%rbx
+ xorq %r8,%r8
+L$open_avx2_tail_256_rounds_and_x1hash:
+ addq 0+0(%rbx),%r10
+ adcq 8+0(%rbx),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rbx),%rbx
+L$open_avx2_tail_256_rounds:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+
+ incq %r8
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb L$rol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb L$rol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+
+ cmpq %rcx,%r8
+ jb L$open_avx2_tail_256_rounds_and_x1hash
+ cmpq $10,%r8
+ jne L$open_avx2_tail_256_rounds
+ movq %rbx,%r8
+ subq %rsi,%rbx
+ movq %rbx,%rcx
+ movq 0+128(%rbp),%rbx
+L$open_avx2_tail_256_hash:
+ addq $16,%rcx
+ cmpq %rbx,%rcx
+ jg L$open_avx2_tail_256_done
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%r8),%r8
+ jmp L$open_avx2_tail_256_hash
+L$open_avx2_tail_256_done:
+ vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+0(%rsi),%ymm3,%ymm3
+ vpxor 32+0(%rsi),%ymm1,%ymm1
+ vpxor 64+0(%rsi),%ymm5,%ymm5
+ vpxor 96+0(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+0(%rdi)
+ vmovdqu %ymm1,32+0(%rdi)
+ vmovdqu %ymm5,64+0(%rdi)
+ vmovdqu %ymm9,96+0(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ leaq 128(%rsi),%rsi
+ leaq 128(%rdi),%rdi
+ subq $128,%rbx
+ jmp L$open_avx2_tail_128_xor
+
+L$open_avx2_tail_384:
+ vmovdqa L$chacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa L$avx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+
+ movq %rbx,0+128(%rbp)
+ movq %rbx,%rcx
+ subq $256,%rcx
+ shrq $4,%rcx
+ addq $6,%rcx
+ movq $10,%r8
+ cmpq $10,%rcx
+ cmovgq %r8,%rcx
+ movq %rsi,%rbx
+ xorq %r8,%r8
+L$open_avx2_tail_384_rounds_and_x2hash:
+ addq 0+0(%rbx),%r10
+ adcq 8+0(%rbx),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rbx),%rbx
+L$open_avx2_tail_384_rounds_and_x1hash:
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb L$rol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb L$rol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ addq 0+0(%rbx),%r10
+ adcq 8+0(%rbx),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rbx),%rbx
+ incq %r8
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb L$rol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb L$rol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+
+ cmpq %rcx,%r8
+ jb L$open_avx2_tail_384_rounds_and_x2hash
+ cmpq $10,%r8
+ jne L$open_avx2_tail_384_rounds_and_x1hash
+ movq %rbx,%r8
+ subq %rsi,%rbx
+ movq %rbx,%rcx
+ movq 0+128(%rbp),%rbx
+L$open_avx2_384_tail_hash:
+ addq $16,%rcx
+ cmpq %rbx,%rcx
+ jg L$open_avx2_384_tail_done
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%r8),%r8
+ jmp L$open_avx2_384_tail_hash
+L$open_avx2_384_tail_done:
+ vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+0(%rsi),%ymm3,%ymm3
+ vpxor 32+0(%rsi),%ymm2,%ymm2
+ vpxor 64+0(%rsi),%ymm6,%ymm6
+ vpxor 96+0(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+0(%rdi)
+ vmovdqu %ymm2,32+0(%rdi)
+ vmovdqu %ymm6,64+0(%rdi)
+ vmovdqu %ymm10,96+0(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm1,%ymm1
+ vpxor 64+128(%rsi),%ymm5,%ymm5
+ vpxor 96+128(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm1,32+128(%rdi)
+ vmovdqu %ymm5,64+128(%rdi)
+ vmovdqu %ymm9,96+128(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ leaq 256(%rsi),%rsi
+ leaq 256(%rdi),%rdi
+ subq $256,%rbx
+ jmp L$open_avx2_tail_128_xor
+
+L$open_avx2_tail_512:
+ vmovdqa L$chacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa L$avx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm15
+ vpaddd %ymm15,%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm15,0+256(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm12,0+160(%rbp)
+
+ xorq %rcx,%rcx
+ movq %rsi,%r8
+L$open_avx2_tail_512_rounds_and_x2hash:
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%r8),%r8
+L$open_avx2_tail_512_rounds_and_x1hash:
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa L$rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa L$rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa L$rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ addq 0+16(%r8),%r10
+ adcq 8+16(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%r8),%r8
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa L$rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+ incq %rcx
+ cmpq $4,%rcx
+ jl L$open_avx2_tail_512_rounds_and_x2hash
+ cmpq $10,%rcx
+ jne L$open_avx2_tail_512_rounds_and_x1hash
+ movq %rbx,%rcx
+ subq $384,%rcx
+ andq $-16,%rcx
+L$open_avx2_tail_512_hash:
+ testq %rcx,%rcx
+ je L$open_avx2_tail_512_done
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%r8),%r8
+ subq $16,%rcx
+ jmp L$open_avx2_tail_512_hash
+L$open_avx2_tail_512_done:
+ vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 0+64(%rbp),%ymm7,%ymm7
+ vpaddd 0+96(%rbp),%ymm11,%ymm11
+ vpaddd 0+256(%rbp),%ymm15,%ymm15
+ vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+
+ vmovdqa %ymm0,0+128(%rbp)
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
+ vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vpxor 0+0(%rsi),%ymm0,%ymm0
+ vpxor 32+0(%rsi),%ymm3,%ymm3
+ vpxor 64+0(%rsi),%ymm7,%ymm7
+ vpxor 96+0(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm0,0+0(%rdi)
+ vmovdqu %ymm3,32+0(%rdi)
+ vmovdqu %ymm7,64+0(%rdi)
+ vmovdqu %ymm11,96+0(%rdi)
+
+ vmovdqa 0+128(%rbp),%ymm0
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm2,%ymm2
+ vpxor 64+128(%rsi),%ymm6,%ymm6
+ vpxor 96+128(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm2,32+128(%rdi)
+ vmovdqu %ymm6,64+128(%rdi)
+ vmovdqu %ymm10,96+128(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+256(%rsi),%ymm3,%ymm3
+ vpxor 32+256(%rsi),%ymm1,%ymm1
+ vpxor 64+256(%rsi),%ymm5,%ymm5
+ vpxor 96+256(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+256(%rdi)
+ vmovdqu %ymm1,32+256(%rdi)
+ vmovdqu %ymm5,64+256(%rdi)
+ vmovdqu %ymm9,96+256(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ leaq 384(%rsi),%rsi
+ leaq 384(%rdi),%rdi
+ subq $384,%rbx
+L$open_avx2_tail_128_xor:
+ cmpq $32,%rbx
+ jb L$open_avx2_tail_32_xor
+ subq $32,%rbx
+ vpxor (%rsi),%ymm0,%ymm0
+ vmovdqu %ymm0,(%rdi)
+ leaq 32(%rsi),%rsi
+ leaq 32(%rdi),%rdi
+ vmovdqa %ymm4,%ymm0
+ vmovdqa %ymm8,%ymm4
+ vmovdqa %ymm12,%ymm8
+ jmp L$open_avx2_tail_128_xor
+L$open_avx2_tail_32_xor:
+ cmpq $16,%rbx
+ vmovdqa %xmm0,%xmm1
+ jb L$open_avx2_exit
+ subq $16,%rbx
+
+ vpxor (%rsi),%xmm0,%xmm1
+ vmovdqu %xmm1,(%rdi)
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+ vperm2i128 $0x11,%ymm0,%ymm0,%ymm0
+ vmovdqa %xmm0,%xmm1
+L$open_avx2_exit:
+ vzeroupper
+ jmp L$open_sse_tail_16
+
+L$open_avx2_192:
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vpaddd L$avx2_inc(%rip),%ymm12,%ymm13
+ vmovdqa %ymm12,%ymm11
+ vmovdqa %ymm13,%ymm15
+ movq $10,%r10
+L$open_avx2_192_rounds:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+
+ decq %r10
+ jne L$open_avx2_192_rounds
+ vpaddd %ymm2,%ymm0,%ymm0
+ vpaddd %ymm2,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpaddd %ymm6,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm8,%ymm8
+ vpaddd %ymm10,%ymm9,%ymm9
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm13,%ymm13
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand L$clamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0+0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
+L$open_avx2_short:
+ movq %r8,%r8
+ call poly_hash_ad_internal
+L$open_avx2_short_hash_and_xor_loop:
+ cmpq $32,%rbx
+ jb L$open_avx2_short_tail_32
+ subq $32,%rbx
+ addq 0+0(%rsi),%r10
+ adcq 8+0(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ addq 0+16(%rsi),%r10
+ adcq 8+16(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+ vpxor (%rsi),%ymm0,%ymm0
+ vmovdqu %ymm0,(%rdi)
+ leaq 32(%rsi),%rsi
+ leaq 32(%rdi),%rdi
+
+ vmovdqa %ymm4,%ymm0
+ vmovdqa %ymm8,%ymm4
+ vmovdqa %ymm12,%ymm8
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm5,%ymm1
+ vmovdqa %ymm9,%ymm5
+ vmovdqa %ymm13,%ymm9
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm6,%ymm2
+ jmp L$open_avx2_short_hash_and_xor_loop
+L$open_avx2_short_tail_32:
+ cmpq $16,%rbx
+ vmovdqa %xmm0,%xmm1
+ jb L$open_avx2_short_tail_32_exit
+ subq $16,%rbx
+ addq 0+0(%rsi),%r10
+ adcq 8+0(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ vpxor (%rsi),%xmm0,%xmm3
+ vmovdqu %xmm3,(%rdi)
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+ vextracti128 $1,%ymm0,%xmm1
+L$open_avx2_short_tail_32_exit:
+ vzeroupper
+ jmp L$open_sse_tail_16
+
+L$open_avx2_320:
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vpaddd L$avx2_inc(%rip),%ymm12,%ymm13
+ vpaddd L$avx2_inc(%rip),%ymm13,%ymm14
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ movq $10,%r10
+L$open_avx2_320_rounds:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb L$rol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb L$rol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb L$rol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb L$rol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+
+ decq %r10
+ jne L$open_avx2_320_rounds
+ vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpaddd %ymm11,%ymm8,%ymm8
+ vpaddd %ymm11,%ymm9,%ymm9
+ vpaddd %ymm11,%ymm10,%ymm10
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand L$clamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0+0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm9
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm13
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm6
+ jmp L$open_avx2_short
+
+
+
+
+
+.p2align 6
+chacha20_poly1305_seal_avx2:
+
+
+
+
+
+
+
+
+
+
+
+
+ vzeroupper
+ vmovdqa L$chacha20_consts(%rip),%ymm0
+ vbroadcasti128 0(%r9),%ymm4
+ vbroadcasti128 16(%r9),%ymm8
+ vbroadcasti128 32(%r9),%ymm12
+ vpaddd L$avx2_init(%rip),%ymm12,%ymm12
+ cmpq $192,%rbx
+ jbe L$seal_avx2_192
+ cmpq $320,%rbx
+ jbe L$seal_avx2_320
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm4,0+64(%rbp)
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm8,%ymm11
+ vmovdqa %ymm8,0+96(%rbp)
+ vmovdqa %ymm12,%ymm15
+ vpaddd L$avx2_inc(%rip),%ymm15,%ymm14
+ vpaddd L$avx2_inc(%rip),%ymm14,%ymm13
+ vpaddd L$avx2_inc(%rip),%ymm13,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ vmovdqa %ymm15,0+256(%rbp)
+ movq $10,%r10
+L$seal_avx2_init_rounds:
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa L$rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa L$rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa L$rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa L$rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+ decq %r10
+ jnz L$seal_avx2_init_rounds
+ vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 0+64(%rbp),%ymm7,%ymm7
+ vpaddd 0+96(%rbp),%ymm11,%ymm11
+ vpaddd 0+256(%rbp),%ymm15,%ymm15
+ vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm15
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm3
+ vpand L$clamp(%rip),%ymm15,%ymm15
+ vmovdqa %ymm15,0+0(%rbp)
+ movq %r8,%r8
+ call poly_hash_ad_internal
+
+ vpxor 0(%rsi),%ymm3,%ymm3
+ vpxor 32(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm3,0(%rdi)
+ vmovdqu %ymm11,32(%rdi)
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm15
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+64(%rsi),%ymm15,%ymm15
+ vpxor 32+64(%rsi),%ymm2,%ymm2
+ vpxor 64+64(%rsi),%ymm6,%ymm6
+ vpxor 96+64(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm15,0+64(%rdi)
+ vmovdqu %ymm2,32+64(%rdi)
+ vmovdqu %ymm6,64+64(%rdi)
+ vmovdqu %ymm10,96+64(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm15
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+192(%rsi),%ymm15,%ymm15
+ vpxor 32+192(%rsi),%ymm1,%ymm1
+ vpxor 64+192(%rsi),%ymm5,%ymm5
+ vpxor 96+192(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm15,0+192(%rdi)
+ vmovdqu %ymm1,32+192(%rdi)
+ vmovdqu %ymm5,64+192(%rdi)
+ vmovdqu %ymm9,96+192(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm15
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm15,%ymm8
+
+ leaq 320(%rsi),%rsi
+ subq $320,%rbx
+ movq $320,%rcx
+ cmpq $128,%rbx
+ jbe L$seal_avx2_short_hash_remainder
+ vpxor 0(%rsi),%ymm0,%ymm0
+ vpxor 32(%rsi),%ymm4,%ymm4
+ vpxor 64(%rsi),%ymm8,%ymm8
+ vpxor 96(%rsi),%ymm12,%ymm12
+ vmovdqu %ymm0,320(%rdi)
+ vmovdqu %ymm4,352(%rdi)
+ vmovdqu %ymm8,384(%rdi)
+ vmovdqu %ymm12,416(%rdi)
+ leaq 128(%rsi),%rsi
+ subq $128,%rbx
+ movq $8,%rcx
+ movq $2,%r8
+ cmpq $128,%rbx
+ jbe L$seal_avx2_tail_128
+ cmpq $256,%rbx
+ jbe L$seal_avx2_tail_256
+ cmpq $384,%rbx
+ jbe L$seal_avx2_tail_384
+ cmpq $512,%rbx
+ jbe L$seal_avx2_tail_512
+ vmovdqa L$chacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa L$avx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm15
+ vpaddd %ymm15,%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm15,0+256(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa L$rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa L$rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa L$rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa L$rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa L$rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa L$rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+
+ subq $16,%rdi
+ movq $9,%rcx
+ jmp L$seal_avx2_main_loop_rounds_entry
+.p2align 5
+L$seal_avx2_main_loop:
+ vmovdqa L$chacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa L$avx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm15
+ vpaddd %ymm15,%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm15,0+256(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm12,0+160(%rbp)
+
+ movq $10,%rcx
+.p2align 5
+L$seal_avx2_main_loop_rounds:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa L$rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa L$rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+L$seal_avx2_main_loop_rounds_entry:
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa L$rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ addq 0+32(%rdi),%r10
+ adcq 8+32(%rdi),%r11
+ adcq $1,%r12
+
+ leaq 48(%rdi),%rdi
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa L$rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+ decq %rcx
+ jne L$seal_avx2_main_loop_rounds
+ vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 0+64(%rbp),%ymm7,%ymm7
+ vpaddd 0+96(%rbp),%ymm11,%ymm11
+ vpaddd 0+256(%rbp),%ymm15,%ymm15
+ vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+
+ vmovdqa %ymm0,0+128(%rbp)
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
+ vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vpxor 0+0(%rsi),%ymm0,%ymm0
+ vpxor 32+0(%rsi),%ymm3,%ymm3
+ vpxor 64+0(%rsi),%ymm7,%ymm7
+ vpxor 96+0(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm0,0+0(%rdi)
+ vmovdqu %ymm3,32+0(%rdi)
+ vmovdqu %ymm7,64+0(%rdi)
+ vmovdqu %ymm11,96+0(%rdi)
+
+ vmovdqa 0+128(%rbp),%ymm0
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm2,%ymm2
+ vpxor 64+128(%rsi),%ymm6,%ymm6
+ vpxor 96+128(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm2,32+128(%rdi)
+ vmovdqu %ymm6,64+128(%rdi)
+ vmovdqu %ymm10,96+128(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+256(%rsi),%ymm3,%ymm3
+ vpxor 32+256(%rsi),%ymm1,%ymm1
+ vpxor 64+256(%rsi),%ymm5,%ymm5
+ vpxor 96+256(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+256(%rdi)
+ vmovdqu %ymm1,32+256(%rdi)
+ vmovdqu %ymm5,64+256(%rdi)
+ vmovdqu %ymm9,96+256(%rdi)
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm4
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm8
+ vpxor 0+384(%rsi),%ymm3,%ymm3
+ vpxor 32+384(%rsi),%ymm0,%ymm0
+ vpxor 64+384(%rsi),%ymm4,%ymm4
+ vpxor 96+384(%rsi),%ymm8,%ymm8
+ vmovdqu %ymm3,0+384(%rdi)
+ vmovdqu %ymm0,32+384(%rdi)
+ vmovdqu %ymm4,64+384(%rdi)
+ vmovdqu %ymm8,96+384(%rdi)
+
+ leaq 512(%rsi),%rsi
+ subq $512,%rbx
+ cmpq $512,%rbx
+ jg L$seal_avx2_main_loop
+
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+ movq $10,%rcx
+ xorq %r8,%r8
+
+ cmpq $384,%rbx
+ ja L$seal_avx2_tail_512
+ cmpq $256,%rbx
+ ja L$seal_avx2_tail_384
+ cmpq $128,%rbx
+ ja L$seal_avx2_tail_256
+
+L$seal_avx2_tail_128:
+ vmovdqa L$chacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa L$avx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+
+L$seal_avx2_tail_128_rounds_and_3xhash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+L$seal_avx2_tail_128_rounds_and_2xhash:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+ decq %rcx
+ jg L$seal_avx2_tail_128_rounds_and_3xhash
+ decq %r8
+ jge L$seal_avx2_tail_128_rounds_and_2xhash
+ vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ jmp L$seal_avx2_short_loop
+
+L$seal_avx2_tail_256:
+ vmovdqa L$chacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa L$avx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+
+L$seal_avx2_tail_256_rounds_and_3xhash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+L$seal_avx2_tail_256_rounds_and_2xhash:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+ decq %rcx
+ jg L$seal_avx2_tail_256_rounds_and_3xhash
+ decq %r8
+ jge L$seal_avx2_tail_256_rounds_and_2xhash
+ vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+0(%rsi),%ymm3,%ymm3
+ vpxor 32+0(%rsi),%ymm1,%ymm1
+ vpxor 64+0(%rsi),%ymm5,%ymm5
+ vpxor 96+0(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+0(%rdi)
+ vmovdqu %ymm1,32+0(%rdi)
+ vmovdqu %ymm5,64+0(%rdi)
+ vmovdqu %ymm9,96+0(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ movq $128,%rcx
+ leaq 128(%rsi),%rsi
+ subq $128,%rbx
+ jmp L$seal_avx2_short_hash_remainder
+
+L$seal_avx2_tail_384:
+ vmovdqa L$chacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa L$avx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+
+L$seal_avx2_tail_384_rounds_and_3xhash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+L$seal_avx2_tail_384_rounds_and_2xhash:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb L$rol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb L$rol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb L$rol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb L$rol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+
+ leaq 32(%rdi),%rdi
+ decq %rcx
+ jg L$seal_avx2_tail_384_rounds_and_3xhash
+ decq %r8
+ jge L$seal_avx2_tail_384_rounds_and_2xhash
+ vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+0(%rsi),%ymm3,%ymm3
+ vpxor 32+0(%rsi),%ymm2,%ymm2
+ vpxor 64+0(%rsi),%ymm6,%ymm6
+ vpxor 96+0(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+0(%rdi)
+ vmovdqu %ymm2,32+0(%rdi)
+ vmovdqu %ymm6,64+0(%rdi)
+ vmovdqu %ymm10,96+0(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm1,%ymm1
+ vpxor 64+128(%rsi),%ymm5,%ymm5
+ vpxor 96+128(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm1,32+128(%rdi)
+ vmovdqu %ymm5,64+128(%rdi)
+ vmovdqu %ymm9,96+128(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ movq $256,%rcx
+ leaq 256(%rsi),%rsi
+ subq $256,%rbx
+ jmp L$seal_avx2_short_hash_remainder
+
+L$seal_avx2_tail_512:
+ vmovdqa L$chacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa L$avx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm15
+ vpaddd %ymm15,%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm15,0+256(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm12,0+160(%rbp)
+
+L$seal_avx2_tail_512_rounds_and_3xhash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+L$seal_avx2_tail_512_rounds_and_2xhash:
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa L$rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa L$rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa L$rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa L$rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ addq %rax,%r15
+ adcq %rdx,%r9
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+ decq %rcx
+ jg L$seal_avx2_tail_512_rounds_and_3xhash
+ decq %r8
+ jge L$seal_avx2_tail_512_rounds_and_2xhash
+ vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 0+64(%rbp),%ymm7,%ymm7
+ vpaddd 0+96(%rbp),%ymm11,%ymm11
+ vpaddd 0+256(%rbp),%ymm15,%ymm15
+ vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+
+ vmovdqa %ymm0,0+128(%rbp)
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
+ vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vpxor 0+0(%rsi),%ymm0,%ymm0
+ vpxor 32+0(%rsi),%ymm3,%ymm3
+ vpxor 64+0(%rsi),%ymm7,%ymm7
+ vpxor 96+0(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm0,0+0(%rdi)
+ vmovdqu %ymm3,32+0(%rdi)
+ vmovdqu %ymm7,64+0(%rdi)
+ vmovdqu %ymm11,96+0(%rdi)
+
+ vmovdqa 0+128(%rbp),%ymm0
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm2,%ymm2
+ vpxor 64+128(%rsi),%ymm6,%ymm6
+ vpxor 96+128(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm2,32+128(%rdi)
+ vmovdqu %ymm6,64+128(%rdi)
+ vmovdqu %ymm10,96+128(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+256(%rsi),%ymm3,%ymm3
+ vpxor 32+256(%rsi),%ymm1,%ymm1
+ vpxor 64+256(%rsi),%ymm5,%ymm5
+ vpxor 96+256(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+256(%rdi)
+ vmovdqu %ymm1,32+256(%rdi)
+ vmovdqu %ymm5,64+256(%rdi)
+ vmovdqu %ymm9,96+256(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ movq $384,%rcx
+ leaq 384(%rsi),%rsi
+ subq $384,%rbx
+ jmp L$seal_avx2_short_hash_remainder
+
+L$seal_avx2_320:
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vpaddd L$avx2_inc(%rip),%ymm12,%ymm13
+ vpaddd L$avx2_inc(%rip),%ymm13,%ymm14
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ movq $10,%r10
+L$seal_avx2_320_rounds:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb L$rol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb L$rol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb L$rol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb L$rol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+
+ decq %r10
+ jne L$seal_avx2_320_rounds
+ vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpaddd %ymm11,%ymm8,%ymm8
+ vpaddd %ymm11,%ymm9,%ymm9
+ vpaddd %ymm11,%ymm10,%ymm10
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand L$clamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0+0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm9
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm13
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm6
+ jmp L$seal_avx2_short
+
+L$seal_avx2_192:
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vpaddd L$avx2_inc(%rip),%ymm12,%ymm13
+ vmovdqa %ymm12,%ymm11
+ vmovdqa %ymm13,%ymm15
+ movq $10,%r10
+L$seal_avx2_192_rounds:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb L$rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb L$rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+
+ decq %r10
+ jne L$seal_avx2_192_rounds
+ vpaddd %ymm2,%ymm0,%ymm0
+ vpaddd %ymm2,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpaddd %ymm6,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm8,%ymm8
+ vpaddd %ymm10,%ymm9,%ymm9
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm13,%ymm13
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand L$clamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0+0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
+L$seal_avx2_short:
+ movq %r8,%r8
+ call poly_hash_ad_internal
+ xorq %rcx,%rcx
+L$seal_avx2_short_hash_remainder:
+ cmpq $16,%rcx
+ jb L$seal_avx2_short_loop
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ subq $16,%rcx
+ addq $16,%rdi
+ jmp L$seal_avx2_short_hash_remainder
+L$seal_avx2_short_loop:
+ cmpq $32,%rbx
+ jb L$seal_avx2_short_tail
+ subq $32,%rbx
+
+ vpxor (%rsi),%ymm0,%ymm0
+ vmovdqu %ymm0,(%rdi)
+ leaq 32(%rsi),%rsi
+
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+
+ vmovdqa %ymm4,%ymm0
+ vmovdqa %ymm8,%ymm4
+ vmovdqa %ymm12,%ymm8
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm5,%ymm1
+ vmovdqa %ymm9,%ymm5
+ vmovdqa %ymm13,%ymm9
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm6,%ymm2
+ jmp L$seal_avx2_short_loop
+L$seal_avx2_short_tail:
+ cmpq $16,%rbx
+ jb L$seal_avx2_exit
+ subq $16,%rbx
+ vpxor (%rsi),%xmm0,%xmm3
+ vmovdqu %xmm3,(%rdi)
+ leaq 16(%rsi),%rsi
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+ vextracti128 $1,%ymm0,%xmm0
+L$seal_avx2_exit:
+ vzeroupper
+ jmp L$seal_sse_tail_16
+
+
+#endif
diff --git a/gen/crypto/chacha20_poly1305_x86_64-linux.S b/gen/crypto/chacha20_poly1305_x86_64-linux.S
new file mode 100644
index 0000000..ac38f8f
--- /dev/null
+++ b/gen/crypto/chacha20_poly1305_x86_64-linux.S
@@ -0,0 +1,8918 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text
+.extern OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+
+chacha20_poly1305_constants:
+
+.section .rodata
+.align 64
+.Lchacha20_consts:
+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.Lrol8:
+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.Lrol16:
+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.Lavx2_init:
+.long 0,0,0,0
+.Lsse_inc:
+.long 1,0,0,0
+.Lavx2_inc:
+.long 2,0,0,0,2,0,0,0
+.Lclamp:
+.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
+.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
+.align 16
+.Land_masks:
+.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+.text
+
+.type poly_hash_ad_internal,@function
+.align 64
+poly_hash_ad_internal:
+.cfi_startproc
+.cfi_def_cfa rsp, 8
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %r12,%r12
+ cmpq $13,%r8
+ jne .Lhash_ad_loop
+.Lpoly_fast_tls_ad:
+
+ movq (%rcx),%r10
+ movq 5(%rcx),%r11
+ shrq $24,%r11
+ movq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ ret
+.Lhash_ad_loop:
+
+ cmpq $16,%r8
+ jb .Lhash_ad_tail
+ addq 0+0(%rcx),%r10
+ adcq 8+0(%rcx),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rcx),%rcx
+ subq $16,%r8
+ jmp .Lhash_ad_loop
+.Lhash_ad_tail:
+ cmpq $0,%r8
+ je .Lhash_ad_done
+
+ xorq %r13,%r13
+ xorq %r14,%r14
+ xorq %r15,%r15
+ addq %r8,%rcx
+.Lhash_ad_tail_loop:
+ shldq $8,%r13,%r14
+ shlq $8,%r13
+ movzbq -1(%rcx),%r15
+ xorq %r15,%r13
+ decq %rcx
+ decq %r8
+ jne .Lhash_ad_tail_loop
+
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+.Lhash_ad_done:
+ ret
+.cfi_endproc
+.size poly_hash_ad_internal, .-poly_hash_ad_internal
+
+.globl chacha20_poly1305_open
+.hidden chacha20_poly1305_open
+.type chacha20_poly1305_open,@function
+.align 64
+chacha20_poly1305_open:
+.cfi_startproc
+_CET_ENDBR
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+
+
+ pushq %r9
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r9,-64
+ subq $288 + 0 + 32,%rsp
+.cfi_adjust_cfa_offset 288 + 32
+
+ leaq 32(%rsp),%rbp
+ andq $-32,%rbp
+
+ movq %rdx,%rbx
+ movq %r8,0+0+32(%rbp)
+ movq %rbx,8+0+32(%rbp)
+
+ movl OPENSSL_ia32cap_P+8(%rip),%eax
+ andl $288,%eax
+ xorl $288,%eax
+ jz chacha20_poly1305_open_avx2
+
+ cmpq $128,%rbx
+ jbe .Lopen_sse_128
+
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqu 0(%r9),%xmm4
+ movdqu 16(%r9),%xmm8
+ movdqu 32(%r9),%xmm12
+
+ movdqa %xmm12,%xmm7
+
+ movdqa %xmm4,0+48(%rbp)
+ movdqa %xmm8,0+64(%rbp)
+ movdqa %xmm12,0+96(%rbp)
+ movq $10,%r10
+.Lopen_sse_init_rounds:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+
+ decq %r10
+ jne .Lopen_sse_init_rounds
+
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+
+ pand .Lclamp(%rip),%xmm0
+ movdqa %xmm0,0+0(%rbp)
+ movdqa %xmm4,0+16(%rbp)
+
+ movq %r8,%r8
+ call poly_hash_ad_internal
+.Lopen_sse_main_loop:
+ cmpq $256,%rbx
+ jb .Lopen_sse_tail
+
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa %xmm0,%xmm2
+ movdqa %xmm4,%xmm6
+ movdqa %xmm8,%xmm10
+ movdqa %xmm0,%xmm3
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm11
+ movdqa 0+96(%rbp),%xmm15
+ paddd .Lsse_inc(%rip),%xmm15
+ movdqa %xmm15,%xmm14
+ paddd .Lsse_inc(%rip),%xmm14
+ movdqa %xmm14,%xmm13
+ paddd .Lsse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+ movdqa %xmm14,0+128(%rbp)
+ movdqa %xmm15,0+144(%rbp)
+
+
+
+ movq $4,%rcx
+ movq %rsi,%r8
+.Lopen_sse_main_loop_rounds:
+ movdqa %xmm8,0+80(%rbp)
+ movdqa .Lrol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+
+ leaq 16(%r8),%r8
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movdqa .Lrol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 0+80(%rbp),%xmm8
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+.byte 102,15,58,15,255,4
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,12
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ movdqa %xmm8,0+80(%rbp)
+ movdqa .Lrol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa .Lrol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 0+80(%rbp),%xmm8
+.byte 102,15,58,15,255,12
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,4
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+
+ decq %rcx
+ jge .Lopen_sse_main_loop_rounds
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%r8),%r8
+ cmpq $-6,%rcx
+ jg .Lopen_sse_main_loop_rounds
+ paddd .Lchacha20_consts(%rip),%xmm3
+ paddd 0+48(%rbp),%xmm7
+ paddd 0+64(%rbp),%xmm11
+ paddd 0+144(%rbp),%xmm15
+ paddd .Lchacha20_consts(%rip),%xmm2
+ paddd 0+48(%rbp),%xmm6
+ paddd 0+64(%rbp),%xmm10
+ paddd 0+128(%rbp),%xmm14
+ paddd .Lchacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+ movdqa %xmm12,0+80(%rbp)
+ movdqu 0 + 0(%rsi),%xmm12
+ pxor %xmm3,%xmm12
+ movdqu %xmm12,0 + 0(%rdi)
+ movdqu 16 + 0(%rsi),%xmm12
+ pxor %xmm7,%xmm12
+ movdqu %xmm12,16 + 0(%rdi)
+ movdqu 32 + 0(%rsi),%xmm12
+ pxor %xmm11,%xmm12
+ movdqu %xmm12,32 + 0(%rdi)
+ movdqu 48 + 0(%rsi),%xmm12
+ pxor %xmm15,%xmm12
+ movdqu %xmm12,48 + 0(%rdi)
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 64(%rdi)
+ movdqu %xmm6,16 + 64(%rdi)
+ movdqu %xmm10,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+ movdqu 0 + 128(%rsi),%xmm3
+ movdqu 16 + 128(%rsi),%xmm7
+ movdqu 32 + 128(%rsi),%xmm11
+ movdqu 48 + 128(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 128(%rdi)
+ movdqu %xmm5,16 + 128(%rdi)
+ movdqu %xmm9,32 + 128(%rdi)
+ movdqu %xmm15,48 + 128(%rdi)
+ movdqu 0 + 192(%rsi),%xmm3
+ movdqu 16 + 192(%rsi),%xmm7
+ movdqu 32 + 192(%rsi),%xmm11
+ movdqu 48 + 192(%rsi),%xmm15
+ pxor %xmm3,%xmm0
+ pxor %xmm7,%xmm4
+ pxor %xmm11,%xmm8
+ pxor 0+80(%rbp),%xmm15
+ movdqu %xmm0,0 + 192(%rdi)
+ movdqu %xmm4,16 + 192(%rdi)
+ movdqu %xmm8,32 + 192(%rdi)
+ movdqu %xmm15,48 + 192(%rdi)
+
+ leaq 256(%rsi),%rsi
+ leaq 256(%rdi),%rdi
+ subq $256,%rbx
+ jmp .Lopen_sse_main_loop
+.Lopen_sse_tail:
+
+ testq %rbx,%rbx
+ jz .Lopen_sse_finalize
+ cmpq $192,%rbx
+ ja .Lopen_sse_tail_256
+ cmpq $128,%rbx
+ ja .Lopen_sse_tail_192
+ cmpq $64,%rbx
+ ja .Lopen_sse_tail_128
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa 0+96(%rbp),%xmm12
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+
+ xorq %r8,%r8
+ movq %rbx,%rcx
+ cmpq $16,%rcx
+ jb .Lopen_sse_tail_64_rounds
+.Lopen_sse_tail_64_rounds_and_x1hash:
+ addq 0+0(%rsi,%r8,1),%r10
+ adcq 8+0(%rsi,%r8,1),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ subq $16,%rcx
+.Lopen_sse_tail_64_rounds:
+ addq $16,%r8
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+
+ cmpq $16,%rcx
+ jae .Lopen_sse_tail_64_rounds_and_x1hash
+ cmpq $160,%r8
+ jne .Lopen_sse_tail_64_rounds
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+
+ jmp .Lopen_sse_tail_64_dec_loop
+
+.Lopen_sse_tail_128:
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa 0+96(%rbp),%xmm13
+ paddd .Lsse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+
+ movq %rbx,%rcx
+ andq $-16,%rcx
+ xorq %r8,%r8
+.Lopen_sse_tail_128_rounds_and_x1hash:
+ addq 0+0(%rsi,%r8,1),%r10
+ adcq 8+0(%rsi,%r8,1),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+.Lopen_sse_tail_128_rounds:
+ addq $16,%r8
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+
+ cmpq %rcx,%r8
+ jb .Lopen_sse_tail_128_rounds_and_x1hash
+ cmpq $160,%r8
+ jne .Lopen_sse_tail_128_rounds
+ paddd .Lchacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+ movdqu 0 + 0(%rsi),%xmm3
+ movdqu 16 + 0(%rsi),%xmm7
+ movdqu 32 + 0(%rsi),%xmm11
+ movdqu 48 + 0(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 0(%rdi)
+ movdqu %xmm5,16 + 0(%rdi)
+ movdqu %xmm9,32 + 0(%rdi)
+ movdqu %xmm15,48 + 0(%rdi)
+
+ subq $64,%rbx
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+ jmp .Lopen_sse_tail_64_dec_loop
+
+.Lopen_sse_tail_192:
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa %xmm0,%xmm2
+ movdqa %xmm4,%xmm6
+ movdqa %xmm8,%xmm10
+ movdqa 0+96(%rbp),%xmm14
+ paddd .Lsse_inc(%rip),%xmm14
+ movdqa %xmm14,%xmm13
+ paddd .Lsse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+ movdqa %xmm14,0+128(%rbp)
+
+ movq %rbx,%rcx
+ movq $160,%r8
+ cmpq $160,%rcx
+ cmovgq %r8,%rcx
+ andq $-16,%rcx
+ xorq %r8,%r8
+.Lopen_sse_tail_192_rounds_and_x1hash:
+ addq 0+0(%rsi,%r8,1),%r10
+ adcq 8+0(%rsi,%r8,1),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+.Lopen_sse_tail_192_rounds:
+ addq $16,%r8
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+
+ cmpq %rcx,%r8
+ jb .Lopen_sse_tail_192_rounds_and_x1hash
+ cmpq $160,%r8
+ jne .Lopen_sse_tail_192_rounds
+ cmpq $176,%rbx
+ jb .Lopen_sse_tail_192_finish
+ addq 0+160(%rsi),%r10
+ adcq 8+160(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ cmpq $192,%rbx
+ jb .Lopen_sse_tail_192_finish
+ addq 0+176(%rsi),%r10
+ adcq 8+176(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+.Lopen_sse_tail_192_finish:
+ paddd .Lchacha20_consts(%rip),%xmm2
+ paddd 0+48(%rbp),%xmm6
+ paddd 0+64(%rbp),%xmm10
+ paddd 0+128(%rbp),%xmm14
+ paddd .Lchacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+ movdqu 0 + 0(%rsi),%xmm3
+ movdqu 16 + 0(%rsi),%xmm7
+ movdqu 32 + 0(%rsi),%xmm11
+ movdqu 48 + 0(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 0(%rdi)
+ movdqu %xmm6,16 + 0(%rdi)
+ movdqu %xmm10,32 + 0(%rdi)
+ movdqu %xmm15,48 + 0(%rdi)
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 64(%rdi)
+ movdqu %xmm5,16 + 64(%rdi)
+ movdqu %xmm9,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+
+ subq $128,%rbx
+ leaq 128(%rsi),%rsi
+ leaq 128(%rdi),%rdi
+ jmp .Lopen_sse_tail_64_dec_loop
+
+.Lopen_sse_tail_256:
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa %xmm0,%xmm2
+ movdqa %xmm4,%xmm6
+ movdqa %xmm8,%xmm10
+ movdqa %xmm0,%xmm3
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm11
+ movdqa 0+96(%rbp),%xmm15
+ paddd .Lsse_inc(%rip),%xmm15
+ movdqa %xmm15,%xmm14
+ paddd .Lsse_inc(%rip),%xmm14
+ movdqa %xmm14,%xmm13
+ paddd .Lsse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+ movdqa %xmm14,0+128(%rbp)
+ movdqa %xmm15,0+144(%rbp)
+
+ xorq %r8,%r8
+.Lopen_sse_tail_256_rounds_and_x1hash:
+ addq 0+0(%rsi,%r8,1),%r10
+ adcq 8+0(%rsi,%r8,1),%r11
+ adcq $1,%r12
+ movdqa %xmm11,0+80(%rbp)
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm4
+ pxor %xmm11,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm4
+ pxor %xmm11,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm5
+ pxor %xmm11,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm5
+ pxor %xmm11,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm6
+ pxor %xmm11,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm6
+ pxor %xmm11,%xmm6
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+ movdqa 0+80(%rbp),%xmm11
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movdqa %xmm9,0+80(%rbp)
+ paddd %xmm7,%xmm3
+ pxor %xmm3,%xmm15
+ pshufb .Lrol16(%rip),%xmm15
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm9
+ pslld $12,%xmm9
+ psrld $20,%xmm7
+ pxor %xmm9,%xmm7
+ paddd %xmm7,%xmm3
+ pxor %xmm3,%xmm15
+ pshufb .Lrol8(%rip),%xmm15
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm9
+ pslld $7,%xmm9
+ psrld $25,%xmm7
+ pxor %xmm9,%xmm7
+.byte 102,15,58,15,255,4
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,12
+ movdqa 0+80(%rbp),%xmm9
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ movdqa %xmm11,0+80(%rbp)
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm4
+ pxor %xmm11,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm4
+ pxor %xmm11,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm5
+ pxor %xmm11,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm5
+ pxor %xmm11,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm6
+ pxor %xmm11,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm6
+ pxor %xmm11,%xmm6
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+ movdqa 0+80(%rbp),%xmm11
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ movdqa %xmm9,0+80(%rbp)
+ paddd %xmm7,%xmm3
+ pxor %xmm3,%xmm15
+ pshufb .Lrol16(%rip),%xmm15
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm9
+ pslld $12,%xmm9
+ psrld $20,%xmm7
+ pxor %xmm9,%xmm7
+ paddd %xmm7,%xmm3
+ pxor %xmm3,%xmm15
+ pshufb .Lrol8(%rip),%xmm15
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm9
+ pslld $7,%xmm9
+ psrld $25,%xmm7
+ pxor %xmm9,%xmm7
+.byte 102,15,58,15,255,12
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,4
+ movdqa 0+80(%rbp),%xmm9
+
+ addq $16,%r8
+ cmpq $160,%r8
+ jb .Lopen_sse_tail_256_rounds_and_x1hash
+
+ movq %rbx,%rcx
+ andq $-16,%rcx
+.Lopen_sse_tail_256_hash:
+ addq 0+0(%rsi,%r8,1),%r10
+ adcq 8+0(%rsi,%r8,1),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ addq $16,%r8
+ cmpq %rcx,%r8
+ jb .Lopen_sse_tail_256_hash
+ paddd .Lchacha20_consts(%rip),%xmm3
+ paddd 0+48(%rbp),%xmm7
+ paddd 0+64(%rbp),%xmm11
+ paddd 0+144(%rbp),%xmm15
+ paddd .Lchacha20_consts(%rip),%xmm2
+ paddd 0+48(%rbp),%xmm6
+ paddd 0+64(%rbp),%xmm10
+ paddd 0+128(%rbp),%xmm14
+ paddd .Lchacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+ movdqa %xmm12,0+80(%rbp)
+ movdqu 0 + 0(%rsi),%xmm12
+ pxor %xmm3,%xmm12
+ movdqu %xmm12,0 + 0(%rdi)
+ movdqu 16 + 0(%rsi),%xmm12
+ pxor %xmm7,%xmm12
+ movdqu %xmm12,16 + 0(%rdi)
+ movdqu 32 + 0(%rsi),%xmm12
+ pxor %xmm11,%xmm12
+ movdqu %xmm12,32 + 0(%rdi)
+ movdqu 48 + 0(%rsi),%xmm12
+ pxor %xmm15,%xmm12
+ movdqu %xmm12,48 + 0(%rdi)
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 64(%rdi)
+ movdqu %xmm6,16 + 64(%rdi)
+ movdqu %xmm10,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+ movdqu 0 + 128(%rsi),%xmm3
+ movdqu 16 + 128(%rsi),%xmm7
+ movdqu 32 + 128(%rsi),%xmm11
+ movdqu 48 + 128(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 128(%rdi)
+ movdqu %xmm5,16 + 128(%rdi)
+ movdqu %xmm9,32 + 128(%rdi)
+ movdqu %xmm15,48 + 128(%rdi)
+
+ movdqa 0+80(%rbp),%xmm12
+ subq $192,%rbx
+ leaq 192(%rsi),%rsi
+ leaq 192(%rdi),%rdi
+
+
+.Lopen_sse_tail_64_dec_loop:
+ cmpq $16,%rbx
+ jb .Lopen_sse_tail_16_init
+ subq $16,%rbx
+ movdqu (%rsi),%xmm3
+ pxor %xmm3,%xmm0
+ movdqu %xmm0,(%rdi)
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+ movdqa %xmm4,%xmm0
+ movdqa %xmm8,%xmm4
+ movdqa %xmm12,%xmm8
+ jmp .Lopen_sse_tail_64_dec_loop
+.Lopen_sse_tail_16_init:
+ movdqa %xmm0,%xmm1
+
+
+.Lopen_sse_tail_16:
+ testq %rbx,%rbx
+ jz .Lopen_sse_finalize
+
+
+
+ pxor %xmm3,%xmm3
+ leaq -1(%rsi,%rbx,1),%rsi
+ movq %rbx,%r8
+.Lopen_sse_tail_16_compose:
+ pslldq $1,%xmm3
+ pinsrb $0,(%rsi),%xmm3
+ subq $1,%rsi
+ subq $1,%r8
+ jnz .Lopen_sse_tail_16_compose
+
+.byte 102,73,15,126,221
+ pextrq $1,%xmm3,%r14
+
+ pxor %xmm1,%xmm3
+
+
+.Lopen_sse_tail_16_extract:
+ pextrb $0,%xmm3,(%rdi)
+ psrldq $1,%xmm3
+ addq $1,%rdi
+ subq $1,%rbx
+ jne .Lopen_sse_tail_16_extract
+
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+.Lopen_sse_finalize:
+ addq 0+0+32(%rbp),%r10
+ adcq 8+0+32(%rbp),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+ movq %r10,%r13
+ movq %r11,%r14
+ movq %r12,%r15
+ subq $-5,%r10
+ sbbq $-1,%r11
+ sbbq $3,%r12
+ cmovcq %r13,%r10
+ cmovcq %r14,%r11
+ cmovcq %r15,%r12
+
+ addq 0+0+16(%rbp),%r10
+ adcq 8+0+16(%rbp),%r11
+
+.cfi_remember_state
+ addq $288 + 0 + 32,%rsp
+.cfi_adjust_cfa_offset -(288 + 32)
+
+ popq %r9
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r9
+ movq %r10,(%r9)
+ movq %r11,8(%r9)
+ popq %r15
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r15
+ popq %r14
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r14
+ popq %r13
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r13
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ popq %rbx
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbx
+ popq %rbp
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbp
+ ret
+
+.Lopen_sse_128:
+.cfi_restore_state
+ movdqu .Lchacha20_consts(%rip),%xmm0
+ movdqa %xmm0,%xmm1
+ movdqa %xmm0,%xmm2
+ movdqu 0(%r9),%xmm4
+ movdqa %xmm4,%xmm5
+ movdqa %xmm4,%xmm6
+ movdqu 16(%r9),%xmm8
+ movdqa %xmm8,%xmm9
+ movdqa %xmm8,%xmm10
+ movdqu 32(%r9),%xmm12
+ movdqa %xmm12,%xmm13
+ paddd .Lsse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm14
+ paddd .Lsse_inc(%rip),%xmm14
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm11
+ movdqa %xmm13,%xmm15
+ movq $10,%r10
+
+.Lopen_sse_128_rounds:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+
+ decq %r10
+ jnz .Lopen_sse_128_rounds
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd .Lchacha20_consts(%rip),%xmm1
+ paddd .Lchacha20_consts(%rip),%xmm2
+ paddd %xmm7,%xmm4
+ paddd %xmm7,%xmm5
+ paddd %xmm7,%xmm6
+ paddd %xmm11,%xmm9
+ paddd %xmm11,%xmm10
+ paddd %xmm15,%xmm13
+ paddd .Lsse_inc(%rip),%xmm15
+ paddd %xmm15,%xmm14
+
+ pand .Lclamp(%rip),%xmm0
+ movdqa %xmm0,0+0(%rbp)
+ movdqa %xmm4,0+16(%rbp)
+
+ movq %r8,%r8
+ call poly_hash_ad_internal
+.Lopen_sse_128_xor_hash:
+ cmpq $16,%rbx
+ jb .Lopen_sse_tail_16
+ subq $16,%rbx
+ addq 0+0(%rsi),%r10
+ adcq 8+0(%rsi),%r11
+ adcq $1,%r12
+
+
+ movdqu 0(%rsi),%xmm3
+ pxor %xmm3,%xmm1
+ movdqu %xmm1,0(%rdi)
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+ movdqa %xmm5,%xmm1
+ movdqa %xmm9,%xmm5
+ movdqa %xmm13,%xmm9
+ movdqa %xmm2,%xmm13
+ movdqa %xmm6,%xmm2
+ movdqa %xmm10,%xmm6
+ movdqa %xmm14,%xmm10
+ jmp .Lopen_sse_128_xor_hash
+.size chacha20_poly1305_open, .-chacha20_poly1305_open
+.cfi_endproc
+
+
+
+
+
+
+
+.globl chacha20_poly1305_seal
+.hidden chacha20_poly1305_seal
+.type chacha20_poly1305_seal,@function
+.align 64
+chacha20_poly1305_seal:
+.cfi_startproc
+_CET_ENDBR
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+
+
+ pushq %r9
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r9,-64
+ subq $288 + 0 + 32,%rsp
+.cfi_adjust_cfa_offset 288 + 32
+ leaq 32(%rsp),%rbp
+ andq $-32,%rbp
+
+ movq 56(%r9),%rbx
+ addq %rdx,%rbx
+ movq %r8,0+0+32(%rbp)
+ movq %rbx,8+0+32(%rbp)
+ movq %rdx,%rbx
+
+ movl OPENSSL_ia32cap_P+8(%rip),%eax
+ andl $288,%eax
+ xorl $288,%eax
+ jz chacha20_poly1305_seal_avx2
+
+ cmpq $128,%rbx
+ jbe .Lseal_sse_128
+
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqu 0(%r9),%xmm4
+ movdqu 16(%r9),%xmm8
+ movdqu 32(%r9),%xmm12
+
+ movdqa %xmm0,%xmm1
+ movdqa %xmm0,%xmm2
+ movdqa %xmm0,%xmm3
+ movdqa %xmm4,%xmm5
+ movdqa %xmm4,%xmm6
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm9
+ movdqa %xmm8,%xmm10
+ movdqa %xmm8,%xmm11
+ movdqa %xmm12,%xmm15
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,%xmm14
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,%xmm13
+ paddd .Lsse_inc(%rip),%xmm12
+
+ movdqa %xmm4,0+48(%rbp)
+ movdqa %xmm8,0+64(%rbp)
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+ movdqa %xmm14,0+128(%rbp)
+ movdqa %xmm15,0+144(%rbp)
+ movq $10,%r10
+.Lseal_sse_init_rounds:
+ movdqa %xmm8,0+80(%rbp)
+ movdqa .Lrol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa .Lrol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 0+80(%rbp),%xmm8
+.byte 102,15,58,15,255,4
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,12
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ movdqa %xmm8,0+80(%rbp)
+ movdqa .Lrol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa .Lrol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 0+80(%rbp),%xmm8
+.byte 102,15,58,15,255,12
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,4
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+
+ decq %r10
+ jnz .Lseal_sse_init_rounds
+ paddd .Lchacha20_consts(%rip),%xmm3
+ paddd 0+48(%rbp),%xmm7
+ paddd 0+64(%rbp),%xmm11
+ paddd 0+144(%rbp),%xmm15
+ paddd .Lchacha20_consts(%rip),%xmm2
+ paddd 0+48(%rbp),%xmm6
+ paddd 0+64(%rbp),%xmm10
+ paddd 0+128(%rbp),%xmm14
+ paddd .Lchacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+
+
+ pand .Lclamp(%rip),%xmm3
+ movdqa %xmm3,0+0(%rbp)
+ movdqa %xmm7,0+16(%rbp)
+
+ movq %r8,%r8
+ call poly_hash_ad_internal
+ movdqu 0 + 0(%rsi),%xmm3
+ movdqu 16 + 0(%rsi),%xmm7
+ movdqu 32 + 0(%rsi),%xmm11
+ movdqu 48 + 0(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 0(%rdi)
+ movdqu %xmm6,16 + 0(%rdi)
+ movdqu %xmm10,32 + 0(%rdi)
+ movdqu %xmm15,48 + 0(%rdi)
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 64(%rdi)
+ movdqu %xmm5,16 + 64(%rdi)
+ movdqu %xmm9,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+
+ cmpq $192,%rbx
+ ja .Lseal_sse_main_init
+ movq $128,%rcx
+ subq $128,%rbx
+ leaq 128(%rsi),%rsi
+ jmp .Lseal_sse_128_tail_hash
+.Lseal_sse_main_init:
+ movdqu 0 + 128(%rsi),%xmm3
+ movdqu 16 + 128(%rsi),%xmm7
+ movdqu 32 + 128(%rsi),%xmm11
+ movdqu 48 + 128(%rsi),%xmm15
+ pxor %xmm3,%xmm0
+ pxor %xmm7,%xmm4
+ pxor %xmm11,%xmm8
+ pxor %xmm12,%xmm15
+ movdqu %xmm0,0 + 128(%rdi)
+ movdqu %xmm4,16 + 128(%rdi)
+ movdqu %xmm8,32 + 128(%rdi)
+ movdqu %xmm15,48 + 128(%rdi)
+
+ movq $192,%rcx
+ subq $192,%rbx
+ leaq 192(%rsi),%rsi
+ movq $2,%rcx
+ movq $8,%r8
+ cmpq $64,%rbx
+ jbe .Lseal_sse_tail_64
+ cmpq $128,%rbx
+ jbe .Lseal_sse_tail_128
+ cmpq $192,%rbx
+ jbe .Lseal_sse_tail_192
+
+.Lseal_sse_main_loop:
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa %xmm0,%xmm2
+ movdqa %xmm4,%xmm6
+ movdqa %xmm8,%xmm10
+ movdqa %xmm0,%xmm3
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm11
+ movdqa 0+96(%rbp),%xmm15
+ paddd .Lsse_inc(%rip),%xmm15
+ movdqa %xmm15,%xmm14
+ paddd .Lsse_inc(%rip),%xmm14
+ movdqa %xmm14,%xmm13
+ paddd .Lsse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+ movdqa %xmm14,0+128(%rbp)
+ movdqa %xmm15,0+144(%rbp)
+
+.align 32
+.Lseal_sse_main_rounds:
+ movdqa %xmm8,0+80(%rbp)
+ movdqa .Lrol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movdqa .Lrol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 0+80(%rbp),%xmm8
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+.byte 102,15,58,15,255,4
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,12
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ movdqa %xmm8,0+80(%rbp)
+ movdqa .Lrol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa .Lrol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 0+80(%rbp),%xmm8
+.byte 102,15,58,15,255,12
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,4
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+
+ leaq 16(%rdi),%rdi
+ decq %r8
+ jge .Lseal_sse_main_rounds
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+ decq %rcx
+ jg .Lseal_sse_main_rounds
+ paddd .Lchacha20_consts(%rip),%xmm3
+ paddd 0+48(%rbp),%xmm7
+ paddd 0+64(%rbp),%xmm11
+ paddd 0+144(%rbp),%xmm15
+ paddd .Lchacha20_consts(%rip),%xmm2
+ paddd 0+48(%rbp),%xmm6
+ paddd 0+64(%rbp),%xmm10
+ paddd 0+128(%rbp),%xmm14
+ paddd .Lchacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+
+ movdqa %xmm14,0+80(%rbp)
+ movdqa %xmm14,0+80(%rbp)
+ movdqu 0 + 0(%rsi),%xmm14
+ pxor %xmm3,%xmm14
+ movdqu %xmm14,0 + 0(%rdi)
+ movdqu 16 + 0(%rsi),%xmm14
+ pxor %xmm7,%xmm14
+ movdqu %xmm14,16 + 0(%rdi)
+ movdqu 32 + 0(%rsi),%xmm14
+ pxor %xmm11,%xmm14
+ movdqu %xmm14,32 + 0(%rdi)
+ movdqu 48 + 0(%rsi),%xmm14
+ pxor %xmm15,%xmm14
+ movdqu %xmm14,48 + 0(%rdi)
+
+ movdqa 0+80(%rbp),%xmm14
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 64(%rdi)
+ movdqu %xmm6,16 + 64(%rdi)
+ movdqu %xmm10,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+ movdqu 0 + 128(%rsi),%xmm3
+ movdqu 16 + 128(%rsi),%xmm7
+ movdqu 32 + 128(%rsi),%xmm11
+ movdqu 48 + 128(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 128(%rdi)
+ movdqu %xmm5,16 + 128(%rdi)
+ movdqu %xmm9,32 + 128(%rdi)
+ movdqu %xmm15,48 + 128(%rdi)
+
+ cmpq $256,%rbx
+ ja .Lseal_sse_main_loop_xor
+
+ movq $192,%rcx
+ subq $192,%rbx
+ leaq 192(%rsi),%rsi
+ jmp .Lseal_sse_128_tail_hash
+.Lseal_sse_main_loop_xor:
+ movdqu 0 + 192(%rsi),%xmm3
+ movdqu 16 + 192(%rsi),%xmm7
+ movdqu 32 + 192(%rsi),%xmm11
+ movdqu 48 + 192(%rsi),%xmm15
+ pxor %xmm3,%xmm0
+ pxor %xmm7,%xmm4
+ pxor %xmm11,%xmm8
+ pxor %xmm12,%xmm15
+ movdqu %xmm0,0 + 192(%rdi)
+ movdqu %xmm4,16 + 192(%rdi)
+ movdqu %xmm8,32 + 192(%rdi)
+ movdqu %xmm15,48 + 192(%rdi)
+
+ leaq 256(%rsi),%rsi
+ subq $256,%rbx
+ movq $6,%rcx
+ movq $4,%r8
+ cmpq $192,%rbx
+ jg .Lseal_sse_main_loop
+ movq %rbx,%rcx
+ testq %rbx,%rbx
+ je .Lseal_sse_128_tail_hash
+ movq $6,%rcx
+ cmpq $128,%rbx
+ ja .Lseal_sse_tail_192
+ cmpq $64,%rbx
+ ja .Lseal_sse_tail_128
+
+.Lseal_sse_tail_64:
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa 0+96(%rbp),%xmm12
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+
+.Lseal_sse_tail_64_rounds_and_x2hash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+.Lseal_sse_tail_64_rounds_and_x1hash:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+ decq %rcx
+ jg .Lseal_sse_tail_64_rounds_and_x2hash
+ decq %r8
+ jge .Lseal_sse_tail_64_rounds_and_x1hash
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+
+ jmp .Lseal_sse_128_tail_xor
+
+.Lseal_sse_tail_128:
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa 0+96(%rbp),%xmm13
+ paddd .Lsse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+
+.Lseal_sse_tail_128_rounds_and_x2hash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+.Lseal_sse_tail_128_rounds_and_x1hash:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+
+ leaq 16(%rdi),%rdi
+ decq %rcx
+ jg .Lseal_sse_tail_128_rounds_and_x2hash
+ decq %r8
+ jge .Lseal_sse_tail_128_rounds_and_x1hash
+ paddd .Lchacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+ movdqu 0 + 0(%rsi),%xmm3
+ movdqu 16 + 0(%rsi),%xmm7
+ movdqu 32 + 0(%rsi),%xmm11
+ movdqu 48 + 0(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 0(%rdi)
+ movdqu %xmm5,16 + 0(%rdi)
+ movdqu %xmm9,32 + 0(%rdi)
+ movdqu %xmm15,48 + 0(%rdi)
+
+ movq $64,%rcx
+ subq $64,%rbx
+ leaq 64(%rsi),%rsi
+ jmp .Lseal_sse_128_tail_hash
+
+.Lseal_sse_tail_192:
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa %xmm0,%xmm2
+ movdqa %xmm4,%xmm6
+ movdqa %xmm8,%xmm10
+ movdqa 0+96(%rbp),%xmm14
+ paddd .Lsse_inc(%rip),%xmm14
+ movdqa %xmm14,%xmm13
+ paddd .Lsse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+ movdqa %xmm14,0+128(%rbp)
+
+.Lseal_sse_tail_192_rounds_and_x2hash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+.Lseal_sse_tail_192_rounds_and_x1hash:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+
+ leaq 16(%rdi),%rdi
+ decq %rcx
+ jg .Lseal_sse_tail_192_rounds_and_x2hash
+ decq %r8
+ jge .Lseal_sse_tail_192_rounds_and_x1hash
+ paddd .Lchacha20_consts(%rip),%xmm2
+ paddd 0+48(%rbp),%xmm6
+ paddd 0+64(%rbp),%xmm10
+ paddd 0+128(%rbp),%xmm14
+ paddd .Lchacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+ movdqu 0 + 0(%rsi),%xmm3
+ movdqu 16 + 0(%rsi),%xmm7
+ movdqu 32 + 0(%rsi),%xmm11
+ movdqu 48 + 0(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 0(%rdi)
+ movdqu %xmm6,16 + 0(%rdi)
+ movdqu %xmm10,32 + 0(%rdi)
+ movdqu %xmm15,48 + 0(%rdi)
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 64(%rdi)
+ movdqu %xmm5,16 + 64(%rdi)
+ movdqu %xmm9,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+
+ movq $128,%rcx
+ subq $128,%rbx
+ leaq 128(%rsi),%rsi
+
+.Lseal_sse_128_tail_hash:
+ cmpq $16,%rcx
+ jb .Lseal_sse_128_tail_xor
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ subq $16,%rcx
+ leaq 16(%rdi),%rdi
+ jmp .Lseal_sse_128_tail_hash
+
+.Lseal_sse_128_tail_xor:
+ cmpq $16,%rbx
+ jb .Lseal_sse_tail_16
+ subq $16,%rbx
+
+ movdqu 0(%rsi),%xmm3
+ pxor %xmm3,%xmm0
+ movdqu %xmm0,0(%rdi)
+
+ addq 0(%rdi),%r10
+ adcq 8(%rdi),%r11
+ adcq $1,%r12
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm8,%xmm4
+ movdqa %xmm12,%xmm8
+ movdqa %xmm1,%xmm12
+ movdqa %xmm5,%xmm1
+ movdqa %xmm9,%xmm5
+ movdqa %xmm13,%xmm9
+ jmp .Lseal_sse_128_tail_xor
+
+.Lseal_sse_tail_16:
+ testq %rbx,%rbx
+ jz .Lprocess_blocks_of_extra_in
+
+ movq %rbx,%r8
+ movq %rbx,%rcx
+ leaq -1(%rsi,%rbx,1),%rsi
+ pxor %xmm15,%xmm15
+.Lseal_sse_tail_16_compose:
+ pslldq $1,%xmm15
+ pinsrb $0,(%rsi),%xmm15
+ leaq -1(%rsi),%rsi
+ decq %rcx
+ jne .Lseal_sse_tail_16_compose
+
+
+ pxor %xmm0,%xmm15
+
+
+ movq %rbx,%rcx
+ movdqu %xmm15,%xmm0
+.Lseal_sse_tail_16_extract:
+ pextrb $0,%xmm0,(%rdi)
+ psrldq $1,%xmm0
+ addq $1,%rdi
+ subq $1,%rcx
+ jnz .Lseal_sse_tail_16_extract
+
+
+
+
+
+
+
+
+ movq 288 + 0 + 32(%rsp),%r9
+ movq 56(%r9),%r14
+ movq 48(%r9),%r13
+ testq %r14,%r14
+ jz .Lprocess_partial_block
+
+ movq $16,%r15
+ subq %rbx,%r15
+ cmpq %r15,%r14
+
+ jge .Lload_extra_in
+ movq %r14,%r15
+
+.Lload_extra_in:
+
+
+ leaq -1(%r13,%r15,1),%rsi
+
+
+ addq %r15,%r13
+ subq %r15,%r14
+ movq %r13,48(%r9)
+ movq %r14,56(%r9)
+
+
+
+ addq %r15,%r8
+
+
+ pxor %xmm11,%xmm11
+.Lload_extra_load_loop:
+ pslldq $1,%xmm11
+ pinsrb $0,(%rsi),%xmm11
+ leaq -1(%rsi),%rsi
+ subq $1,%r15
+ jnz .Lload_extra_load_loop
+
+
+
+
+ movq %rbx,%r15
+
+.Lload_extra_shift_loop:
+ pslldq $1,%xmm11
+ subq $1,%r15
+ jnz .Lload_extra_shift_loop
+
+
+
+
+ leaq .Land_masks(%rip),%r15
+ shlq $4,%rbx
+ pand -16(%r15,%rbx,1),%xmm15
+
+
+ por %xmm11,%xmm15
+
+
+
+.byte 102,77,15,126,253
+ pextrq $1,%xmm15,%r14
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+.Lprocess_blocks_of_extra_in:
+
+ movq 288+32+0 (%rsp),%r9
+ movq 48(%r9),%rsi
+ movq 56(%r9),%r8
+ movq %r8,%rcx
+ shrq $4,%r8
+
+.Lprocess_extra_hash_loop:
+ jz process_extra_in_trailer
+ addq 0+0(%rsi),%r10
+ adcq 8+0(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rsi),%rsi
+ subq $1,%r8
+ jmp .Lprocess_extra_hash_loop
+process_extra_in_trailer:
+ andq $15,%rcx
+ movq %rcx,%rbx
+ jz .Ldo_length_block
+ leaq -1(%rsi,%rcx,1),%rsi
+
+.Lprocess_extra_in_trailer_load:
+ pslldq $1,%xmm15
+ pinsrb $0,(%rsi),%xmm15
+ leaq -1(%rsi),%rsi
+ subq $1,%rcx
+ jnz .Lprocess_extra_in_trailer_load
+
+.Lprocess_partial_block:
+
+ leaq .Land_masks(%rip),%r15
+ shlq $4,%rbx
+ pand -16(%r15,%rbx,1),%xmm15
+.byte 102,77,15,126,253
+ pextrq $1,%xmm15,%r14
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+.Ldo_length_block:
+ addq 0+0+32(%rbp),%r10
+ adcq 8+0+32(%rbp),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+ movq %r10,%r13
+ movq %r11,%r14
+ movq %r12,%r15
+ subq $-5,%r10
+ sbbq $-1,%r11
+ sbbq $3,%r12
+ cmovcq %r13,%r10
+ cmovcq %r14,%r11
+ cmovcq %r15,%r12
+
+ addq 0+0+16(%rbp),%r10
+ adcq 8+0+16(%rbp),%r11
+
+.cfi_remember_state
+ addq $288 + 0 + 32,%rsp
+.cfi_adjust_cfa_offset -(288 + 32)
+
+ popq %r9
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r9
+ movq %r10,(%r9)
+ movq %r11,8(%r9)
+ popq %r15
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r15
+ popq %r14
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r14
+ popq %r13
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r13
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ popq %rbx
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbx
+ popq %rbp
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbp
+ ret
+
+.Lseal_sse_128:
+.cfi_restore_state
+ movdqu .Lchacha20_consts(%rip),%xmm0
+ movdqa %xmm0,%xmm1
+ movdqa %xmm0,%xmm2
+ movdqu 0(%r9),%xmm4
+ movdqa %xmm4,%xmm5
+ movdqa %xmm4,%xmm6
+ movdqu 16(%r9),%xmm8
+ movdqa %xmm8,%xmm9
+ movdqa %xmm8,%xmm10
+ movdqu 32(%r9),%xmm14
+ movdqa %xmm14,%xmm12
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,%xmm13
+ paddd .Lsse_inc(%rip),%xmm13
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm11
+ movdqa %xmm12,%xmm15
+ movq $10,%r10
+
+.Lseal_sse_128_rounds:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+
+ decq %r10
+ jnz .Lseal_sse_128_rounds
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd .Lchacha20_consts(%rip),%xmm1
+ paddd .Lchacha20_consts(%rip),%xmm2
+ paddd %xmm7,%xmm4
+ paddd %xmm7,%xmm5
+ paddd %xmm7,%xmm6
+ paddd %xmm11,%xmm8
+ paddd %xmm11,%xmm9
+ paddd %xmm15,%xmm12
+ paddd .Lsse_inc(%rip),%xmm15
+ paddd %xmm15,%xmm13
+
+ pand .Lclamp(%rip),%xmm2
+ movdqa %xmm2,0+0(%rbp)
+ movdqa %xmm6,0+16(%rbp)
+
+ movq %r8,%r8
+ call poly_hash_ad_internal
+ jmp .Lseal_sse_128_tail_xor
+.size chacha20_poly1305_seal, .-chacha20_poly1305_seal
+.cfi_endproc
+
+
+.type chacha20_poly1305_open_avx2,@function
+.align 64
+chacha20_poly1305_open_avx2:
+.cfi_startproc
+
+
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r9,-64
+.cfi_adjust_cfa_offset 288 + 32
+
+ vzeroupper
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vbroadcasti128 0(%r9),%ymm4
+ vbroadcasti128 16(%r9),%ymm8
+ vbroadcasti128 32(%r9),%ymm12
+ vpaddd .Lavx2_init(%rip),%ymm12,%ymm12
+ cmpq $192,%rbx
+ jbe .Lopen_avx2_192
+ cmpq $320,%rbx
+ jbe .Lopen_avx2_320
+
+ vmovdqa %ymm4,0+64(%rbp)
+ vmovdqa %ymm8,0+96(%rbp)
+ vmovdqa %ymm12,0+160(%rbp)
+ movq $10,%r10
+.Lopen_avx2_init_rounds:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+
+ decq %r10
+ jne .Lopen_avx2_init_rounds
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand .Lclamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0+0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+
+ movq %r8,%r8
+ call poly_hash_ad_internal
+
+ xorq %rcx,%rcx
+.Lopen_avx2_init_hash:
+ addq 0+0(%rsi,%rcx,1),%r10
+ adcq 8+0(%rsi,%rcx,1),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ addq $16,%rcx
+ cmpq $64,%rcx
+ jne .Lopen_avx2_init_hash
+
+ vpxor 0(%rsi),%ymm0,%ymm0
+ vpxor 32(%rsi),%ymm4,%ymm4
+
+ vmovdqu %ymm0,0(%rdi)
+ vmovdqu %ymm4,32(%rdi)
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+ subq $64,%rbx
+.Lopen_avx2_main_loop:
+
+ cmpq $512,%rbx
+ jb .Lopen_avx2_main_loop_done
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm15
+ vpaddd %ymm15,%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm15,0+256(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm12,0+160(%rbp)
+
+ xorq %rcx,%rcx
+.Lopen_avx2_main_loop_rounds:
+ addq 0+0(%rsi,%rcx,1),%r10
+ adcq 8+0(%rsi,%rcx,1),%r11
+ adcq $1,%r12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ addq 0+16(%rsi,%rcx,1),%r10
+ adcq 8+16(%rsi,%rcx,1),%r11
+ adcq $1,%r12
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ addq 0+32(%rsi,%rcx,1),%r10
+ adcq 8+32(%rsi,%rcx,1),%r11
+ adcq $1,%r12
+
+ leaq 48(%rcx),%rcx
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+ cmpq $60*8,%rcx
+ jne .Lopen_avx2_main_loop_rounds
+ vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 0+64(%rbp),%ymm7,%ymm7
+ vpaddd 0+96(%rbp),%ymm11,%ymm11
+ vpaddd 0+256(%rbp),%ymm15,%ymm15
+ vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+
+ vmovdqa %ymm0,0+128(%rbp)
+ addq 0+60*8(%rsi),%r10
+ adcq 8+60*8(%rsi),%r11
+ adcq $1,%r12
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
+ vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vpxor 0+0(%rsi),%ymm0,%ymm0
+ vpxor 32+0(%rsi),%ymm3,%ymm3
+ vpxor 64+0(%rsi),%ymm7,%ymm7
+ vpxor 96+0(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm0,0+0(%rdi)
+ vmovdqu %ymm3,32+0(%rdi)
+ vmovdqu %ymm7,64+0(%rdi)
+ vmovdqu %ymm11,96+0(%rdi)
+
+ vmovdqa 0+128(%rbp),%ymm0
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm2,%ymm2
+ vpxor 64+128(%rsi),%ymm6,%ymm6
+ vpxor 96+128(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm2,32+128(%rdi)
+ vmovdqu %ymm6,64+128(%rdi)
+ vmovdqu %ymm10,96+128(%rdi)
+ addq 0+60*8+16(%rsi),%r10
+ adcq 8+60*8+16(%rsi),%r11
+ adcq $1,%r12
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+256(%rsi),%ymm3,%ymm3
+ vpxor 32+256(%rsi),%ymm1,%ymm1
+ vpxor 64+256(%rsi),%ymm5,%ymm5
+ vpxor 96+256(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+256(%rdi)
+ vmovdqu %ymm1,32+256(%rdi)
+ vmovdqu %ymm5,64+256(%rdi)
+ vmovdqu %ymm9,96+256(%rdi)
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm4
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm8
+ vpxor 0+384(%rsi),%ymm3,%ymm3
+ vpxor 32+384(%rsi),%ymm0,%ymm0
+ vpxor 64+384(%rsi),%ymm4,%ymm4
+ vpxor 96+384(%rsi),%ymm8,%ymm8
+ vmovdqu %ymm3,0+384(%rdi)
+ vmovdqu %ymm0,32+384(%rdi)
+ vmovdqu %ymm4,64+384(%rdi)
+ vmovdqu %ymm8,96+384(%rdi)
+
+ leaq 512(%rsi),%rsi
+ leaq 512(%rdi),%rdi
+ subq $512,%rbx
+ jmp .Lopen_avx2_main_loop
+.Lopen_avx2_main_loop_done:
+ testq %rbx,%rbx
+ vzeroupper
+ je .Lopen_sse_finalize
+
+ cmpq $384,%rbx
+ ja .Lopen_avx2_tail_512
+ cmpq $256,%rbx
+ ja .Lopen_avx2_tail_384
+ cmpq $128,%rbx
+ ja .Lopen_avx2_tail_256
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+
+ xorq %r8,%r8
+ movq %rbx,%rcx
+ andq $-16,%rcx
+ testq %rcx,%rcx
+ je .Lopen_avx2_tail_128_rounds
+.Lopen_avx2_tail_128_rounds_and_x1hash:
+ addq 0+0(%rsi,%r8,1),%r10
+ adcq 8+0(%rsi,%r8,1),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+.Lopen_avx2_tail_128_rounds:
+ addq $16,%r8
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+
+ cmpq %rcx,%r8
+ jb .Lopen_avx2_tail_128_rounds_and_x1hash
+ cmpq $160,%r8
+ jne .Lopen_avx2_tail_128_rounds
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ jmp .Lopen_avx2_tail_128_xor
+
+.Lopen_avx2_tail_256:
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+
+ movq %rbx,0+128(%rbp)
+ movq %rbx,%rcx
+ subq $128,%rcx
+ shrq $4,%rcx
+ movq $10,%r8
+ cmpq $10,%rcx
+ cmovgq %r8,%rcx
+ movq %rsi,%rbx
+ xorq %r8,%r8
+.Lopen_avx2_tail_256_rounds_and_x1hash:
+ addq 0+0(%rbx),%r10
+ adcq 8+0(%rbx),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rbx),%rbx
+.Lopen_avx2_tail_256_rounds:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+
+ incq %r8
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+
+ cmpq %rcx,%r8
+ jb .Lopen_avx2_tail_256_rounds_and_x1hash
+ cmpq $10,%r8
+ jne .Lopen_avx2_tail_256_rounds
+ movq %rbx,%r8
+ subq %rsi,%rbx
+ movq %rbx,%rcx
+ movq 0+128(%rbp),%rbx
+.Lopen_avx2_tail_256_hash:
+ addq $16,%rcx
+ cmpq %rbx,%rcx
+ jg .Lopen_avx2_tail_256_done
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%r8),%r8
+ jmp .Lopen_avx2_tail_256_hash
+.Lopen_avx2_tail_256_done:
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+0(%rsi),%ymm3,%ymm3
+ vpxor 32+0(%rsi),%ymm1,%ymm1
+ vpxor 64+0(%rsi),%ymm5,%ymm5
+ vpxor 96+0(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+0(%rdi)
+ vmovdqu %ymm1,32+0(%rdi)
+ vmovdqu %ymm5,64+0(%rdi)
+ vmovdqu %ymm9,96+0(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ leaq 128(%rsi),%rsi
+ leaq 128(%rdi),%rdi
+ subq $128,%rbx
+ jmp .Lopen_avx2_tail_128_xor
+
+.Lopen_avx2_tail_384:
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+
+ movq %rbx,0+128(%rbp)
+ movq %rbx,%rcx
+ subq $256,%rcx
+ shrq $4,%rcx
+ addq $6,%rcx
+ movq $10,%r8
+ cmpq $10,%rcx
+ cmovgq %r8,%rcx
+ movq %rsi,%rbx
+ xorq %r8,%r8
+.Lopen_avx2_tail_384_rounds_and_x2hash:
+ addq 0+0(%rbx),%r10
+ adcq 8+0(%rbx),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rbx),%rbx
+.Lopen_avx2_tail_384_rounds_and_x1hash:
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ addq 0+0(%rbx),%r10
+ adcq 8+0(%rbx),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rbx),%rbx
+ incq %r8
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+
+ cmpq %rcx,%r8
+ jb .Lopen_avx2_tail_384_rounds_and_x2hash
+ cmpq $10,%r8
+ jne .Lopen_avx2_tail_384_rounds_and_x1hash
+ movq %rbx,%r8
+ subq %rsi,%rbx
+ movq %rbx,%rcx
+ movq 0+128(%rbp),%rbx
+.Lopen_avx2_384_tail_hash:
+ addq $16,%rcx
+ cmpq %rbx,%rcx
+ jg .Lopen_avx2_384_tail_done
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%r8),%r8
+ jmp .Lopen_avx2_384_tail_hash
+.Lopen_avx2_384_tail_done:
+ vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+0(%rsi),%ymm3,%ymm3
+ vpxor 32+0(%rsi),%ymm2,%ymm2
+ vpxor 64+0(%rsi),%ymm6,%ymm6
+ vpxor 96+0(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+0(%rdi)
+ vmovdqu %ymm2,32+0(%rdi)
+ vmovdqu %ymm6,64+0(%rdi)
+ vmovdqu %ymm10,96+0(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm1,%ymm1
+ vpxor 64+128(%rsi),%ymm5,%ymm5
+ vpxor 96+128(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm1,32+128(%rdi)
+ vmovdqu %ymm5,64+128(%rdi)
+ vmovdqu %ymm9,96+128(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ leaq 256(%rsi),%rsi
+ leaq 256(%rdi),%rdi
+ subq $256,%rbx
+ jmp .Lopen_avx2_tail_128_xor
+
+.Lopen_avx2_tail_512:
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm15
+ vpaddd %ymm15,%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm15,0+256(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm12,0+160(%rbp)
+
+ xorq %rcx,%rcx
+ movq %rsi,%r8
+.Lopen_avx2_tail_512_rounds_and_x2hash:
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%r8),%r8
+.Lopen_avx2_tail_512_rounds_and_x1hash:
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ addq 0+16(%r8),%r10
+ adcq 8+16(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%r8),%r8
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+ incq %rcx
+ cmpq $4,%rcx
+ jl .Lopen_avx2_tail_512_rounds_and_x2hash
+ cmpq $10,%rcx
+ jne .Lopen_avx2_tail_512_rounds_and_x1hash
+ movq %rbx,%rcx
+ subq $384,%rcx
+ andq $-16,%rcx
+.Lopen_avx2_tail_512_hash:
+ testq %rcx,%rcx
+ je .Lopen_avx2_tail_512_done
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%r8),%r8
+ subq $16,%rcx
+ jmp .Lopen_avx2_tail_512_hash
+.Lopen_avx2_tail_512_done:
+ vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 0+64(%rbp),%ymm7,%ymm7
+ vpaddd 0+96(%rbp),%ymm11,%ymm11
+ vpaddd 0+256(%rbp),%ymm15,%ymm15
+ vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+
+ vmovdqa %ymm0,0+128(%rbp)
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
+ vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vpxor 0+0(%rsi),%ymm0,%ymm0
+ vpxor 32+0(%rsi),%ymm3,%ymm3
+ vpxor 64+0(%rsi),%ymm7,%ymm7
+ vpxor 96+0(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm0,0+0(%rdi)
+ vmovdqu %ymm3,32+0(%rdi)
+ vmovdqu %ymm7,64+0(%rdi)
+ vmovdqu %ymm11,96+0(%rdi)
+
+ vmovdqa 0+128(%rbp),%ymm0
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm2,%ymm2
+ vpxor 64+128(%rsi),%ymm6,%ymm6
+ vpxor 96+128(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm2,32+128(%rdi)
+ vmovdqu %ymm6,64+128(%rdi)
+ vmovdqu %ymm10,96+128(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+256(%rsi),%ymm3,%ymm3
+ vpxor 32+256(%rsi),%ymm1,%ymm1
+ vpxor 64+256(%rsi),%ymm5,%ymm5
+ vpxor 96+256(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+256(%rdi)
+ vmovdqu %ymm1,32+256(%rdi)
+ vmovdqu %ymm5,64+256(%rdi)
+ vmovdqu %ymm9,96+256(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ leaq 384(%rsi),%rsi
+ leaq 384(%rdi),%rdi
+ subq $384,%rbx
+.Lopen_avx2_tail_128_xor:
+ cmpq $32,%rbx
+ jb .Lopen_avx2_tail_32_xor
+ subq $32,%rbx
+ vpxor (%rsi),%ymm0,%ymm0
+ vmovdqu %ymm0,(%rdi)
+ leaq 32(%rsi),%rsi
+ leaq 32(%rdi),%rdi
+ vmovdqa %ymm4,%ymm0
+ vmovdqa %ymm8,%ymm4
+ vmovdqa %ymm12,%ymm8
+ jmp .Lopen_avx2_tail_128_xor
+.Lopen_avx2_tail_32_xor:
+ cmpq $16,%rbx
+ vmovdqa %xmm0,%xmm1
+ jb .Lopen_avx2_exit
+ subq $16,%rbx
+
+ vpxor (%rsi),%xmm0,%xmm1
+ vmovdqu %xmm1,(%rdi)
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+ vperm2i128 $0x11,%ymm0,%ymm0,%ymm0
+ vmovdqa %xmm0,%xmm1
+.Lopen_avx2_exit:
+ vzeroupper
+ jmp .Lopen_sse_tail_16
+
+.Lopen_avx2_192:
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13
+ vmovdqa %ymm12,%ymm11
+ vmovdqa %ymm13,%ymm15
+ movq $10,%r10
+.Lopen_avx2_192_rounds:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+
+ decq %r10
+ jne .Lopen_avx2_192_rounds
+ vpaddd %ymm2,%ymm0,%ymm0
+ vpaddd %ymm2,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpaddd %ymm6,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm8,%ymm8
+ vpaddd %ymm10,%ymm9,%ymm9
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm13,%ymm13
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand .Lclamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0+0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
+.Lopen_avx2_short:
+ movq %r8,%r8
+ call poly_hash_ad_internal
+.Lopen_avx2_short_hash_and_xor_loop:
+ cmpq $32,%rbx
+ jb .Lopen_avx2_short_tail_32
+ subq $32,%rbx
+ addq 0+0(%rsi),%r10
+ adcq 8+0(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ addq 0+16(%rsi),%r10
+ adcq 8+16(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+ vpxor (%rsi),%ymm0,%ymm0
+ vmovdqu %ymm0,(%rdi)
+ leaq 32(%rsi),%rsi
+ leaq 32(%rdi),%rdi
+
+ vmovdqa %ymm4,%ymm0
+ vmovdqa %ymm8,%ymm4
+ vmovdqa %ymm12,%ymm8
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm5,%ymm1
+ vmovdqa %ymm9,%ymm5
+ vmovdqa %ymm13,%ymm9
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm6,%ymm2
+ jmp .Lopen_avx2_short_hash_and_xor_loop
+.Lopen_avx2_short_tail_32:
+ cmpq $16,%rbx
+ vmovdqa %xmm0,%xmm1
+ jb .Lopen_avx2_short_tail_32_exit
+ subq $16,%rbx
+ addq 0+0(%rsi),%r10
+ adcq 8+0(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ vpxor (%rsi),%xmm0,%xmm3
+ vmovdqu %xmm3,(%rdi)
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+ vextracti128 $1,%ymm0,%xmm1
+.Lopen_avx2_short_tail_32_exit:
+ vzeroupper
+ jmp .Lopen_sse_tail_16
+
+.Lopen_avx2_320:
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13
+ vpaddd .Lavx2_inc(%rip),%ymm13,%ymm14
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ movq $10,%r10
+.Lopen_avx2_320_rounds:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+
+ decq %r10
+ jne .Lopen_avx2_320_rounds
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpaddd %ymm11,%ymm8,%ymm8
+ vpaddd %ymm11,%ymm9,%ymm9
+ vpaddd %ymm11,%ymm10,%ymm10
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand .Lclamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0+0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm9
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm13
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm6
+ jmp .Lopen_avx2_short
+.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
+.cfi_endproc
+
+
+.type chacha20_poly1305_seal_avx2,@function
+.align 64
+chacha20_poly1305_seal_avx2:
+.cfi_startproc
+
+
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r9,-64
+.cfi_adjust_cfa_offset 288 + 32
+
+ vzeroupper
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vbroadcasti128 0(%r9),%ymm4
+ vbroadcasti128 16(%r9),%ymm8
+ vbroadcasti128 32(%r9),%ymm12
+ vpaddd .Lavx2_init(%rip),%ymm12,%ymm12
+ cmpq $192,%rbx
+ jbe .Lseal_avx2_192
+ cmpq $320,%rbx
+ jbe .Lseal_avx2_320
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm4,0+64(%rbp)
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm8,%ymm11
+ vmovdqa %ymm8,0+96(%rbp)
+ vmovdqa %ymm12,%ymm15
+ vpaddd .Lavx2_inc(%rip),%ymm15,%ymm14
+ vpaddd .Lavx2_inc(%rip),%ymm14,%ymm13
+ vpaddd .Lavx2_inc(%rip),%ymm13,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ vmovdqa %ymm15,0+256(%rbp)
+ movq $10,%r10
+.Lseal_avx2_init_rounds:
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+ decq %r10
+ jnz .Lseal_avx2_init_rounds
+ vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 0+64(%rbp),%ymm7,%ymm7
+ vpaddd 0+96(%rbp),%ymm11,%ymm11
+ vpaddd 0+256(%rbp),%ymm15,%ymm15
+ vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm15
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm3
+ vpand .Lclamp(%rip),%ymm15,%ymm15
+ vmovdqa %ymm15,0+0(%rbp)
+ movq %r8,%r8
+ call poly_hash_ad_internal
+
+ vpxor 0(%rsi),%ymm3,%ymm3
+ vpxor 32(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm3,0(%rdi)
+ vmovdqu %ymm11,32(%rdi)
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm15
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+64(%rsi),%ymm15,%ymm15
+ vpxor 32+64(%rsi),%ymm2,%ymm2
+ vpxor 64+64(%rsi),%ymm6,%ymm6
+ vpxor 96+64(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm15,0+64(%rdi)
+ vmovdqu %ymm2,32+64(%rdi)
+ vmovdqu %ymm6,64+64(%rdi)
+ vmovdqu %ymm10,96+64(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm15
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+192(%rsi),%ymm15,%ymm15
+ vpxor 32+192(%rsi),%ymm1,%ymm1
+ vpxor 64+192(%rsi),%ymm5,%ymm5
+ vpxor 96+192(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm15,0+192(%rdi)
+ vmovdqu %ymm1,32+192(%rdi)
+ vmovdqu %ymm5,64+192(%rdi)
+ vmovdqu %ymm9,96+192(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm15
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm15,%ymm8
+
+ leaq 320(%rsi),%rsi
+ subq $320,%rbx
+ movq $320,%rcx
+ cmpq $128,%rbx
+ jbe .Lseal_avx2_short_hash_remainder
+ vpxor 0(%rsi),%ymm0,%ymm0
+ vpxor 32(%rsi),%ymm4,%ymm4
+ vpxor 64(%rsi),%ymm8,%ymm8
+ vpxor 96(%rsi),%ymm12,%ymm12
+ vmovdqu %ymm0,320(%rdi)
+ vmovdqu %ymm4,352(%rdi)
+ vmovdqu %ymm8,384(%rdi)
+ vmovdqu %ymm12,416(%rdi)
+ leaq 128(%rsi),%rsi
+ subq $128,%rbx
+ movq $8,%rcx
+ movq $2,%r8
+ cmpq $128,%rbx
+ jbe .Lseal_avx2_tail_128
+ cmpq $256,%rbx
+ jbe .Lseal_avx2_tail_256
+ cmpq $384,%rbx
+ jbe .Lseal_avx2_tail_384
+ cmpq $512,%rbx
+ jbe .Lseal_avx2_tail_512
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm15
+ vpaddd %ymm15,%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm15,0+256(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+
+ subq $16,%rdi
+ movq $9,%rcx
+ jmp .Lseal_avx2_main_loop_rounds_entry
+.align 32
+.Lseal_avx2_main_loop:
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm15
+ vpaddd %ymm15,%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm15,0+256(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm12,0+160(%rbp)
+
+ movq $10,%rcx
+.align 32
+.Lseal_avx2_main_loop_rounds:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+.Lseal_avx2_main_loop_rounds_entry:
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ addq 0+32(%rdi),%r10
+ adcq 8+32(%rdi),%r11
+ adcq $1,%r12
+
+ leaq 48(%rdi),%rdi
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+ decq %rcx
+ jne .Lseal_avx2_main_loop_rounds
+ vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 0+64(%rbp),%ymm7,%ymm7
+ vpaddd 0+96(%rbp),%ymm11,%ymm11
+ vpaddd 0+256(%rbp),%ymm15,%ymm15
+ vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+
+ vmovdqa %ymm0,0+128(%rbp)
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
+ vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vpxor 0+0(%rsi),%ymm0,%ymm0
+ vpxor 32+0(%rsi),%ymm3,%ymm3
+ vpxor 64+0(%rsi),%ymm7,%ymm7
+ vpxor 96+0(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm0,0+0(%rdi)
+ vmovdqu %ymm3,32+0(%rdi)
+ vmovdqu %ymm7,64+0(%rdi)
+ vmovdqu %ymm11,96+0(%rdi)
+
+ vmovdqa 0+128(%rbp),%ymm0
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm2,%ymm2
+ vpxor 64+128(%rsi),%ymm6,%ymm6
+ vpxor 96+128(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm2,32+128(%rdi)
+ vmovdqu %ymm6,64+128(%rdi)
+ vmovdqu %ymm10,96+128(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+256(%rsi),%ymm3,%ymm3
+ vpxor 32+256(%rsi),%ymm1,%ymm1
+ vpxor 64+256(%rsi),%ymm5,%ymm5
+ vpxor 96+256(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+256(%rdi)
+ vmovdqu %ymm1,32+256(%rdi)
+ vmovdqu %ymm5,64+256(%rdi)
+ vmovdqu %ymm9,96+256(%rdi)
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm4
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm8
+ vpxor 0+384(%rsi),%ymm3,%ymm3
+ vpxor 32+384(%rsi),%ymm0,%ymm0
+ vpxor 64+384(%rsi),%ymm4,%ymm4
+ vpxor 96+384(%rsi),%ymm8,%ymm8
+ vmovdqu %ymm3,0+384(%rdi)
+ vmovdqu %ymm0,32+384(%rdi)
+ vmovdqu %ymm4,64+384(%rdi)
+ vmovdqu %ymm8,96+384(%rdi)
+
+ leaq 512(%rsi),%rsi
+ subq $512,%rbx
+ cmpq $512,%rbx
+ jg .Lseal_avx2_main_loop
+
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+ movq $10,%rcx
+ xorq %r8,%r8
+
+ cmpq $384,%rbx
+ ja .Lseal_avx2_tail_512
+ cmpq $256,%rbx
+ ja .Lseal_avx2_tail_384
+ cmpq $128,%rbx
+ ja .Lseal_avx2_tail_256
+
+.Lseal_avx2_tail_128:
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+
+.Lseal_avx2_tail_128_rounds_and_3xhash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+.Lseal_avx2_tail_128_rounds_and_2xhash:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+ decq %rcx
+ jg .Lseal_avx2_tail_128_rounds_and_3xhash
+ decq %r8
+ jge .Lseal_avx2_tail_128_rounds_and_2xhash
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ jmp .Lseal_avx2_short_loop
+
+.Lseal_avx2_tail_256:
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+
+.Lseal_avx2_tail_256_rounds_and_3xhash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+.Lseal_avx2_tail_256_rounds_and_2xhash:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+ decq %rcx
+ jg .Lseal_avx2_tail_256_rounds_and_3xhash
+ decq %r8
+ jge .Lseal_avx2_tail_256_rounds_and_2xhash
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+0(%rsi),%ymm3,%ymm3
+ vpxor 32+0(%rsi),%ymm1,%ymm1
+ vpxor 64+0(%rsi),%ymm5,%ymm5
+ vpxor 96+0(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+0(%rdi)
+ vmovdqu %ymm1,32+0(%rdi)
+ vmovdqu %ymm5,64+0(%rdi)
+ vmovdqu %ymm9,96+0(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ movq $128,%rcx
+ leaq 128(%rsi),%rsi
+ subq $128,%rbx
+ jmp .Lseal_avx2_short_hash_remainder
+
+.Lseal_avx2_tail_384:
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+
+.Lseal_avx2_tail_384_rounds_and_3xhash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+.Lseal_avx2_tail_384_rounds_and_2xhash:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+
+ leaq 32(%rdi),%rdi
+ decq %rcx
+ jg .Lseal_avx2_tail_384_rounds_and_3xhash
+ decq %r8
+ jge .Lseal_avx2_tail_384_rounds_and_2xhash
+ vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+0(%rsi),%ymm3,%ymm3
+ vpxor 32+0(%rsi),%ymm2,%ymm2
+ vpxor 64+0(%rsi),%ymm6,%ymm6
+ vpxor 96+0(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+0(%rdi)
+ vmovdqu %ymm2,32+0(%rdi)
+ vmovdqu %ymm6,64+0(%rdi)
+ vmovdqu %ymm10,96+0(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm1,%ymm1
+ vpxor 64+128(%rsi),%ymm5,%ymm5
+ vpxor 96+128(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm1,32+128(%rdi)
+ vmovdqu %ymm5,64+128(%rdi)
+ vmovdqu %ymm9,96+128(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ movq $256,%rcx
+ leaq 256(%rsi),%rsi
+ subq $256,%rbx
+ jmp .Lseal_avx2_short_hash_remainder
+
+.Lseal_avx2_tail_512:
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm15
+ vpaddd %ymm15,%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm15,0+256(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm12,0+160(%rbp)
+
+.Lseal_avx2_tail_512_rounds_and_3xhash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+.Lseal_avx2_tail_512_rounds_and_2xhash:
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ addq %rax,%r15
+ adcq %rdx,%r9
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+ decq %rcx
+ jg .Lseal_avx2_tail_512_rounds_and_3xhash
+ decq %r8
+ jge .Lseal_avx2_tail_512_rounds_and_2xhash
+ vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 0+64(%rbp),%ymm7,%ymm7
+ vpaddd 0+96(%rbp),%ymm11,%ymm11
+ vpaddd 0+256(%rbp),%ymm15,%ymm15
+ vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+
+ vmovdqa %ymm0,0+128(%rbp)
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
+ vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vpxor 0+0(%rsi),%ymm0,%ymm0
+ vpxor 32+0(%rsi),%ymm3,%ymm3
+ vpxor 64+0(%rsi),%ymm7,%ymm7
+ vpxor 96+0(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm0,0+0(%rdi)
+ vmovdqu %ymm3,32+0(%rdi)
+ vmovdqu %ymm7,64+0(%rdi)
+ vmovdqu %ymm11,96+0(%rdi)
+
+ vmovdqa 0+128(%rbp),%ymm0
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm2,%ymm2
+ vpxor 64+128(%rsi),%ymm6,%ymm6
+ vpxor 96+128(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm2,32+128(%rdi)
+ vmovdqu %ymm6,64+128(%rdi)
+ vmovdqu %ymm10,96+128(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+256(%rsi),%ymm3,%ymm3
+ vpxor 32+256(%rsi),%ymm1,%ymm1
+ vpxor 64+256(%rsi),%ymm5,%ymm5
+ vpxor 96+256(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+256(%rdi)
+ vmovdqu %ymm1,32+256(%rdi)
+ vmovdqu %ymm5,64+256(%rdi)
+ vmovdqu %ymm9,96+256(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ movq $384,%rcx
+ leaq 384(%rsi),%rsi
+ subq $384,%rbx
+ jmp .Lseal_avx2_short_hash_remainder
+
+.Lseal_avx2_320:
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13
+ vpaddd .Lavx2_inc(%rip),%ymm13,%ymm14
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ movq $10,%r10
+.Lseal_avx2_320_rounds:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+
+ decq %r10
+ jne .Lseal_avx2_320_rounds
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpaddd %ymm11,%ymm8,%ymm8
+ vpaddd %ymm11,%ymm9,%ymm9
+ vpaddd %ymm11,%ymm10,%ymm10
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand .Lclamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0+0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm9
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm13
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm6
+ jmp .Lseal_avx2_short
+
+.Lseal_avx2_192:
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13
+ vmovdqa %ymm12,%ymm11
+ vmovdqa %ymm13,%ymm15
+ movq $10,%r10
+.Lseal_avx2_192_rounds:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+
+ decq %r10
+ jne .Lseal_avx2_192_rounds
+ vpaddd %ymm2,%ymm0,%ymm0
+ vpaddd %ymm2,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpaddd %ymm6,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm8,%ymm8
+ vpaddd %ymm10,%ymm9,%ymm9
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm13,%ymm13
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand .Lclamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0+0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
+.Lseal_avx2_short:
+ movq %r8,%r8
+ call poly_hash_ad_internal
+ xorq %rcx,%rcx
+.Lseal_avx2_short_hash_remainder:
+ cmpq $16,%rcx
+ jb .Lseal_avx2_short_loop
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ subq $16,%rcx
+ addq $16,%rdi
+ jmp .Lseal_avx2_short_hash_remainder
+.Lseal_avx2_short_loop:
+ cmpq $32,%rbx
+ jb .Lseal_avx2_short_tail
+ subq $32,%rbx
+
+ vpxor (%rsi),%ymm0,%ymm0
+ vmovdqu %ymm0,(%rdi)
+ leaq 32(%rsi),%rsi
+
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+
+ vmovdqa %ymm4,%ymm0
+ vmovdqa %ymm8,%ymm4
+ vmovdqa %ymm12,%ymm8
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm5,%ymm1
+ vmovdqa %ymm9,%ymm5
+ vmovdqa %ymm13,%ymm9
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm6,%ymm2
+ jmp .Lseal_avx2_short_loop
+.Lseal_avx2_short_tail:
+ cmpq $16,%rbx
+ jb .Lseal_avx2_exit
+ subq $16,%rbx
+ vpxor (%rsi),%xmm0,%xmm3
+ vmovdqu %xmm3,(%rdi)
+ leaq 16(%rsi),%rsi
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+ vextracti128 $1,%ymm0,%xmm0
+.Lseal_avx2_exit:
+ vzeroupper
+ jmp .Lseal_sse_tail_16
+.cfi_endproc
+.size chacha20_poly1305_seal_avx2, .-chacha20_poly1305_seal_avx2
+#endif
diff --git a/gen/crypto/chacha20_poly1305_x86_64-win.asm b/gen/crypto/chacha20_poly1305_x86_64-win.asm
new file mode 100644
index 0000000..095689c
--- /dev/null
+++ b/gen/crypto/chacha20_poly1305_x86_64-win.asm
@@ -0,0 +1,8957 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .text code align=64
+
+EXTERN OPENSSL_ia32cap_P
+
+chacha20_poly1305_constants:
+
+section .rdata rdata align=8
+ALIGN 64
+$L$chacha20_consts:
+ DB 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+ DB 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+$L$rol8:
+ DB 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+ DB 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+$L$rol16:
+ DB 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+ DB 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+$L$avx2_init:
+ DD 0,0,0,0
+$L$sse_inc:
+ DD 1,0,0,0
+$L$avx2_inc:
+ DD 2,0,0,0,2,0,0,0
+$L$clamp:
+ DQ 0x0FFFFFFC0FFFFFFF,0x0FFFFFFC0FFFFFFC
+ DQ 0xFFFFFFFFFFFFFFFF,0xFFFFFFFFFFFFFFFF
+ALIGN 16
+$L$and_masks:
+ DB 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ DB 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ DB 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ DB 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ DB 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ DB 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
+ DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
+ DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+ DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
+ DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
+ DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
+ DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+section .text
+
+
+
+ALIGN 64
+poly_hash_ad_internal:
+
+
+ xor r10,r10
+ xor r11,r11
+ xor r12,r12
+ cmp r8,13
+ jne NEAR $L$hash_ad_loop
+$L$poly_fast_tls_ad:
+
+ mov r10,QWORD[rcx]
+ mov r11,QWORD[5+rcx]
+ shr r11,24
+ mov r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ ret
+$L$hash_ad_loop:
+
+ cmp r8,16
+ jb NEAR $L$hash_ad_tail
+ add r10,QWORD[((0+0))+rcx]
+ adc r11,QWORD[((8+0))+rcx]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rcx,[16+rcx]
+ sub r8,16
+ jmp NEAR $L$hash_ad_loop
+$L$hash_ad_tail:
+ cmp r8,0
+ je NEAR $L$hash_ad_done
+
+ xor r13,r13
+ xor r14,r14
+ xor r15,r15
+ add rcx,r8
+$L$hash_ad_tail_loop:
+ shld r14,r13,8
+ shl r13,8
+ movzx r15,BYTE[((-1))+rcx]
+ xor r13,r15
+ dec rcx
+ dec r8
+ jne NEAR $L$hash_ad_tail_loop
+
+ add r10,r13
+ adc r11,r14
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+
+$L$hash_ad_done:
+ ret
+
+
+
+global chacha20_poly1305_open
+
+ALIGN 64
+chacha20_poly1305_open:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_chacha20_poly1305_open:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+ mov r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+ push rbp
+
+ push rbx
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+
+
+ push r9
+
+ sub rsp,288 + 160 + 32
+
+
+ lea rbp,[32+rsp]
+ and rbp,-32
+
+ movaps XMMWORD[(0+0)+rbp],xmm6
+ movaps XMMWORD[(16+0)+rbp],xmm7
+ movaps XMMWORD[(32+0)+rbp],xmm8
+ movaps XMMWORD[(48+0)+rbp],xmm9
+ movaps XMMWORD[(64+0)+rbp],xmm10
+ movaps XMMWORD[(80+0)+rbp],xmm11
+ movaps XMMWORD[(96+0)+rbp],xmm12
+ movaps XMMWORD[(112+0)+rbp],xmm13
+ movaps XMMWORD[(128+0)+rbp],xmm14
+ movaps XMMWORD[(144+0)+rbp],xmm15
+
+ mov rbx,rdx
+ mov QWORD[((0+160+32))+rbp],r8
+ mov QWORD[((8+160+32))+rbp],rbx
+
+ mov eax,DWORD[((OPENSSL_ia32cap_P+8))]
+ and eax,288
+ xor eax,288
+ jz NEAR chacha20_poly1305_open_avx2
+
+ cmp rbx,128
+ jbe NEAR $L$open_sse_128
+
+ movdqa xmm0,XMMWORD[$L$chacha20_consts]
+ movdqu xmm4,XMMWORD[r9]
+ movdqu xmm8,XMMWORD[16+r9]
+ movdqu xmm12,XMMWORD[32+r9]
+
+ movdqa xmm7,xmm12
+
+ movdqa XMMWORD[(160+48)+rbp],xmm4
+ movdqa XMMWORD[(160+64)+rbp],xmm8
+ movdqa XMMWORD[(160+96)+rbp],xmm12
+ mov r10,10
+$L$open_sse_init_rounds:
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol16]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,12
+ psrld xmm4,20
+ pxor xmm4,xmm3
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol8]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,7
+ psrld xmm4,25
+ pxor xmm4,xmm3
+DB 102,15,58,15,228,4
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,12
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol16]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,12
+ psrld xmm4,20
+ pxor xmm4,xmm3
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol8]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,7
+ psrld xmm4,25
+ pxor xmm4,xmm3
+DB 102,15,58,15,228,12
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,4
+
+ dec r10
+ jne NEAR $L$open_sse_init_rounds
+
+ paddd xmm0,XMMWORD[$L$chacha20_consts]
+ paddd xmm4,XMMWORD[((160+48))+rbp]
+
+ pand xmm0,XMMWORD[$L$clamp]
+ movdqa XMMWORD[(160+0)+rbp],xmm0
+ movdqa XMMWORD[(160+16)+rbp],xmm4
+
+ mov r8,r8
+ call poly_hash_ad_internal
+$L$open_sse_main_loop:
+ cmp rbx,16*16
+ jb NEAR $L$open_sse_tail
+
+ movdqa xmm0,XMMWORD[$L$chacha20_consts]
+ movdqa xmm4,XMMWORD[((160+48))+rbp]
+ movdqa xmm8,XMMWORD[((160+64))+rbp]
+ movdqa xmm1,xmm0
+ movdqa xmm5,xmm4
+ movdqa xmm9,xmm8
+ movdqa xmm2,xmm0
+ movdqa xmm6,xmm4
+ movdqa xmm10,xmm8
+ movdqa xmm3,xmm0
+ movdqa xmm7,xmm4
+ movdqa xmm11,xmm8
+ movdqa xmm15,XMMWORD[((160+96))+rbp]
+ paddd xmm15,XMMWORD[$L$sse_inc]
+ movdqa xmm14,xmm15
+ paddd xmm14,XMMWORD[$L$sse_inc]
+ movdqa xmm13,xmm14
+ paddd xmm13,XMMWORD[$L$sse_inc]
+ movdqa xmm12,xmm13
+ paddd xmm12,XMMWORD[$L$sse_inc]
+ movdqa XMMWORD[(160+96)+rbp],xmm12
+ movdqa XMMWORD[(160+112)+rbp],xmm13
+ movdqa XMMWORD[(160+128)+rbp],xmm14
+ movdqa XMMWORD[(160+144)+rbp],xmm15
+
+
+
+ mov rcx,4
+ mov r8,rsi
+$L$open_sse_main_loop_rounds:
+ movdqa XMMWORD[(160+80)+rbp],xmm8
+ movdqa xmm8,XMMWORD[$L$rol16]
+ paddd xmm3,xmm7
+ paddd xmm2,xmm6
+ paddd xmm1,xmm5
+ paddd xmm0,xmm4
+ pxor xmm15,xmm3
+ pxor xmm14,xmm2
+ pxor xmm13,xmm1
+ pxor xmm12,xmm0
+DB 102,69,15,56,0,248
+DB 102,69,15,56,0,240
+DB 102,69,15,56,0,232
+DB 102,69,15,56,0,224
+ movdqa xmm8,XMMWORD[((160+80))+rbp]
+ paddd xmm11,xmm15
+ paddd xmm10,xmm14
+ paddd xmm9,xmm13
+ paddd xmm8,xmm12
+ pxor xmm7,xmm11
+ add r10,QWORD[((0+0))+r8]
+ adc r11,QWORD[((8+0))+r8]
+ adc r12,1
+
+ lea r8,[16+r8]
+ pxor xmm6,xmm10
+ pxor xmm5,xmm9
+ pxor xmm4,xmm8
+ movdqa XMMWORD[(160+80)+rbp],xmm8
+ movdqa xmm8,xmm7
+ psrld xmm8,20
+ pslld xmm7,32-20
+ pxor xmm7,xmm8
+ movdqa xmm8,xmm6
+ psrld xmm8,20
+ pslld xmm6,32-20
+ pxor xmm6,xmm8
+ movdqa xmm8,xmm5
+ psrld xmm8,20
+ pslld xmm5,32-20
+ pxor xmm5,xmm8
+ movdqa xmm8,xmm4
+ psrld xmm8,20
+ pslld xmm4,32-20
+ pxor xmm4,xmm8
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ movdqa xmm8,XMMWORD[$L$rol8]
+ paddd xmm3,xmm7
+ paddd xmm2,xmm6
+ paddd xmm1,xmm5
+ paddd xmm0,xmm4
+ pxor xmm15,xmm3
+ pxor xmm14,xmm2
+ pxor xmm13,xmm1
+ pxor xmm12,xmm0
+DB 102,69,15,56,0,248
+DB 102,69,15,56,0,240
+DB 102,69,15,56,0,232
+DB 102,69,15,56,0,224
+ movdqa xmm8,XMMWORD[((160+80))+rbp]
+ paddd xmm11,xmm15
+ paddd xmm10,xmm14
+ paddd xmm9,xmm13
+ paddd xmm8,xmm12
+ pxor xmm7,xmm11
+ pxor xmm6,xmm10
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ pxor xmm5,xmm9
+ pxor xmm4,xmm8
+ movdqa XMMWORD[(160+80)+rbp],xmm8
+ movdqa xmm8,xmm7
+ psrld xmm8,25
+ pslld xmm7,32-25
+ pxor xmm7,xmm8
+ movdqa xmm8,xmm6
+ psrld xmm8,25
+ pslld xmm6,32-25
+ pxor xmm6,xmm8
+ movdqa xmm8,xmm5
+ psrld xmm8,25
+ pslld xmm5,32-25
+ pxor xmm5,xmm8
+ movdqa xmm8,xmm4
+ psrld xmm8,25
+ pslld xmm4,32-25
+ pxor xmm4,xmm8
+ movdqa xmm8,XMMWORD[((160+80))+rbp]
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+DB 102,15,58,15,255,4
+DB 102,69,15,58,15,219,8
+DB 102,69,15,58,15,255,12
+DB 102,15,58,15,246,4
+DB 102,69,15,58,15,210,8
+DB 102,69,15,58,15,246,12
+DB 102,15,58,15,237,4
+DB 102,69,15,58,15,201,8
+DB 102,69,15,58,15,237,12
+DB 102,15,58,15,228,4
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,12
+ movdqa XMMWORD[(160+80)+rbp],xmm8
+ movdqa xmm8,XMMWORD[$L$rol16]
+ paddd xmm3,xmm7
+ paddd xmm2,xmm6
+ paddd xmm1,xmm5
+ paddd xmm0,xmm4
+ pxor xmm15,xmm3
+ pxor xmm14,xmm2
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ pxor xmm13,xmm1
+ pxor xmm12,xmm0
+DB 102,69,15,56,0,248
+DB 102,69,15,56,0,240
+DB 102,69,15,56,0,232
+DB 102,69,15,56,0,224
+ movdqa xmm8,XMMWORD[((160+80))+rbp]
+ paddd xmm11,xmm15
+ paddd xmm10,xmm14
+ paddd xmm9,xmm13
+ paddd xmm8,xmm12
+ pxor xmm7,xmm11
+ pxor xmm6,xmm10
+ pxor xmm5,xmm9
+ pxor xmm4,xmm8
+ movdqa XMMWORD[(160+80)+rbp],xmm8
+ movdqa xmm8,xmm7
+ psrld xmm8,20
+ pslld xmm7,32-20
+ pxor xmm7,xmm8
+ movdqa xmm8,xmm6
+ psrld xmm8,20
+ pslld xmm6,32-20
+ pxor xmm6,xmm8
+ movdqa xmm8,xmm5
+ psrld xmm8,20
+ pslld xmm5,32-20
+ pxor xmm5,xmm8
+ movdqa xmm8,xmm4
+ psrld xmm8,20
+ pslld xmm4,32-20
+ pxor xmm4,xmm8
+ movdqa xmm8,XMMWORD[$L$rol8]
+ paddd xmm3,xmm7
+ paddd xmm2,xmm6
+ paddd xmm1,xmm5
+ paddd xmm0,xmm4
+ pxor xmm15,xmm3
+ pxor xmm14,xmm2
+ pxor xmm13,xmm1
+ pxor xmm12,xmm0
+DB 102,69,15,56,0,248
+DB 102,69,15,56,0,240
+DB 102,69,15,56,0,232
+DB 102,69,15,56,0,224
+ movdqa xmm8,XMMWORD[((160+80))+rbp]
+ paddd xmm11,xmm15
+ paddd xmm10,xmm14
+ paddd xmm9,xmm13
+ paddd xmm8,xmm12
+ pxor xmm7,xmm11
+ pxor xmm6,xmm10
+ pxor xmm5,xmm9
+ pxor xmm4,xmm8
+ movdqa XMMWORD[(160+80)+rbp],xmm8
+ movdqa xmm8,xmm7
+ psrld xmm8,25
+ pslld xmm7,32-25
+ pxor xmm7,xmm8
+ movdqa xmm8,xmm6
+ psrld xmm8,25
+ pslld xmm6,32-25
+ pxor xmm6,xmm8
+ movdqa xmm8,xmm5
+ psrld xmm8,25
+ pslld xmm5,32-25
+ pxor xmm5,xmm8
+ movdqa xmm8,xmm4
+ psrld xmm8,25
+ pslld xmm4,32-25
+ pxor xmm4,xmm8
+ movdqa xmm8,XMMWORD[((160+80))+rbp]
+DB 102,15,58,15,255,12
+DB 102,69,15,58,15,219,8
+DB 102,69,15,58,15,255,4
+DB 102,15,58,15,246,12
+DB 102,69,15,58,15,210,8
+DB 102,69,15,58,15,246,4
+DB 102,15,58,15,237,12
+DB 102,69,15,58,15,201,8
+DB 102,69,15,58,15,237,4
+DB 102,15,58,15,228,12
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,4
+
+ dec rcx
+ jge NEAR $L$open_sse_main_loop_rounds
+ add r10,QWORD[((0+0))+r8]
+ adc r11,QWORD[((8+0))+r8]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea r8,[16+r8]
+ cmp rcx,-6
+ jg NEAR $L$open_sse_main_loop_rounds
+ paddd xmm3,XMMWORD[$L$chacha20_consts]
+ paddd xmm7,XMMWORD[((160+48))+rbp]
+ paddd xmm11,XMMWORD[((160+64))+rbp]
+ paddd xmm15,XMMWORD[((160+144))+rbp]
+ paddd xmm2,XMMWORD[$L$chacha20_consts]
+ paddd xmm6,XMMWORD[((160+48))+rbp]
+ paddd xmm10,XMMWORD[((160+64))+rbp]
+ paddd xmm14,XMMWORD[((160+128))+rbp]
+ paddd xmm1,XMMWORD[$L$chacha20_consts]
+ paddd xmm5,XMMWORD[((160+48))+rbp]
+ paddd xmm9,XMMWORD[((160+64))+rbp]
+ paddd xmm13,XMMWORD[((160+112))+rbp]
+ paddd xmm0,XMMWORD[$L$chacha20_consts]
+ paddd xmm4,XMMWORD[((160+48))+rbp]
+ paddd xmm8,XMMWORD[((160+64))+rbp]
+ paddd xmm12,XMMWORD[((160+96))+rbp]
+ movdqa XMMWORD[(160+80)+rbp],xmm12
+ movdqu xmm12,XMMWORD[((0 + 0))+rsi]
+ pxor xmm12,xmm3
+ movdqu XMMWORD[(0 + 0)+rdi],xmm12
+ movdqu xmm12,XMMWORD[((16 + 0))+rsi]
+ pxor xmm12,xmm7
+ movdqu XMMWORD[(16 + 0)+rdi],xmm12
+ movdqu xmm12,XMMWORD[((32 + 0))+rsi]
+ pxor xmm12,xmm11
+ movdqu XMMWORD[(32 + 0)+rdi],xmm12
+ movdqu xmm12,XMMWORD[((48 + 0))+rsi]
+ pxor xmm12,xmm15
+ movdqu XMMWORD[(48 + 0)+rdi],xmm12
+ movdqu xmm3,XMMWORD[((0 + 64))+rsi]
+ movdqu xmm7,XMMWORD[((16 + 64))+rsi]
+ movdqu xmm11,XMMWORD[((32 + 64))+rsi]
+ movdqu xmm15,XMMWORD[((48 + 64))+rsi]
+ pxor xmm2,xmm3
+ pxor xmm6,xmm7
+ pxor xmm10,xmm11
+ pxor xmm15,xmm14
+ movdqu XMMWORD[(0 + 64)+rdi],xmm2
+ movdqu XMMWORD[(16 + 64)+rdi],xmm6
+ movdqu XMMWORD[(32 + 64)+rdi],xmm10
+ movdqu XMMWORD[(48 + 64)+rdi],xmm15
+ movdqu xmm3,XMMWORD[((0 + 128))+rsi]
+ movdqu xmm7,XMMWORD[((16 + 128))+rsi]
+ movdqu xmm11,XMMWORD[((32 + 128))+rsi]
+ movdqu xmm15,XMMWORD[((48 + 128))+rsi]
+ pxor xmm1,xmm3
+ pxor xmm5,xmm7
+ pxor xmm9,xmm11
+ pxor xmm15,xmm13
+ movdqu XMMWORD[(0 + 128)+rdi],xmm1
+ movdqu XMMWORD[(16 + 128)+rdi],xmm5
+ movdqu XMMWORD[(32 + 128)+rdi],xmm9
+ movdqu XMMWORD[(48 + 128)+rdi],xmm15
+ movdqu xmm3,XMMWORD[((0 + 192))+rsi]
+ movdqu xmm7,XMMWORD[((16 + 192))+rsi]
+ movdqu xmm11,XMMWORD[((32 + 192))+rsi]
+ movdqu xmm15,XMMWORD[((48 + 192))+rsi]
+ pxor xmm0,xmm3
+ pxor xmm4,xmm7
+ pxor xmm8,xmm11
+ pxor xmm15,XMMWORD[((160+80))+rbp]
+ movdqu XMMWORD[(0 + 192)+rdi],xmm0
+ movdqu XMMWORD[(16 + 192)+rdi],xmm4
+ movdqu XMMWORD[(32 + 192)+rdi],xmm8
+ movdqu XMMWORD[(48 + 192)+rdi],xmm15
+
+ lea rsi,[256+rsi]
+ lea rdi,[256+rdi]
+ sub rbx,16*16
+ jmp NEAR $L$open_sse_main_loop
+$L$open_sse_tail:
+
+ test rbx,rbx
+ jz NEAR $L$open_sse_finalize
+ cmp rbx,12*16
+ ja NEAR $L$open_sse_tail_256
+ cmp rbx,8*16
+ ja NEAR $L$open_sse_tail_192
+ cmp rbx,4*16
+ ja NEAR $L$open_sse_tail_128
+ movdqa xmm0,XMMWORD[$L$chacha20_consts]
+ movdqa xmm4,XMMWORD[((160+48))+rbp]
+ movdqa xmm8,XMMWORD[((160+64))+rbp]
+ movdqa xmm12,XMMWORD[((160+96))+rbp]
+ paddd xmm12,XMMWORD[$L$sse_inc]
+ movdqa XMMWORD[(160+96)+rbp],xmm12
+
+ xor r8,r8
+ mov rcx,rbx
+ cmp rcx,16
+ jb NEAR $L$open_sse_tail_64_rounds
+$L$open_sse_tail_64_rounds_and_x1hash:
+ add r10,QWORD[((0+0))+r8*1+rsi]
+ adc r11,QWORD[((8+0))+r8*1+rsi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ sub rcx,16
+$L$open_sse_tail_64_rounds:
+ add r8,16
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol16]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,12
+ psrld xmm4,20
+ pxor xmm4,xmm3
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol8]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,7
+ psrld xmm4,25
+ pxor xmm4,xmm3
+DB 102,15,58,15,228,4
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,12
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol16]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,12
+ psrld xmm4,20
+ pxor xmm4,xmm3
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol8]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,7
+ psrld xmm4,25
+ pxor xmm4,xmm3
+DB 102,15,58,15,228,12
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,4
+
+ cmp rcx,16
+ jae NEAR $L$open_sse_tail_64_rounds_and_x1hash
+ cmp r8,10*16
+ jne NEAR $L$open_sse_tail_64_rounds
+ paddd xmm0,XMMWORD[$L$chacha20_consts]
+ paddd xmm4,XMMWORD[((160+48))+rbp]
+ paddd xmm8,XMMWORD[((160+64))+rbp]
+ paddd xmm12,XMMWORD[((160+96))+rbp]
+
+ jmp NEAR $L$open_sse_tail_64_dec_loop
+
+$L$open_sse_tail_128:
+ movdqa xmm0,XMMWORD[$L$chacha20_consts]
+ movdqa xmm4,XMMWORD[((160+48))+rbp]
+ movdqa xmm8,XMMWORD[((160+64))+rbp]
+ movdqa xmm1,xmm0
+ movdqa xmm5,xmm4
+ movdqa xmm9,xmm8
+ movdqa xmm13,XMMWORD[((160+96))+rbp]
+ paddd xmm13,XMMWORD[$L$sse_inc]
+ movdqa xmm12,xmm13
+ paddd xmm12,XMMWORD[$L$sse_inc]
+ movdqa XMMWORD[(160+96)+rbp],xmm12
+ movdqa XMMWORD[(160+112)+rbp],xmm13
+
+ mov rcx,rbx
+ and rcx,-16
+ xor r8,r8
+$L$open_sse_tail_128_rounds_and_x1hash:
+ add r10,QWORD[((0+0))+r8*1+rsi]
+ adc r11,QWORD[((8+0))+r8*1+rsi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+$L$open_sse_tail_128_rounds:
+ add r8,16
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol16]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,12
+ psrld xmm4,20
+ pxor xmm4,xmm3
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol8]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,7
+ psrld xmm4,25
+ pxor xmm4,xmm3
+DB 102,15,58,15,228,4
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,12
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol16]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,12
+ psrld xmm5,20
+ pxor xmm5,xmm3
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol8]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,7
+ psrld xmm5,25
+ pxor xmm5,xmm3
+DB 102,15,58,15,237,4
+DB 102,69,15,58,15,201,8
+DB 102,69,15,58,15,237,12
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol16]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,12
+ psrld xmm4,20
+ pxor xmm4,xmm3
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol8]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,7
+ psrld xmm4,25
+ pxor xmm4,xmm3
+DB 102,15,58,15,228,12
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,4
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol16]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,12
+ psrld xmm5,20
+ pxor xmm5,xmm3
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol8]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,7
+ psrld xmm5,25
+ pxor xmm5,xmm3
+DB 102,15,58,15,237,12
+DB 102,69,15,58,15,201,8
+DB 102,69,15,58,15,237,4
+
+ cmp r8,rcx
+ jb NEAR $L$open_sse_tail_128_rounds_and_x1hash
+ cmp r8,10*16
+ jne NEAR $L$open_sse_tail_128_rounds
+ paddd xmm1,XMMWORD[$L$chacha20_consts]
+ paddd xmm5,XMMWORD[((160+48))+rbp]
+ paddd xmm9,XMMWORD[((160+64))+rbp]
+ paddd xmm13,XMMWORD[((160+112))+rbp]
+ paddd xmm0,XMMWORD[$L$chacha20_consts]
+ paddd xmm4,XMMWORD[((160+48))+rbp]
+ paddd xmm8,XMMWORD[((160+64))+rbp]
+ paddd xmm12,XMMWORD[((160+96))+rbp]
+ movdqu xmm3,XMMWORD[((0 + 0))+rsi]
+ movdqu xmm7,XMMWORD[((16 + 0))+rsi]
+ movdqu xmm11,XMMWORD[((32 + 0))+rsi]
+ movdqu xmm15,XMMWORD[((48 + 0))+rsi]
+ pxor xmm1,xmm3
+ pxor xmm5,xmm7
+ pxor xmm9,xmm11
+ pxor xmm15,xmm13
+ movdqu XMMWORD[(0 + 0)+rdi],xmm1
+ movdqu XMMWORD[(16 + 0)+rdi],xmm5
+ movdqu XMMWORD[(32 + 0)+rdi],xmm9
+ movdqu XMMWORD[(48 + 0)+rdi],xmm15
+
+ sub rbx,4*16
+ lea rsi,[64+rsi]
+ lea rdi,[64+rdi]
+ jmp NEAR $L$open_sse_tail_64_dec_loop
+
+$L$open_sse_tail_192:
+ movdqa xmm0,XMMWORD[$L$chacha20_consts]
+ movdqa xmm4,XMMWORD[((160+48))+rbp]
+ movdqa xmm8,XMMWORD[((160+64))+rbp]
+ movdqa xmm1,xmm0
+ movdqa xmm5,xmm4
+ movdqa xmm9,xmm8
+ movdqa xmm2,xmm0
+ movdqa xmm6,xmm4
+ movdqa xmm10,xmm8
+ movdqa xmm14,XMMWORD[((160+96))+rbp]
+ paddd xmm14,XMMWORD[$L$sse_inc]
+ movdqa xmm13,xmm14
+ paddd xmm13,XMMWORD[$L$sse_inc]
+ movdqa xmm12,xmm13
+ paddd xmm12,XMMWORD[$L$sse_inc]
+ movdqa XMMWORD[(160+96)+rbp],xmm12
+ movdqa XMMWORD[(160+112)+rbp],xmm13
+ movdqa XMMWORD[(160+128)+rbp],xmm14
+
+ mov rcx,rbx
+ mov r8,10*16
+ cmp rcx,10*16
+ cmovg rcx,r8
+ and rcx,-16
+ xor r8,r8
+$L$open_sse_tail_192_rounds_and_x1hash:
+ add r10,QWORD[((0+0))+r8*1+rsi]
+ adc r11,QWORD[((8+0))+r8*1+rsi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+$L$open_sse_tail_192_rounds:
+ add r8,16
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol16]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,12
+ psrld xmm4,20
+ pxor xmm4,xmm3
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol8]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,7
+ psrld xmm4,25
+ pxor xmm4,xmm3
+DB 102,15,58,15,228,4
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,12
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol16]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,12
+ psrld xmm5,20
+ pxor xmm5,xmm3
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol8]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,7
+ psrld xmm5,25
+ pxor xmm5,xmm3
+DB 102,15,58,15,237,4
+DB 102,69,15,58,15,201,8
+DB 102,69,15,58,15,237,12
+ paddd xmm2,xmm6
+ pxor xmm14,xmm2
+ pshufb xmm14,XMMWORD[$L$rol16]
+ paddd xmm10,xmm14
+ pxor xmm6,xmm10
+ movdqa xmm3,xmm6
+ pslld xmm3,12
+ psrld xmm6,20
+ pxor xmm6,xmm3
+ paddd xmm2,xmm6
+ pxor xmm14,xmm2
+ pshufb xmm14,XMMWORD[$L$rol8]
+ paddd xmm10,xmm14
+ pxor xmm6,xmm10
+ movdqa xmm3,xmm6
+ pslld xmm3,7
+ psrld xmm6,25
+ pxor xmm6,xmm3
+DB 102,15,58,15,246,4
+DB 102,69,15,58,15,210,8
+DB 102,69,15,58,15,246,12
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol16]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,12
+ psrld xmm4,20
+ pxor xmm4,xmm3
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol8]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,7
+ psrld xmm4,25
+ pxor xmm4,xmm3
+DB 102,15,58,15,228,12
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,4
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol16]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,12
+ psrld xmm5,20
+ pxor xmm5,xmm3
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol8]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,7
+ psrld xmm5,25
+ pxor xmm5,xmm3
+DB 102,15,58,15,237,12
+DB 102,69,15,58,15,201,8
+DB 102,69,15,58,15,237,4
+ paddd xmm2,xmm6
+ pxor xmm14,xmm2
+ pshufb xmm14,XMMWORD[$L$rol16]
+ paddd xmm10,xmm14
+ pxor xmm6,xmm10
+ movdqa xmm3,xmm6
+ pslld xmm3,12
+ psrld xmm6,20
+ pxor xmm6,xmm3
+ paddd xmm2,xmm6
+ pxor xmm14,xmm2
+ pshufb xmm14,XMMWORD[$L$rol8]
+ paddd xmm10,xmm14
+ pxor xmm6,xmm10
+ movdqa xmm3,xmm6
+ pslld xmm3,7
+ psrld xmm6,25
+ pxor xmm6,xmm3
+DB 102,15,58,15,246,12
+DB 102,69,15,58,15,210,8
+DB 102,69,15,58,15,246,4
+
+ cmp r8,rcx
+ jb NEAR $L$open_sse_tail_192_rounds_and_x1hash
+ cmp r8,10*16
+ jne NEAR $L$open_sse_tail_192_rounds
+ cmp rbx,11*16
+ jb NEAR $L$open_sse_tail_192_finish
+ add r10,QWORD[((0+160))+rsi]
+ adc r11,QWORD[((8+160))+rsi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ cmp rbx,12*16
+ jb NEAR $L$open_sse_tail_192_finish
+ add r10,QWORD[((0+176))+rsi]
+ adc r11,QWORD[((8+176))+rsi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+$L$open_sse_tail_192_finish:
+ paddd xmm2,XMMWORD[$L$chacha20_consts]
+ paddd xmm6,XMMWORD[((160+48))+rbp]
+ paddd xmm10,XMMWORD[((160+64))+rbp]
+ paddd xmm14,XMMWORD[((160+128))+rbp]
+ paddd xmm1,XMMWORD[$L$chacha20_consts]
+ paddd xmm5,XMMWORD[((160+48))+rbp]
+ paddd xmm9,XMMWORD[((160+64))+rbp]
+ paddd xmm13,XMMWORD[((160+112))+rbp]
+ paddd xmm0,XMMWORD[$L$chacha20_consts]
+ paddd xmm4,XMMWORD[((160+48))+rbp]
+ paddd xmm8,XMMWORD[((160+64))+rbp]
+ paddd xmm12,XMMWORD[((160+96))+rbp]
+ movdqu xmm3,XMMWORD[((0 + 0))+rsi]
+ movdqu xmm7,XMMWORD[((16 + 0))+rsi]
+ movdqu xmm11,XMMWORD[((32 + 0))+rsi]
+ movdqu xmm15,XMMWORD[((48 + 0))+rsi]
+ pxor xmm2,xmm3
+ pxor xmm6,xmm7
+ pxor xmm10,xmm11
+ pxor xmm15,xmm14
+ movdqu XMMWORD[(0 + 0)+rdi],xmm2
+ movdqu XMMWORD[(16 + 0)+rdi],xmm6
+ movdqu XMMWORD[(32 + 0)+rdi],xmm10
+ movdqu XMMWORD[(48 + 0)+rdi],xmm15
+ movdqu xmm3,XMMWORD[((0 + 64))+rsi]
+ movdqu xmm7,XMMWORD[((16 + 64))+rsi]
+ movdqu xmm11,XMMWORD[((32 + 64))+rsi]
+ movdqu xmm15,XMMWORD[((48 + 64))+rsi]
+ pxor xmm1,xmm3
+ pxor xmm5,xmm7
+ pxor xmm9,xmm11
+ pxor xmm15,xmm13
+ movdqu XMMWORD[(0 + 64)+rdi],xmm1
+ movdqu XMMWORD[(16 + 64)+rdi],xmm5
+ movdqu XMMWORD[(32 + 64)+rdi],xmm9
+ movdqu XMMWORD[(48 + 64)+rdi],xmm15
+
+ sub rbx,8*16
+ lea rsi,[128+rsi]
+ lea rdi,[128+rdi]
+ jmp NEAR $L$open_sse_tail_64_dec_loop
+
+$L$open_sse_tail_256:
+ movdqa xmm0,XMMWORD[$L$chacha20_consts]
+ movdqa xmm4,XMMWORD[((160+48))+rbp]
+ movdqa xmm8,XMMWORD[((160+64))+rbp]
+ movdqa xmm1,xmm0
+ movdqa xmm5,xmm4
+ movdqa xmm9,xmm8
+ movdqa xmm2,xmm0
+ movdqa xmm6,xmm4
+ movdqa xmm10,xmm8
+ movdqa xmm3,xmm0
+ movdqa xmm7,xmm4
+ movdqa xmm11,xmm8
+ movdqa xmm15,XMMWORD[((160+96))+rbp]
+ paddd xmm15,XMMWORD[$L$sse_inc]
+ movdqa xmm14,xmm15
+ paddd xmm14,XMMWORD[$L$sse_inc]
+ movdqa xmm13,xmm14
+ paddd xmm13,XMMWORD[$L$sse_inc]
+ movdqa xmm12,xmm13
+ paddd xmm12,XMMWORD[$L$sse_inc]
+ movdqa XMMWORD[(160+96)+rbp],xmm12
+ movdqa XMMWORD[(160+112)+rbp],xmm13
+ movdqa XMMWORD[(160+128)+rbp],xmm14
+ movdqa XMMWORD[(160+144)+rbp],xmm15
+
+ xor r8,r8
+$L$open_sse_tail_256_rounds_and_x1hash:
+ add r10,QWORD[((0+0))+r8*1+rsi]
+ adc r11,QWORD[((8+0))+r8*1+rsi]
+ adc r12,1
+ movdqa XMMWORD[(160+80)+rbp],xmm11
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol16]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm11,xmm4
+ pslld xmm11,12
+ psrld xmm4,20
+ pxor xmm4,xmm11
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol8]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm11,xmm4
+ pslld xmm11,7
+ psrld xmm4,25
+ pxor xmm4,xmm11
+DB 102,15,58,15,228,4
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,12
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol16]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm11,xmm5
+ pslld xmm11,12
+ psrld xmm5,20
+ pxor xmm5,xmm11
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol8]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm11,xmm5
+ pslld xmm11,7
+ psrld xmm5,25
+ pxor xmm5,xmm11
+DB 102,15,58,15,237,4
+DB 102,69,15,58,15,201,8
+DB 102,69,15,58,15,237,12
+ paddd xmm2,xmm6
+ pxor xmm14,xmm2
+ pshufb xmm14,XMMWORD[$L$rol16]
+ paddd xmm10,xmm14
+ pxor xmm6,xmm10
+ movdqa xmm11,xmm6
+ pslld xmm11,12
+ psrld xmm6,20
+ pxor xmm6,xmm11
+ paddd xmm2,xmm6
+ pxor xmm14,xmm2
+ pshufb xmm14,XMMWORD[$L$rol8]
+ paddd xmm10,xmm14
+ pxor xmm6,xmm10
+ movdqa xmm11,xmm6
+ pslld xmm11,7
+ psrld xmm6,25
+ pxor xmm6,xmm11
+DB 102,15,58,15,246,4
+DB 102,69,15,58,15,210,8
+DB 102,69,15,58,15,246,12
+ movdqa xmm11,XMMWORD[((160+80))+rbp]
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ movdqa XMMWORD[(160+80)+rbp],xmm9
+ paddd xmm3,xmm7
+ pxor xmm15,xmm3
+ pshufb xmm15,XMMWORD[$L$rol16]
+ paddd xmm11,xmm15
+ pxor xmm7,xmm11
+ movdqa xmm9,xmm7
+ pslld xmm9,12
+ psrld xmm7,20
+ pxor xmm7,xmm9
+ paddd xmm3,xmm7
+ pxor xmm15,xmm3
+ pshufb xmm15,XMMWORD[$L$rol8]
+ paddd xmm11,xmm15
+ pxor xmm7,xmm11
+ movdqa xmm9,xmm7
+ pslld xmm9,7
+ psrld xmm7,25
+ pxor xmm7,xmm9
+DB 102,15,58,15,255,4
+DB 102,69,15,58,15,219,8
+DB 102,69,15,58,15,255,12
+ movdqa xmm9,XMMWORD[((160+80))+rbp]
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ movdqa XMMWORD[(160+80)+rbp],xmm11
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol16]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm11,xmm4
+ pslld xmm11,12
+ psrld xmm4,20
+ pxor xmm4,xmm11
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol8]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm11,xmm4
+ pslld xmm11,7
+ psrld xmm4,25
+ pxor xmm4,xmm11
+DB 102,15,58,15,228,12
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,4
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol16]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm11,xmm5
+ pslld xmm11,12
+ psrld xmm5,20
+ pxor xmm5,xmm11
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol8]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm11,xmm5
+ pslld xmm11,7
+ psrld xmm5,25
+ pxor xmm5,xmm11
+DB 102,15,58,15,237,12
+DB 102,69,15,58,15,201,8
+DB 102,69,15,58,15,237,4
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ paddd xmm2,xmm6
+ pxor xmm14,xmm2
+ pshufb xmm14,XMMWORD[$L$rol16]
+ paddd xmm10,xmm14
+ pxor xmm6,xmm10
+ movdqa xmm11,xmm6
+ pslld xmm11,12
+ psrld xmm6,20
+ pxor xmm6,xmm11
+ paddd xmm2,xmm6
+ pxor xmm14,xmm2
+ pshufb xmm14,XMMWORD[$L$rol8]
+ paddd xmm10,xmm14
+ pxor xmm6,xmm10
+ movdqa xmm11,xmm6
+ pslld xmm11,7
+ psrld xmm6,25
+ pxor xmm6,xmm11
+DB 102,15,58,15,246,12
+DB 102,69,15,58,15,210,8
+DB 102,69,15,58,15,246,4
+ movdqa xmm11,XMMWORD[((160+80))+rbp]
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ movdqa XMMWORD[(160+80)+rbp],xmm9
+ paddd xmm3,xmm7
+ pxor xmm15,xmm3
+ pshufb xmm15,XMMWORD[$L$rol16]
+ paddd xmm11,xmm15
+ pxor xmm7,xmm11
+ movdqa xmm9,xmm7
+ pslld xmm9,12
+ psrld xmm7,20
+ pxor xmm7,xmm9
+ paddd xmm3,xmm7
+ pxor xmm15,xmm3
+ pshufb xmm15,XMMWORD[$L$rol8]
+ paddd xmm11,xmm15
+ pxor xmm7,xmm11
+ movdqa xmm9,xmm7
+ pslld xmm9,7
+ psrld xmm7,25
+ pxor xmm7,xmm9
+DB 102,15,58,15,255,12
+DB 102,69,15,58,15,219,8
+DB 102,69,15,58,15,255,4
+ movdqa xmm9,XMMWORD[((160+80))+rbp]
+
+ add r8,16
+ cmp r8,10*16
+ jb NEAR $L$open_sse_tail_256_rounds_and_x1hash
+
+ mov rcx,rbx
+ and rcx,-16
+$L$open_sse_tail_256_hash:
+ add r10,QWORD[((0+0))+r8*1+rsi]
+ adc r11,QWORD[((8+0))+r8*1+rsi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ add r8,16
+ cmp r8,rcx
+ jb NEAR $L$open_sse_tail_256_hash
+ paddd xmm3,XMMWORD[$L$chacha20_consts]
+ paddd xmm7,XMMWORD[((160+48))+rbp]
+ paddd xmm11,XMMWORD[((160+64))+rbp]
+ paddd xmm15,XMMWORD[((160+144))+rbp]
+ paddd xmm2,XMMWORD[$L$chacha20_consts]
+ paddd xmm6,XMMWORD[((160+48))+rbp]
+ paddd xmm10,XMMWORD[((160+64))+rbp]
+ paddd xmm14,XMMWORD[((160+128))+rbp]
+ paddd xmm1,XMMWORD[$L$chacha20_consts]
+ paddd xmm5,XMMWORD[((160+48))+rbp]
+ paddd xmm9,XMMWORD[((160+64))+rbp]
+ paddd xmm13,XMMWORD[((160+112))+rbp]
+ paddd xmm0,XMMWORD[$L$chacha20_consts]
+ paddd xmm4,XMMWORD[((160+48))+rbp]
+ paddd xmm8,XMMWORD[((160+64))+rbp]
+ paddd xmm12,XMMWORD[((160+96))+rbp]
+ movdqa XMMWORD[(160+80)+rbp],xmm12
+ movdqu xmm12,XMMWORD[((0 + 0))+rsi]
+ pxor xmm12,xmm3
+ movdqu XMMWORD[(0 + 0)+rdi],xmm12
+ movdqu xmm12,XMMWORD[((16 + 0))+rsi]
+ pxor xmm12,xmm7
+ movdqu XMMWORD[(16 + 0)+rdi],xmm12
+ movdqu xmm12,XMMWORD[((32 + 0))+rsi]
+ pxor xmm12,xmm11
+ movdqu XMMWORD[(32 + 0)+rdi],xmm12
+ movdqu xmm12,XMMWORD[((48 + 0))+rsi]
+ pxor xmm12,xmm15
+ movdqu XMMWORD[(48 + 0)+rdi],xmm12
+ movdqu xmm3,XMMWORD[((0 + 64))+rsi]
+ movdqu xmm7,XMMWORD[((16 + 64))+rsi]
+ movdqu xmm11,XMMWORD[((32 + 64))+rsi]
+ movdqu xmm15,XMMWORD[((48 + 64))+rsi]
+ pxor xmm2,xmm3
+ pxor xmm6,xmm7
+ pxor xmm10,xmm11
+ pxor xmm15,xmm14
+ movdqu XMMWORD[(0 + 64)+rdi],xmm2
+ movdqu XMMWORD[(16 + 64)+rdi],xmm6
+ movdqu XMMWORD[(32 + 64)+rdi],xmm10
+ movdqu XMMWORD[(48 + 64)+rdi],xmm15
+ movdqu xmm3,XMMWORD[((0 + 128))+rsi]
+ movdqu xmm7,XMMWORD[((16 + 128))+rsi]
+ movdqu xmm11,XMMWORD[((32 + 128))+rsi]
+ movdqu xmm15,XMMWORD[((48 + 128))+rsi]
+ pxor xmm1,xmm3
+ pxor xmm5,xmm7
+ pxor xmm9,xmm11
+ pxor xmm15,xmm13
+ movdqu XMMWORD[(0 + 128)+rdi],xmm1
+ movdqu XMMWORD[(16 + 128)+rdi],xmm5
+ movdqu XMMWORD[(32 + 128)+rdi],xmm9
+ movdqu XMMWORD[(48 + 128)+rdi],xmm15
+
+ movdqa xmm12,XMMWORD[((160+80))+rbp]
+ sub rbx,12*16
+ lea rsi,[192+rsi]
+ lea rdi,[192+rdi]
+
+
+$L$open_sse_tail_64_dec_loop:
+ cmp rbx,16
+ jb NEAR $L$open_sse_tail_16_init
+ sub rbx,16
+ movdqu xmm3,XMMWORD[rsi]
+ pxor xmm0,xmm3
+ movdqu XMMWORD[rdi],xmm0
+ lea rsi,[16+rsi]
+ lea rdi,[16+rdi]
+ movdqa xmm0,xmm4
+ movdqa xmm4,xmm8
+ movdqa xmm8,xmm12
+ jmp NEAR $L$open_sse_tail_64_dec_loop
+$L$open_sse_tail_16_init:
+ movdqa xmm1,xmm0
+
+
+$L$open_sse_tail_16:
+ test rbx,rbx
+ jz NEAR $L$open_sse_finalize
+
+
+
+ pxor xmm3,xmm3
+ lea rsi,[((-1))+rbx*1+rsi]
+ mov r8,rbx
+$L$open_sse_tail_16_compose:
+ pslldq xmm3,1
+ pinsrb xmm3,BYTE[rsi],0
+ sub rsi,1
+ sub r8,1
+ jnz NEAR $L$open_sse_tail_16_compose
+
+DB 102,73,15,126,221
+ pextrq r14,xmm3,1
+
+ pxor xmm3,xmm1
+
+
+$L$open_sse_tail_16_extract:
+ pextrb XMMWORD[rdi],xmm3,0
+ psrldq xmm3,1
+ add rdi,1
+ sub rbx,1
+ jne NEAR $L$open_sse_tail_16_extract
+
+ add r10,r13
+ adc r11,r14
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+
+$L$open_sse_finalize:
+ add r10,QWORD[((0+160+32))+rbp]
+ adc r11,QWORD[((8+160+32))+rbp]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+
+ mov r13,r10
+ mov r14,r11
+ mov r15,r12
+ sub r10,-5
+ sbb r11,-1
+ sbb r12,3
+ cmovc r10,r13
+ cmovc r11,r14
+ cmovc r12,r15
+
+ add r10,QWORD[((0+160+16))+rbp]
+ adc r11,QWORD[((8+160+16))+rbp]
+
+ movaps xmm6,XMMWORD[((0+0))+rbp]
+ movaps xmm7,XMMWORD[((16+0))+rbp]
+ movaps xmm8,XMMWORD[((32+0))+rbp]
+ movaps xmm9,XMMWORD[((48+0))+rbp]
+ movaps xmm10,XMMWORD[((64+0))+rbp]
+ movaps xmm11,XMMWORD[((80+0))+rbp]
+ movaps xmm12,XMMWORD[((96+0))+rbp]
+ movaps xmm13,XMMWORD[((112+0))+rbp]
+ movaps xmm14,XMMWORD[((128+0))+rbp]
+ movaps xmm15,XMMWORD[((144+0))+rbp]
+
+
+ add rsp,288 + 160 + 32
+
+
+ pop r9
+
+ mov QWORD[r9],r10
+ mov QWORD[8+r9],r11
+ pop r15
+
+ pop r14
+
+ pop r13
+
+ pop r12
+
+ pop rbx
+
+ pop rbp
+
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$open_sse_128:
+
+ movdqu xmm0,XMMWORD[$L$chacha20_consts]
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm0
+ movdqu xmm4,XMMWORD[r9]
+ movdqa xmm5,xmm4
+ movdqa xmm6,xmm4
+ movdqu xmm8,XMMWORD[16+r9]
+ movdqa xmm9,xmm8
+ movdqa xmm10,xmm8
+ movdqu xmm12,XMMWORD[32+r9]
+ movdqa xmm13,xmm12
+ paddd xmm13,XMMWORD[$L$sse_inc]
+ movdqa xmm14,xmm13
+ paddd xmm14,XMMWORD[$L$sse_inc]
+ movdqa xmm7,xmm4
+ movdqa xmm11,xmm8
+ movdqa xmm15,xmm13
+ mov r10,10
+
+$L$open_sse_128_rounds:
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol16]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,12
+ psrld xmm4,20
+ pxor xmm4,xmm3
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol8]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,7
+ psrld xmm4,25
+ pxor xmm4,xmm3
+DB 102,15,58,15,228,4
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,12
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol16]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,12
+ psrld xmm5,20
+ pxor xmm5,xmm3
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol8]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,7
+ psrld xmm5,25
+ pxor xmm5,xmm3
+DB 102,15,58,15,237,4
+DB 102,69,15,58,15,201,8
+DB 102,69,15,58,15,237,12
+ paddd xmm2,xmm6
+ pxor xmm14,xmm2
+ pshufb xmm14,XMMWORD[$L$rol16]
+ paddd xmm10,xmm14
+ pxor xmm6,xmm10
+ movdqa xmm3,xmm6
+ pslld xmm3,12
+ psrld xmm6,20
+ pxor xmm6,xmm3
+ paddd xmm2,xmm6
+ pxor xmm14,xmm2
+ pshufb xmm14,XMMWORD[$L$rol8]
+ paddd xmm10,xmm14
+ pxor xmm6,xmm10
+ movdqa xmm3,xmm6
+ pslld xmm3,7
+ psrld xmm6,25
+ pxor xmm6,xmm3
+DB 102,15,58,15,246,4
+DB 102,69,15,58,15,210,8
+DB 102,69,15,58,15,246,12
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol16]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,12
+ psrld xmm4,20
+ pxor xmm4,xmm3
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol8]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,7
+ psrld xmm4,25
+ pxor xmm4,xmm3
+DB 102,15,58,15,228,12
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,4
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol16]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,12
+ psrld xmm5,20
+ pxor xmm5,xmm3
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol8]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,7
+ psrld xmm5,25
+ pxor xmm5,xmm3
+DB 102,15,58,15,237,12
+DB 102,69,15,58,15,201,8
+DB 102,69,15,58,15,237,4
+ paddd xmm2,xmm6
+ pxor xmm14,xmm2
+ pshufb xmm14,XMMWORD[$L$rol16]
+ paddd xmm10,xmm14
+ pxor xmm6,xmm10
+ movdqa xmm3,xmm6
+ pslld xmm3,12
+ psrld xmm6,20
+ pxor xmm6,xmm3
+ paddd xmm2,xmm6
+ pxor xmm14,xmm2
+ pshufb xmm14,XMMWORD[$L$rol8]
+ paddd xmm10,xmm14
+ pxor xmm6,xmm10
+ movdqa xmm3,xmm6
+ pslld xmm3,7
+ psrld xmm6,25
+ pxor xmm6,xmm3
+DB 102,15,58,15,246,12
+DB 102,69,15,58,15,210,8
+DB 102,69,15,58,15,246,4
+
+ dec r10
+ jnz NEAR $L$open_sse_128_rounds
+ paddd xmm0,XMMWORD[$L$chacha20_consts]
+ paddd xmm1,XMMWORD[$L$chacha20_consts]
+ paddd xmm2,XMMWORD[$L$chacha20_consts]
+ paddd xmm4,xmm7
+ paddd xmm5,xmm7
+ paddd xmm6,xmm7
+ paddd xmm9,xmm11
+ paddd xmm10,xmm11
+ paddd xmm13,xmm15
+ paddd xmm15,XMMWORD[$L$sse_inc]
+ paddd xmm14,xmm15
+
+ pand xmm0,XMMWORD[$L$clamp]
+ movdqa XMMWORD[(160+0)+rbp],xmm0
+ movdqa XMMWORD[(160+16)+rbp],xmm4
+
+ mov r8,r8
+ call poly_hash_ad_internal
+$L$open_sse_128_xor_hash:
+ cmp rbx,16
+ jb NEAR $L$open_sse_tail_16
+ sub rbx,16
+ add r10,QWORD[((0+0))+rsi]
+ adc r11,QWORD[((8+0))+rsi]
+ adc r12,1
+
+
+ movdqu xmm3,XMMWORD[rsi]
+ pxor xmm1,xmm3
+ movdqu XMMWORD[rdi],xmm1
+ lea rsi,[16+rsi]
+ lea rdi,[16+rdi]
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+
+ movdqa xmm1,xmm5
+ movdqa xmm5,xmm9
+ movdqa xmm9,xmm13
+ movdqa xmm13,xmm2
+ movdqa xmm2,xmm6
+ movdqa xmm6,xmm10
+ movdqa xmm10,xmm14
+ jmp NEAR $L$open_sse_128_xor_hash
+$L$SEH_end_chacha20_poly1305_open:
+
+
+
+
+
+
+
+
+global chacha20_poly1305_seal
+
+ALIGN 64
+chacha20_poly1305_seal:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_chacha20_poly1305_seal:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+ mov r9,QWORD[48+rsp]
+
+
+
+_CET_ENDBR
+ push rbp
+
+ push rbx
+
+ push r12
+
+ push r13
+
+ push r14
+
+ push r15
+
+
+
+ push r9
+
+ sub rsp,288 + 160 + 32
+
+ lea rbp,[32+rsp]
+ and rbp,-32
+
+ movaps XMMWORD[(0+0)+rbp],xmm6
+ movaps XMMWORD[(16+0)+rbp],xmm7
+ movaps XMMWORD[(32+0)+rbp],xmm8
+ movaps XMMWORD[(48+0)+rbp],xmm9
+ movaps XMMWORD[(64+0)+rbp],xmm10
+ movaps XMMWORD[(80+0)+rbp],xmm11
+ movaps XMMWORD[(96+0)+rbp],xmm12
+ movaps XMMWORD[(112+0)+rbp],xmm13
+ movaps XMMWORD[(128+0)+rbp],xmm14
+ movaps XMMWORD[(144+0)+rbp],xmm15
+
+ mov rbx,QWORD[56+r9]
+ add rbx,rdx
+ mov QWORD[((0+160+32))+rbp],r8
+ mov QWORD[((8+160+32))+rbp],rbx
+ mov rbx,rdx
+
+ mov eax,DWORD[((OPENSSL_ia32cap_P+8))]
+ and eax,288
+ xor eax,288
+ jz NEAR chacha20_poly1305_seal_avx2
+
+ cmp rbx,128
+ jbe NEAR $L$seal_sse_128
+
+ movdqa xmm0,XMMWORD[$L$chacha20_consts]
+ movdqu xmm4,XMMWORD[r9]
+ movdqu xmm8,XMMWORD[16+r9]
+ movdqu xmm12,XMMWORD[32+r9]
+
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm0
+ movdqa xmm3,xmm0
+ movdqa xmm5,xmm4
+ movdqa xmm6,xmm4
+ movdqa xmm7,xmm4
+ movdqa xmm9,xmm8
+ movdqa xmm10,xmm8
+ movdqa xmm11,xmm8
+ movdqa xmm15,xmm12
+ paddd xmm12,XMMWORD[$L$sse_inc]
+ movdqa xmm14,xmm12
+ paddd xmm12,XMMWORD[$L$sse_inc]
+ movdqa xmm13,xmm12
+ paddd xmm12,XMMWORD[$L$sse_inc]
+
+ movdqa XMMWORD[(160+48)+rbp],xmm4
+ movdqa XMMWORD[(160+64)+rbp],xmm8
+ movdqa XMMWORD[(160+96)+rbp],xmm12
+ movdqa XMMWORD[(160+112)+rbp],xmm13
+ movdqa XMMWORD[(160+128)+rbp],xmm14
+ movdqa XMMWORD[(160+144)+rbp],xmm15
+ mov r10,10
+$L$seal_sse_init_rounds:
+ movdqa XMMWORD[(160+80)+rbp],xmm8
+ movdqa xmm8,XMMWORD[$L$rol16]
+ paddd xmm3,xmm7
+ paddd xmm2,xmm6
+ paddd xmm1,xmm5
+ paddd xmm0,xmm4
+ pxor xmm15,xmm3
+ pxor xmm14,xmm2
+ pxor xmm13,xmm1
+ pxor xmm12,xmm0
+DB 102,69,15,56,0,248
+DB 102,69,15,56,0,240
+DB 102,69,15,56,0,232
+DB 102,69,15,56,0,224
+ movdqa xmm8,XMMWORD[((160+80))+rbp]
+ paddd xmm11,xmm15
+ paddd xmm10,xmm14
+ paddd xmm9,xmm13
+ paddd xmm8,xmm12
+ pxor xmm7,xmm11
+ pxor xmm6,xmm10
+ pxor xmm5,xmm9
+ pxor xmm4,xmm8
+ movdqa XMMWORD[(160+80)+rbp],xmm8
+ movdqa xmm8,xmm7
+ psrld xmm8,20
+ pslld xmm7,32-20
+ pxor xmm7,xmm8
+ movdqa xmm8,xmm6
+ psrld xmm8,20
+ pslld xmm6,32-20
+ pxor xmm6,xmm8
+ movdqa xmm8,xmm5
+ psrld xmm8,20
+ pslld xmm5,32-20
+ pxor xmm5,xmm8
+ movdqa xmm8,xmm4
+ psrld xmm8,20
+ pslld xmm4,32-20
+ pxor xmm4,xmm8
+ movdqa xmm8,XMMWORD[$L$rol8]
+ paddd xmm3,xmm7
+ paddd xmm2,xmm6
+ paddd xmm1,xmm5
+ paddd xmm0,xmm4
+ pxor xmm15,xmm3
+ pxor xmm14,xmm2
+ pxor xmm13,xmm1
+ pxor xmm12,xmm0
+DB 102,69,15,56,0,248
+DB 102,69,15,56,0,240
+DB 102,69,15,56,0,232
+DB 102,69,15,56,0,224
+ movdqa xmm8,XMMWORD[((160+80))+rbp]
+ paddd xmm11,xmm15
+ paddd xmm10,xmm14
+ paddd xmm9,xmm13
+ paddd xmm8,xmm12
+ pxor xmm7,xmm11
+ pxor xmm6,xmm10
+ pxor xmm5,xmm9
+ pxor xmm4,xmm8
+ movdqa XMMWORD[(160+80)+rbp],xmm8
+ movdqa xmm8,xmm7
+ psrld xmm8,25
+ pslld xmm7,32-25
+ pxor xmm7,xmm8
+ movdqa xmm8,xmm6
+ psrld xmm8,25
+ pslld xmm6,32-25
+ pxor xmm6,xmm8
+ movdqa xmm8,xmm5
+ psrld xmm8,25
+ pslld xmm5,32-25
+ pxor xmm5,xmm8
+ movdqa xmm8,xmm4
+ psrld xmm8,25
+ pslld xmm4,32-25
+ pxor xmm4,xmm8
+ movdqa xmm8,XMMWORD[((160+80))+rbp]
+DB 102,15,58,15,255,4
+DB 102,69,15,58,15,219,8
+DB 102,69,15,58,15,255,12
+DB 102,15,58,15,246,4
+DB 102,69,15,58,15,210,8
+DB 102,69,15,58,15,246,12
+DB 102,15,58,15,237,4
+DB 102,69,15,58,15,201,8
+DB 102,69,15,58,15,237,12
+DB 102,15,58,15,228,4
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,12
+ movdqa XMMWORD[(160+80)+rbp],xmm8
+ movdqa xmm8,XMMWORD[$L$rol16]
+ paddd xmm3,xmm7
+ paddd xmm2,xmm6
+ paddd xmm1,xmm5
+ paddd xmm0,xmm4
+ pxor xmm15,xmm3
+ pxor xmm14,xmm2
+ pxor xmm13,xmm1
+ pxor xmm12,xmm0
+DB 102,69,15,56,0,248
+DB 102,69,15,56,0,240
+DB 102,69,15,56,0,232
+DB 102,69,15,56,0,224
+ movdqa xmm8,XMMWORD[((160+80))+rbp]
+ paddd xmm11,xmm15
+ paddd xmm10,xmm14
+ paddd xmm9,xmm13
+ paddd xmm8,xmm12
+ pxor xmm7,xmm11
+ pxor xmm6,xmm10
+ pxor xmm5,xmm9
+ pxor xmm4,xmm8
+ movdqa XMMWORD[(160+80)+rbp],xmm8
+ movdqa xmm8,xmm7
+ psrld xmm8,20
+ pslld xmm7,32-20
+ pxor xmm7,xmm8
+ movdqa xmm8,xmm6
+ psrld xmm8,20
+ pslld xmm6,32-20
+ pxor xmm6,xmm8
+ movdqa xmm8,xmm5
+ psrld xmm8,20
+ pslld xmm5,32-20
+ pxor xmm5,xmm8
+ movdqa xmm8,xmm4
+ psrld xmm8,20
+ pslld xmm4,32-20
+ pxor xmm4,xmm8
+ movdqa xmm8,XMMWORD[$L$rol8]
+ paddd xmm3,xmm7
+ paddd xmm2,xmm6
+ paddd xmm1,xmm5
+ paddd xmm0,xmm4
+ pxor xmm15,xmm3
+ pxor xmm14,xmm2
+ pxor xmm13,xmm1
+ pxor xmm12,xmm0
+DB 102,69,15,56,0,248
+DB 102,69,15,56,0,240
+DB 102,69,15,56,0,232
+DB 102,69,15,56,0,224
+ movdqa xmm8,XMMWORD[((160+80))+rbp]
+ paddd xmm11,xmm15
+ paddd xmm10,xmm14
+ paddd xmm9,xmm13
+ paddd xmm8,xmm12
+ pxor xmm7,xmm11
+ pxor xmm6,xmm10
+ pxor xmm5,xmm9
+ pxor xmm4,xmm8
+ movdqa XMMWORD[(160+80)+rbp],xmm8
+ movdqa xmm8,xmm7
+ psrld xmm8,25
+ pslld xmm7,32-25
+ pxor xmm7,xmm8
+ movdqa xmm8,xmm6
+ psrld xmm8,25
+ pslld xmm6,32-25
+ pxor xmm6,xmm8
+ movdqa xmm8,xmm5
+ psrld xmm8,25
+ pslld xmm5,32-25
+ pxor xmm5,xmm8
+ movdqa xmm8,xmm4
+ psrld xmm8,25
+ pslld xmm4,32-25
+ pxor xmm4,xmm8
+ movdqa xmm8,XMMWORD[((160+80))+rbp]
+DB 102,15,58,15,255,12
+DB 102,69,15,58,15,219,8
+DB 102,69,15,58,15,255,4
+DB 102,15,58,15,246,12
+DB 102,69,15,58,15,210,8
+DB 102,69,15,58,15,246,4
+DB 102,15,58,15,237,12
+DB 102,69,15,58,15,201,8
+DB 102,69,15,58,15,237,4
+DB 102,15,58,15,228,12
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,4
+
+ dec r10
+ jnz NEAR $L$seal_sse_init_rounds
+ paddd xmm3,XMMWORD[$L$chacha20_consts]
+ paddd xmm7,XMMWORD[((160+48))+rbp]
+ paddd xmm11,XMMWORD[((160+64))+rbp]
+ paddd xmm15,XMMWORD[((160+144))+rbp]
+ paddd xmm2,XMMWORD[$L$chacha20_consts]
+ paddd xmm6,XMMWORD[((160+48))+rbp]
+ paddd xmm10,XMMWORD[((160+64))+rbp]
+ paddd xmm14,XMMWORD[((160+128))+rbp]
+ paddd xmm1,XMMWORD[$L$chacha20_consts]
+ paddd xmm5,XMMWORD[((160+48))+rbp]
+ paddd xmm9,XMMWORD[((160+64))+rbp]
+ paddd xmm13,XMMWORD[((160+112))+rbp]
+ paddd xmm0,XMMWORD[$L$chacha20_consts]
+ paddd xmm4,XMMWORD[((160+48))+rbp]
+ paddd xmm8,XMMWORD[((160+64))+rbp]
+ paddd xmm12,XMMWORD[((160+96))+rbp]
+
+
+ pand xmm3,XMMWORD[$L$clamp]
+ movdqa XMMWORD[(160+0)+rbp],xmm3
+ movdqa XMMWORD[(160+16)+rbp],xmm7
+
+ mov r8,r8
+ call poly_hash_ad_internal
+ movdqu xmm3,XMMWORD[((0 + 0))+rsi]
+ movdqu xmm7,XMMWORD[((16 + 0))+rsi]
+ movdqu xmm11,XMMWORD[((32 + 0))+rsi]
+ movdqu xmm15,XMMWORD[((48 + 0))+rsi]
+ pxor xmm2,xmm3
+ pxor xmm6,xmm7
+ pxor xmm10,xmm11
+ pxor xmm15,xmm14
+ movdqu XMMWORD[(0 + 0)+rdi],xmm2
+ movdqu XMMWORD[(16 + 0)+rdi],xmm6
+ movdqu XMMWORD[(32 + 0)+rdi],xmm10
+ movdqu XMMWORD[(48 + 0)+rdi],xmm15
+ movdqu xmm3,XMMWORD[((0 + 64))+rsi]
+ movdqu xmm7,XMMWORD[((16 + 64))+rsi]
+ movdqu xmm11,XMMWORD[((32 + 64))+rsi]
+ movdqu xmm15,XMMWORD[((48 + 64))+rsi]
+ pxor xmm1,xmm3
+ pxor xmm5,xmm7
+ pxor xmm9,xmm11
+ pxor xmm15,xmm13
+ movdqu XMMWORD[(0 + 64)+rdi],xmm1
+ movdqu XMMWORD[(16 + 64)+rdi],xmm5
+ movdqu XMMWORD[(32 + 64)+rdi],xmm9
+ movdqu XMMWORD[(48 + 64)+rdi],xmm15
+
+ cmp rbx,12*16
+ ja NEAR $L$seal_sse_main_init
+ mov rcx,8*16
+ sub rbx,8*16
+ lea rsi,[128+rsi]
+ jmp NEAR $L$seal_sse_128_tail_hash
+$L$seal_sse_main_init:
+ movdqu xmm3,XMMWORD[((0 + 128))+rsi]
+ movdqu xmm7,XMMWORD[((16 + 128))+rsi]
+ movdqu xmm11,XMMWORD[((32 + 128))+rsi]
+ movdqu xmm15,XMMWORD[((48 + 128))+rsi]
+ pxor xmm0,xmm3
+ pxor xmm4,xmm7
+ pxor xmm8,xmm11
+ pxor xmm15,xmm12
+ movdqu XMMWORD[(0 + 128)+rdi],xmm0
+ movdqu XMMWORD[(16 + 128)+rdi],xmm4
+ movdqu XMMWORD[(32 + 128)+rdi],xmm8
+ movdqu XMMWORD[(48 + 128)+rdi],xmm15
+
+ mov rcx,12*16
+ sub rbx,12*16
+ lea rsi,[192+rsi]
+ mov rcx,2
+ mov r8,8
+ cmp rbx,4*16
+ jbe NEAR $L$seal_sse_tail_64
+ cmp rbx,8*16
+ jbe NEAR $L$seal_sse_tail_128
+ cmp rbx,12*16
+ jbe NEAR $L$seal_sse_tail_192
+
+$L$seal_sse_main_loop:
+ movdqa xmm0,XMMWORD[$L$chacha20_consts]
+ movdqa xmm4,XMMWORD[((160+48))+rbp]
+ movdqa xmm8,XMMWORD[((160+64))+rbp]
+ movdqa xmm1,xmm0
+ movdqa xmm5,xmm4
+ movdqa xmm9,xmm8
+ movdqa xmm2,xmm0
+ movdqa xmm6,xmm4
+ movdqa xmm10,xmm8
+ movdqa xmm3,xmm0
+ movdqa xmm7,xmm4
+ movdqa xmm11,xmm8
+ movdqa xmm15,XMMWORD[((160+96))+rbp]
+ paddd xmm15,XMMWORD[$L$sse_inc]
+ movdqa xmm14,xmm15
+ paddd xmm14,XMMWORD[$L$sse_inc]
+ movdqa xmm13,xmm14
+ paddd xmm13,XMMWORD[$L$sse_inc]
+ movdqa xmm12,xmm13
+ paddd xmm12,XMMWORD[$L$sse_inc]
+ movdqa XMMWORD[(160+96)+rbp],xmm12
+ movdqa XMMWORD[(160+112)+rbp],xmm13
+ movdqa XMMWORD[(160+128)+rbp],xmm14
+ movdqa XMMWORD[(160+144)+rbp],xmm15
+
+ALIGN 32
+$L$seal_sse_main_rounds:
+ movdqa XMMWORD[(160+80)+rbp],xmm8
+ movdqa xmm8,XMMWORD[$L$rol16]
+ paddd xmm3,xmm7
+ paddd xmm2,xmm6
+ paddd xmm1,xmm5
+ paddd xmm0,xmm4
+ pxor xmm15,xmm3
+ pxor xmm14,xmm2
+ pxor xmm13,xmm1
+ pxor xmm12,xmm0
+DB 102,69,15,56,0,248
+DB 102,69,15,56,0,240
+DB 102,69,15,56,0,232
+DB 102,69,15,56,0,224
+ movdqa xmm8,XMMWORD[((160+80))+rbp]
+ paddd xmm11,xmm15
+ paddd xmm10,xmm14
+ paddd xmm9,xmm13
+ paddd xmm8,xmm12
+ pxor xmm7,xmm11
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ pxor xmm6,xmm10
+ pxor xmm5,xmm9
+ pxor xmm4,xmm8
+ movdqa XMMWORD[(160+80)+rbp],xmm8
+ movdqa xmm8,xmm7
+ psrld xmm8,20
+ pslld xmm7,32-20
+ pxor xmm7,xmm8
+ movdqa xmm8,xmm6
+ psrld xmm8,20
+ pslld xmm6,32-20
+ pxor xmm6,xmm8
+ movdqa xmm8,xmm5
+ psrld xmm8,20
+ pslld xmm5,32-20
+ pxor xmm5,xmm8
+ movdqa xmm8,xmm4
+ psrld xmm8,20
+ pslld xmm4,32-20
+ pxor xmm4,xmm8
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ movdqa xmm8,XMMWORD[$L$rol8]
+ paddd xmm3,xmm7
+ paddd xmm2,xmm6
+ paddd xmm1,xmm5
+ paddd xmm0,xmm4
+ pxor xmm15,xmm3
+ pxor xmm14,xmm2
+ pxor xmm13,xmm1
+ pxor xmm12,xmm0
+DB 102,69,15,56,0,248
+DB 102,69,15,56,0,240
+DB 102,69,15,56,0,232
+DB 102,69,15,56,0,224
+ movdqa xmm8,XMMWORD[((160+80))+rbp]
+ paddd xmm11,xmm15
+ paddd xmm10,xmm14
+ paddd xmm9,xmm13
+ paddd xmm8,xmm12
+ pxor xmm7,xmm11
+ pxor xmm6,xmm10
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ pxor xmm5,xmm9
+ pxor xmm4,xmm8
+ movdqa XMMWORD[(160+80)+rbp],xmm8
+ movdqa xmm8,xmm7
+ psrld xmm8,25
+ pslld xmm7,32-25
+ pxor xmm7,xmm8
+ movdqa xmm8,xmm6
+ psrld xmm8,25
+ pslld xmm6,32-25
+ pxor xmm6,xmm8
+ movdqa xmm8,xmm5
+ psrld xmm8,25
+ pslld xmm5,32-25
+ pxor xmm5,xmm8
+ movdqa xmm8,xmm4
+ psrld xmm8,25
+ pslld xmm4,32-25
+ pxor xmm4,xmm8
+ movdqa xmm8,XMMWORD[((160+80))+rbp]
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+DB 102,15,58,15,255,4
+DB 102,69,15,58,15,219,8
+DB 102,69,15,58,15,255,12
+DB 102,15,58,15,246,4
+DB 102,69,15,58,15,210,8
+DB 102,69,15,58,15,246,12
+DB 102,15,58,15,237,4
+DB 102,69,15,58,15,201,8
+DB 102,69,15,58,15,237,12
+DB 102,15,58,15,228,4
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,12
+ movdqa XMMWORD[(160+80)+rbp],xmm8
+ movdqa xmm8,XMMWORD[$L$rol16]
+ paddd xmm3,xmm7
+ paddd xmm2,xmm6
+ paddd xmm1,xmm5
+ paddd xmm0,xmm4
+ pxor xmm15,xmm3
+ pxor xmm14,xmm2
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ pxor xmm13,xmm1
+ pxor xmm12,xmm0
+DB 102,69,15,56,0,248
+DB 102,69,15,56,0,240
+DB 102,69,15,56,0,232
+DB 102,69,15,56,0,224
+ movdqa xmm8,XMMWORD[((160+80))+rbp]
+ paddd xmm11,xmm15
+ paddd xmm10,xmm14
+ paddd xmm9,xmm13
+ paddd xmm8,xmm12
+ pxor xmm7,xmm11
+ pxor xmm6,xmm10
+ pxor xmm5,xmm9
+ pxor xmm4,xmm8
+ movdqa XMMWORD[(160+80)+rbp],xmm8
+ movdqa xmm8,xmm7
+ psrld xmm8,20
+ pslld xmm7,32-20
+ pxor xmm7,xmm8
+ movdqa xmm8,xmm6
+ psrld xmm8,20
+ pslld xmm6,32-20
+ pxor xmm6,xmm8
+ movdqa xmm8,xmm5
+ psrld xmm8,20
+ pslld xmm5,32-20
+ pxor xmm5,xmm8
+ movdqa xmm8,xmm4
+ psrld xmm8,20
+ pslld xmm4,32-20
+ pxor xmm4,xmm8
+ movdqa xmm8,XMMWORD[$L$rol8]
+ paddd xmm3,xmm7
+ paddd xmm2,xmm6
+ paddd xmm1,xmm5
+ paddd xmm0,xmm4
+ pxor xmm15,xmm3
+ pxor xmm14,xmm2
+ pxor xmm13,xmm1
+ pxor xmm12,xmm0
+DB 102,69,15,56,0,248
+DB 102,69,15,56,0,240
+DB 102,69,15,56,0,232
+DB 102,69,15,56,0,224
+ movdqa xmm8,XMMWORD[((160+80))+rbp]
+ paddd xmm11,xmm15
+ paddd xmm10,xmm14
+ paddd xmm9,xmm13
+ paddd xmm8,xmm12
+ pxor xmm7,xmm11
+ pxor xmm6,xmm10
+ pxor xmm5,xmm9
+ pxor xmm4,xmm8
+ movdqa XMMWORD[(160+80)+rbp],xmm8
+ movdqa xmm8,xmm7
+ psrld xmm8,25
+ pslld xmm7,32-25
+ pxor xmm7,xmm8
+ movdqa xmm8,xmm6
+ psrld xmm8,25
+ pslld xmm6,32-25
+ pxor xmm6,xmm8
+ movdqa xmm8,xmm5
+ psrld xmm8,25
+ pslld xmm5,32-25
+ pxor xmm5,xmm8
+ movdqa xmm8,xmm4
+ psrld xmm8,25
+ pslld xmm4,32-25
+ pxor xmm4,xmm8
+ movdqa xmm8,XMMWORD[((160+80))+rbp]
+DB 102,15,58,15,255,12
+DB 102,69,15,58,15,219,8
+DB 102,69,15,58,15,255,4
+DB 102,15,58,15,246,12
+DB 102,69,15,58,15,210,8
+DB 102,69,15,58,15,246,4
+DB 102,15,58,15,237,12
+DB 102,69,15,58,15,201,8
+DB 102,69,15,58,15,237,4
+DB 102,15,58,15,228,12
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,4
+
+ lea rdi,[16+rdi]
+ dec r8
+ jge NEAR $L$seal_sse_main_rounds
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rdi,[16+rdi]
+ dec rcx
+ jg NEAR $L$seal_sse_main_rounds
+ paddd xmm3,XMMWORD[$L$chacha20_consts]
+ paddd xmm7,XMMWORD[((160+48))+rbp]
+ paddd xmm11,XMMWORD[((160+64))+rbp]
+ paddd xmm15,XMMWORD[((160+144))+rbp]
+ paddd xmm2,XMMWORD[$L$chacha20_consts]
+ paddd xmm6,XMMWORD[((160+48))+rbp]
+ paddd xmm10,XMMWORD[((160+64))+rbp]
+ paddd xmm14,XMMWORD[((160+128))+rbp]
+ paddd xmm1,XMMWORD[$L$chacha20_consts]
+ paddd xmm5,XMMWORD[((160+48))+rbp]
+ paddd xmm9,XMMWORD[((160+64))+rbp]
+ paddd xmm13,XMMWORD[((160+112))+rbp]
+ paddd xmm0,XMMWORD[$L$chacha20_consts]
+ paddd xmm4,XMMWORD[((160+48))+rbp]
+ paddd xmm8,XMMWORD[((160+64))+rbp]
+ paddd xmm12,XMMWORD[((160+96))+rbp]
+
+ movdqa XMMWORD[(160+80)+rbp],xmm14
+ movdqa XMMWORD[(160+80)+rbp],xmm14
+ movdqu xmm14,XMMWORD[((0 + 0))+rsi]
+ pxor xmm14,xmm3
+ movdqu XMMWORD[(0 + 0)+rdi],xmm14
+ movdqu xmm14,XMMWORD[((16 + 0))+rsi]
+ pxor xmm14,xmm7
+ movdqu XMMWORD[(16 + 0)+rdi],xmm14
+ movdqu xmm14,XMMWORD[((32 + 0))+rsi]
+ pxor xmm14,xmm11
+ movdqu XMMWORD[(32 + 0)+rdi],xmm14
+ movdqu xmm14,XMMWORD[((48 + 0))+rsi]
+ pxor xmm14,xmm15
+ movdqu XMMWORD[(48 + 0)+rdi],xmm14
+
+ movdqa xmm14,XMMWORD[((160+80))+rbp]
+ movdqu xmm3,XMMWORD[((0 + 64))+rsi]
+ movdqu xmm7,XMMWORD[((16 + 64))+rsi]
+ movdqu xmm11,XMMWORD[((32 + 64))+rsi]
+ movdqu xmm15,XMMWORD[((48 + 64))+rsi]
+ pxor xmm2,xmm3
+ pxor xmm6,xmm7
+ pxor xmm10,xmm11
+ pxor xmm15,xmm14
+ movdqu XMMWORD[(0 + 64)+rdi],xmm2
+ movdqu XMMWORD[(16 + 64)+rdi],xmm6
+ movdqu XMMWORD[(32 + 64)+rdi],xmm10
+ movdqu XMMWORD[(48 + 64)+rdi],xmm15
+ movdqu xmm3,XMMWORD[((0 + 128))+rsi]
+ movdqu xmm7,XMMWORD[((16 + 128))+rsi]
+ movdqu xmm11,XMMWORD[((32 + 128))+rsi]
+ movdqu xmm15,XMMWORD[((48 + 128))+rsi]
+ pxor xmm1,xmm3
+ pxor xmm5,xmm7
+ pxor xmm9,xmm11
+ pxor xmm15,xmm13
+ movdqu XMMWORD[(0 + 128)+rdi],xmm1
+ movdqu XMMWORD[(16 + 128)+rdi],xmm5
+ movdqu XMMWORD[(32 + 128)+rdi],xmm9
+ movdqu XMMWORD[(48 + 128)+rdi],xmm15
+
+ cmp rbx,16*16
+ ja NEAR $L$seal_sse_main_loop_xor
+
+ mov rcx,12*16
+ sub rbx,12*16
+ lea rsi,[192+rsi]
+ jmp NEAR $L$seal_sse_128_tail_hash
+$L$seal_sse_main_loop_xor:
+ movdqu xmm3,XMMWORD[((0 + 192))+rsi]
+ movdqu xmm7,XMMWORD[((16 + 192))+rsi]
+ movdqu xmm11,XMMWORD[((32 + 192))+rsi]
+ movdqu xmm15,XMMWORD[((48 + 192))+rsi]
+ pxor xmm0,xmm3
+ pxor xmm4,xmm7
+ pxor xmm8,xmm11
+ pxor xmm15,xmm12
+ movdqu XMMWORD[(0 + 192)+rdi],xmm0
+ movdqu XMMWORD[(16 + 192)+rdi],xmm4
+ movdqu XMMWORD[(32 + 192)+rdi],xmm8
+ movdqu XMMWORD[(48 + 192)+rdi],xmm15
+
+ lea rsi,[256+rsi]
+ sub rbx,16*16
+ mov rcx,6
+ mov r8,4
+ cmp rbx,12*16
+ jg NEAR $L$seal_sse_main_loop
+ mov rcx,rbx
+ test rbx,rbx
+ je NEAR $L$seal_sse_128_tail_hash
+ mov rcx,6
+ cmp rbx,8*16
+ ja NEAR $L$seal_sse_tail_192
+ cmp rbx,4*16
+ ja NEAR $L$seal_sse_tail_128
+
+$L$seal_sse_tail_64:
+ movdqa xmm0,XMMWORD[$L$chacha20_consts]
+ movdqa xmm4,XMMWORD[((160+48))+rbp]
+ movdqa xmm8,XMMWORD[((160+64))+rbp]
+ movdqa xmm12,XMMWORD[((160+96))+rbp]
+ paddd xmm12,XMMWORD[$L$sse_inc]
+ movdqa XMMWORD[(160+96)+rbp],xmm12
+
+$L$seal_sse_tail_64_rounds_and_x2hash:
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rdi,[16+rdi]
+$L$seal_sse_tail_64_rounds_and_x1hash:
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol16]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,12
+ psrld xmm4,20
+ pxor xmm4,xmm3
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol8]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,7
+ psrld xmm4,25
+ pxor xmm4,xmm3
+DB 102,15,58,15,228,4
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,12
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol16]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,12
+ psrld xmm4,20
+ pxor xmm4,xmm3
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol8]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,7
+ psrld xmm4,25
+ pxor xmm4,xmm3
+DB 102,15,58,15,228,12
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,4
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rdi,[16+rdi]
+ dec rcx
+ jg NEAR $L$seal_sse_tail_64_rounds_and_x2hash
+ dec r8
+ jge NEAR $L$seal_sse_tail_64_rounds_and_x1hash
+ paddd xmm0,XMMWORD[$L$chacha20_consts]
+ paddd xmm4,XMMWORD[((160+48))+rbp]
+ paddd xmm8,XMMWORD[((160+64))+rbp]
+ paddd xmm12,XMMWORD[((160+96))+rbp]
+
+ jmp NEAR $L$seal_sse_128_tail_xor
+
+$L$seal_sse_tail_128:
+ movdqa xmm0,XMMWORD[$L$chacha20_consts]
+ movdqa xmm4,XMMWORD[((160+48))+rbp]
+ movdqa xmm8,XMMWORD[((160+64))+rbp]
+ movdqa xmm1,xmm0
+ movdqa xmm5,xmm4
+ movdqa xmm9,xmm8
+ movdqa xmm13,XMMWORD[((160+96))+rbp]
+ paddd xmm13,XMMWORD[$L$sse_inc]
+ movdqa xmm12,xmm13
+ paddd xmm12,XMMWORD[$L$sse_inc]
+ movdqa XMMWORD[(160+96)+rbp],xmm12
+ movdqa XMMWORD[(160+112)+rbp],xmm13
+
+$L$seal_sse_tail_128_rounds_and_x2hash:
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rdi,[16+rdi]
+$L$seal_sse_tail_128_rounds_and_x1hash:
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol16]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,12
+ psrld xmm4,20
+ pxor xmm4,xmm3
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol8]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,7
+ psrld xmm4,25
+ pxor xmm4,xmm3
+DB 102,15,58,15,228,4
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,12
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol16]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,12
+ psrld xmm5,20
+ pxor xmm5,xmm3
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol8]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,7
+ psrld xmm5,25
+ pxor xmm5,xmm3
+DB 102,15,58,15,237,4
+DB 102,69,15,58,15,201,8
+DB 102,69,15,58,15,237,12
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol16]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,12
+ psrld xmm4,20
+ pxor xmm4,xmm3
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol8]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,7
+ psrld xmm4,25
+ pxor xmm4,xmm3
+DB 102,15,58,15,228,12
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,4
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol16]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,12
+ psrld xmm5,20
+ pxor xmm5,xmm3
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol8]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,7
+ psrld xmm5,25
+ pxor xmm5,xmm3
+DB 102,15,58,15,237,12
+DB 102,69,15,58,15,201,8
+DB 102,69,15,58,15,237,4
+
+ lea rdi,[16+rdi]
+ dec rcx
+ jg NEAR $L$seal_sse_tail_128_rounds_and_x2hash
+ dec r8
+ jge NEAR $L$seal_sse_tail_128_rounds_and_x1hash
+ paddd xmm1,XMMWORD[$L$chacha20_consts]
+ paddd xmm5,XMMWORD[((160+48))+rbp]
+ paddd xmm9,XMMWORD[((160+64))+rbp]
+ paddd xmm13,XMMWORD[((160+112))+rbp]
+ paddd xmm0,XMMWORD[$L$chacha20_consts]
+ paddd xmm4,XMMWORD[((160+48))+rbp]
+ paddd xmm8,XMMWORD[((160+64))+rbp]
+ paddd xmm12,XMMWORD[((160+96))+rbp]
+ movdqu xmm3,XMMWORD[((0 + 0))+rsi]
+ movdqu xmm7,XMMWORD[((16 + 0))+rsi]
+ movdqu xmm11,XMMWORD[((32 + 0))+rsi]
+ movdqu xmm15,XMMWORD[((48 + 0))+rsi]
+ pxor xmm1,xmm3
+ pxor xmm5,xmm7
+ pxor xmm9,xmm11
+ pxor xmm15,xmm13
+ movdqu XMMWORD[(0 + 0)+rdi],xmm1
+ movdqu XMMWORD[(16 + 0)+rdi],xmm5
+ movdqu XMMWORD[(32 + 0)+rdi],xmm9
+ movdqu XMMWORD[(48 + 0)+rdi],xmm15
+
+ mov rcx,4*16
+ sub rbx,4*16
+ lea rsi,[64+rsi]
+ jmp NEAR $L$seal_sse_128_tail_hash
+
+$L$seal_sse_tail_192:
+ movdqa xmm0,XMMWORD[$L$chacha20_consts]
+ movdqa xmm4,XMMWORD[((160+48))+rbp]
+ movdqa xmm8,XMMWORD[((160+64))+rbp]
+ movdqa xmm1,xmm0
+ movdqa xmm5,xmm4
+ movdqa xmm9,xmm8
+ movdqa xmm2,xmm0
+ movdqa xmm6,xmm4
+ movdqa xmm10,xmm8
+ movdqa xmm14,XMMWORD[((160+96))+rbp]
+ paddd xmm14,XMMWORD[$L$sse_inc]
+ movdqa xmm13,xmm14
+ paddd xmm13,XMMWORD[$L$sse_inc]
+ movdqa xmm12,xmm13
+ paddd xmm12,XMMWORD[$L$sse_inc]
+ movdqa XMMWORD[(160+96)+rbp],xmm12
+ movdqa XMMWORD[(160+112)+rbp],xmm13
+ movdqa XMMWORD[(160+128)+rbp],xmm14
+
+$L$seal_sse_tail_192_rounds_and_x2hash:
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rdi,[16+rdi]
+$L$seal_sse_tail_192_rounds_and_x1hash:
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol16]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,12
+ psrld xmm4,20
+ pxor xmm4,xmm3
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol8]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,7
+ psrld xmm4,25
+ pxor xmm4,xmm3
+DB 102,15,58,15,228,4
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,12
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol16]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,12
+ psrld xmm5,20
+ pxor xmm5,xmm3
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol8]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,7
+ psrld xmm5,25
+ pxor xmm5,xmm3
+DB 102,15,58,15,237,4
+DB 102,69,15,58,15,201,8
+DB 102,69,15,58,15,237,12
+ paddd xmm2,xmm6
+ pxor xmm14,xmm2
+ pshufb xmm14,XMMWORD[$L$rol16]
+ paddd xmm10,xmm14
+ pxor xmm6,xmm10
+ movdqa xmm3,xmm6
+ pslld xmm3,12
+ psrld xmm6,20
+ pxor xmm6,xmm3
+ paddd xmm2,xmm6
+ pxor xmm14,xmm2
+ pshufb xmm14,XMMWORD[$L$rol8]
+ paddd xmm10,xmm14
+ pxor xmm6,xmm10
+ movdqa xmm3,xmm6
+ pslld xmm3,7
+ psrld xmm6,25
+ pxor xmm6,xmm3
+DB 102,15,58,15,246,4
+DB 102,69,15,58,15,210,8
+DB 102,69,15,58,15,246,12
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol16]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,12
+ psrld xmm4,20
+ pxor xmm4,xmm3
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol8]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,7
+ psrld xmm4,25
+ pxor xmm4,xmm3
+DB 102,15,58,15,228,12
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,4
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol16]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,12
+ psrld xmm5,20
+ pxor xmm5,xmm3
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol8]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,7
+ psrld xmm5,25
+ pxor xmm5,xmm3
+DB 102,15,58,15,237,12
+DB 102,69,15,58,15,201,8
+DB 102,69,15,58,15,237,4
+ paddd xmm2,xmm6
+ pxor xmm14,xmm2
+ pshufb xmm14,XMMWORD[$L$rol16]
+ paddd xmm10,xmm14
+ pxor xmm6,xmm10
+ movdqa xmm3,xmm6
+ pslld xmm3,12
+ psrld xmm6,20
+ pxor xmm6,xmm3
+ paddd xmm2,xmm6
+ pxor xmm14,xmm2
+ pshufb xmm14,XMMWORD[$L$rol8]
+ paddd xmm10,xmm14
+ pxor xmm6,xmm10
+ movdqa xmm3,xmm6
+ pslld xmm3,7
+ psrld xmm6,25
+ pxor xmm6,xmm3
+DB 102,15,58,15,246,12
+DB 102,69,15,58,15,210,8
+DB 102,69,15,58,15,246,4
+
+ lea rdi,[16+rdi]
+ dec rcx
+ jg NEAR $L$seal_sse_tail_192_rounds_and_x2hash
+ dec r8
+ jge NEAR $L$seal_sse_tail_192_rounds_and_x1hash
+ paddd xmm2,XMMWORD[$L$chacha20_consts]
+ paddd xmm6,XMMWORD[((160+48))+rbp]
+ paddd xmm10,XMMWORD[((160+64))+rbp]
+ paddd xmm14,XMMWORD[((160+128))+rbp]
+ paddd xmm1,XMMWORD[$L$chacha20_consts]
+ paddd xmm5,XMMWORD[((160+48))+rbp]
+ paddd xmm9,XMMWORD[((160+64))+rbp]
+ paddd xmm13,XMMWORD[((160+112))+rbp]
+ paddd xmm0,XMMWORD[$L$chacha20_consts]
+ paddd xmm4,XMMWORD[((160+48))+rbp]
+ paddd xmm8,XMMWORD[((160+64))+rbp]
+ paddd xmm12,XMMWORD[((160+96))+rbp]
+ movdqu xmm3,XMMWORD[((0 + 0))+rsi]
+ movdqu xmm7,XMMWORD[((16 + 0))+rsi]
+ movdqu xmm11,XMMWORD[((32 + 0))+rsi]
+ movdqu xmm15,XMMWORD[((48 + 0))+rsi]
+ pxor xmm2,xmm3
+ pxor xmm6,xmm7
+ pxor xmm10,xmm11
+ pxor xmm15,xmm14
+ movdqu XMMWORD[(0 + 0)+rdi],xmm2
+ movdqu XMMWORD[(16 + 0)+rdi],xmm6
+ movdqu XMMWORD[(32 + 0)+rdi],xmm10
+ movdqu XMMWORD[(48 + 0)+rdi],xmm15
+ movdqu xmm3,XMMWORD[((0 + 64))+rsi]
+ movdqu xmm7,XMMWORD[((16 + 64))+rsi]
+ movdqu xmm11,XMMWORD[((32 + 64))+rsi]
+ movdqu xmm15,XMMWORD[((48 + 64))+rsi]
+ pxor xmm1,xmm3
+ pxor xmm5,xmm7
+ pxor xmm9,xmm11
+ pxor xmm15,xmm13
+ movdqu XMMWORD[(0 + 64)+rdi],xmm1
+ movdqu XMMWORD[(16 + 64)+rdi],xmm5
+ movdqu XMMWORD[(32 + 64)+rdi],xmm9
+ movdqu XMMWORD[(48 + 64)+rdi],xmm15
+
+ mov rcx,8*16
+ sub rbx,8*16
+ lea rsi,[128+rsi]
+
+$L$seal_sse_128_tail_hash:
+ cmp rcx,16
+ jb NEAR $L$seal_sse_128_tail_xor
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ sub rcx,16
+ lea rdi,[16+rdi]
+ jmp NEAR $L$seal_sse_128_tail_hash
+
+$L$seal_sse_128_tail_xor:
+ cmp rbx,16
+ jb NEAR $L$seal_sse_tail_16
+ sub rbx,16
+
+ movdqu xmm3,XMMWORD[rsi]
+ pxor xmm0,xmm3
+ movdqu XMMWORD[rdi],xmm0
+
+ add r10,QWORD[rdi]
+ adc r11,QWORD[8+rdi]
+ adc r12,1
+ lea rsi,[16+rsi]
+ lea rdi,[16+rdi]
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+
+ movdqa xmm0,xmm4
+ movdqa xmm4,xmm8
+ movdqa xmm8,xmm12
+ movdqa xmm12,xmm1
+ movdqa xmm1,xmm5
+ movdqa xmm5,xmm9
+ movdqa xmm9,xmm13
+ jmp NEAR $L$seal_sse_128_tail_xor
+
+$L$seal_sse_tail_16:
+ test rbx,rbx
+ jz NEAR $L$process_blocks_of_extra_in
+
+ mov r8,rbx
+ mov rcx,rbx
+ lea rsi,[((-1))+rbx*1+rsi]
+ pxor xmm15,xmm15
+$L$seal_sse_tail_16_compose:
+ pslldq xmm15,1
+ pinsrb xmm15,BYTE[rsi],0
+ lea rsi,[((-1))+rsi]
+ dec rcx
+ jne NEAR $L$seal_sse_tail_16_compose
+
+
+ pxor xmm15,xmm0
+
+
+ mov rcx,rbx
+ movdqu xmm0,xmm15
+$L$seal_sse_tail_16_extract:
+ pextrb XMMWORD[rdi],xmm0,0
+ psrldq xmm0,1
+ add rdi,1
+ sub rcx,1
+ jnz NEAR $L$seal_sse_tail_16_extract
+
+
+
+
+
+
+
+
+ mov r9,QWORD[((288 + 160 + 32))+rsp]
+ mov r14,QWORD[56+r9]
+ mov r13,QWORD[48+r9]
+ test r14,r14
+ jz NEAR $L$process_partial_block
+
+ mov r15,16
+ sub r15,rbx
+ cmp r14,r15
+
+ jge NEAR $L$load_extra_in
+ mov r15,r14
+
+$L$load_extra_in:
+
+
+ lea rsi,[((-1))+r15*1+r13]
+
+
+ add r13,r15
+ sub r14,r15
+ mov QWORD[48+r9],r13
+ mov QWORD[56+r9],r14
+
+
+
+ add r8,r15
+
+
+ pxor xmm11,xmm11
+$L$load_extra_load_loop:
+ pslldq xmm11,1
+ pinsrb xmm11,BYTE[rsi],0
+ lea rsi,[((-1))+rsi]
+ sub r15,1
+ jnz NEAR $L$load_extra_load_loop
+
+
+
+
+ mov r15,rbx
+
+$L$load_extra_shift_loop:
+ pslldq xmm11,1
+ sub r15,1
+ jnz NEAR $L$load_extra_shift_loop
+
+
+
+
+ lea r15,[$L$and_masks]
+ shl rbx,4
+ pand xmm15,XMMWORD[((-16))+rbx*1+r15]
+
+
+ por xmm15,xmm11
+
+
+
+DB 102,77,15,126,253
+ pextrq r14,xmm15,1
+ add r10,r13
+ adc r11,r14
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+
+$L$process_blocks_of_extra_in:
+
+ mov r9,QWORD[((288+32+160 ))+rsp]
+ mov rsi,QWORD[48+r9]
+ mov r8,QWORD[56+r9]
+ mov rcx,r8
+ shr r8,4
+
+$L$process_extra_hash_loop:
+ jz NEAR process_extra_in_trailer
+ add r10,QWORD[((0+0))+rsi]
+ adc r11,QWORD[((8+0))+rsi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rsi,[16+rsi]
+ sub r8,1
+ jmp NEAR $L$process_extra_hash_loop
+process_extra_in_trailer:
+ and rcx,15
+ mov rbx,rcx
+ jz NEAR $L$do_length_block
+ lea rsi,[((-1))+rcx*1+rsi]
+
+$L$process_extra_in_trailer_load:
+ pslldq xmm15,1
+ pinsrb xmm15,BYTE[rsi],0
+ lea rsi,[((-1))+rsi]
+ sub rcx,1
+ jnz NEAR $L$process_extra_in_trailer_load
+
+$L$process_partial_block:
+
+ lea r15,[$L$and_masks]
+ shl rbx,4
+ pand xmm15,XMMWORD[((-16))+rbx*1+r15]
+DB 102,77,15,126,253
+ pextrq r14,xmm15,1
+ add r10,r13
+ adc r11,r14
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+
+$L$do_length_block:
+ add r10,QWORD[((0+160+32))+rbp]
+ adc r11,QWORD[((8+160+32))+rbp]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+
+ mov r13,r10
+ mov r14,r11
+ mov r15,r12
+ sub r10,-5
+ sbb r11,-1
+ sbb r12,3
+ cmovc r10,r13
+ cmovc r11,r14
+ cmovc r12,r15
+
+ add r10,QWORD[((0+160+16))+rbp]
+ adc r11,QWORD[((8+160+16))+rbp]
+
+ movaps xmm6,XMMWORD[((0+0))+rbp]
+ movaps xmm7,XMMWORD[((16+0))+rbp]
+ movaps xmm8,XMMWORD[((32+0))+rbp]
+ movaps xmm9,XMMWORD[((48+0))+rbp]
+ movaps xmm10,XMMWORD[((64+0))+rbp]
+ movaps xmm11,XMMWORD[((80+0))+rbp]
+ movaps xmm12,XMMWORD[((96+0))+rbp]
+ movaps xmm13,XMMWORD[((112+0))+rbp]
+ movaps xmm14,XMMWORD[((128+0))+rbp]
+ movaps xmm15,XMMWORD[((144+0))+rbp]
+
+
+ add rsp,288 + 160 + 32
+
+
+ pop r9
+
+ mov QWORD[r9],r10
+ mov QWORD[8+r9],r11
+ pop r15
+
+ pop r14
+
+ pop r13
+
+ pop r12
+
+ pop rbx
+
+ pop rbp
+
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ ret
+
+$L$seal_sse_128:
+
+ movdqu xmm0,XMMWORD[$L$chacha20_consts]
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm0
+ movdqu xmm4,XMMWORD[r9]
+ movdqa xmm5,xmm4
+ movdqa xmm6,xmm4
+ movdqu xmm8,XMMWORD[16+r9]
+ movdqa xmm9,xmm8
+ movdqa xmm10,xmm8
+ movdqu xmm14,XMMWORD[32+r9]
+ movdqa xmm12,xmm14
+ paddd xmm12,XMMWORD[$L$sse_inc]
+ movdqa xmm13,xmm12
+ paddd xmm13,XMMWORD[$L$sse_inc]
+ movdqa xmm7,xmm4
+ movdqa xmm11,xmm8
+ movdqa xmm15,xmm12
+ mov r10,10
+
+$L$seal_sse_128_rounds:
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol16]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,12
+ psrld xmm4,20
+ pxor xmm4,xmm3
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol8]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,7
+ psrld xmm4,25
+ pxor xmm4,xmm3
+DB 102,15,58,15,228,4
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,12
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol16]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,12
+ psrld xmm5,20
+ pxor xmm5,xmm3
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol8]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,7
+ psrld xmm5,25
+ pxor xmm5,xmm3
+DB 102,15,58,15,237,4
+DB 102,69,15,58,15,201,8
+DB 102,69,15,58,15,237,12
+ paddd xmm2,xmm6
+ pxor xmm14,xmm2
+ pshufb xmm14,XMMWORD[$L$rol16]
+ paddd xmm10,xmm14
+ pxor xmm6,xmm10
+ movdqa xmm3,xmm6
+ pslld xmm3,12
+ psrld xmm6,20
+ pxor xmm6,xmm3
+ paddd xmm2,xmm6
+ pxor xmm14,xmm2
+ pshufb xmm14,XMMWORD[$L$rol8]
+ paddd xmm10,xmm14
+ pxor xmm6,xmm10
+ movdqa xmm3,xmm6
+ pslld xmm3,7
+ psrld xmm6,25
+ pxor xmm6,xmm3
+DB 102,15,58,15,246,4
+DB 102,69,15,58,15,210,8
+DB 102,69,15,58,15,246,12
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol16]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,12
+ psrld xmm4,20
+ pxor xmm4,xmm3
+ paddd xmm0,xmm4
+ pxor xmm12,xmm0
+ pshufb xmm12,XMMWORD[$L$rol8]
+ paddd xmm8,xmm12
+ pxor xmm4,xmm8
+ movdqa xmm3,xmm4
+ pslld xmm3,7
+ psrld xmm4,25
+ pxor xmm4,xmm3
+DB 102,15,58,15,228,12
+DB 102,69,15,58,15,192,8
+DB 102,69,15,58,15,228,4
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol16]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,12
+ psrld xmm5,20
+ pxor xmm5,xmm3
+ paddd xmm1,xmm5
+ pxor xmm13,xmm1
+ pshufb xmm13,XMMWORD[$L$rol8]
+ paddd xmm9,xmm13
+ pxor xmm5,xmm9
+ movdqa xmm3,xmm5
+ pslld xmm3,7
+ psrld xmm5,25
+ pxor xmm5,xmm3
+DB 102,15,58,15,237,12
+DB 102,69,15,58,15,201,8
+DB 102,69,15,58,15,237,4
+ paddd xmm2,xmm6
+ pxor xmm14,xmm2
+ pshufb xmm14,XMMWORD[$L$rol16]
+ paddd xmm10,xmm14
+ pxor xmm6,xmm10
+ movdqa xmm3,xmm6
+ pslld xmm3,12
+ psrld xmm6,20
+ pxor xmm6,xmm3
+ paddd xmm2,xmm6
+ pxor xmm14,xmm2
+ pshufb xmm14,XMMWORD[$L$rol8]
+ paddd xmm10,xmm14
+ pxor xmm6,xmm10
+ movdqa xmm3,xmm6
+ pslld xmm3,7
+ psrld xmm6,25
+ pxor xmm6,xmm3
+DB 102,15,58,15,246,12
+DB 102,69,15,58,15,210,8
+DB 102,69,15,58,15,246,4
+
+ dec r10
+ jnz NEAR $L$seal_sse_128_rounds
+ paddd xmm0,XMMWORD[$L$chacha20_consts]
+ paddd xmm1,XMMWORD[$L$chacha20_consts]
+ paddd xmm2,XMMWORD[$L$chacha20_consts]
+ paddd xmm4,xmm7
+ paddd xmm5,xmm7
+ paddd xmm6,xmm7
+ paddd xmm8,xmm11
+ paddd xmm9,xmm11
+ paddd xmm12,xmm15
+ paddd xmm15,XMMWORD[$L$sse_inc]
+ paddd xmm13,xmm15
+
+ pand xmm2,XMMWORD[$L$clamp]
+ movdqa XMMWORD[(160+0)+rbp],xmm2
+ movdqa XMMWORD[(160+16)+rbp],xmm6
+
+ mov r8,r8
+ call poly_hash_ad_internal
+ jmp NEAR $L$seal_sse_128_tail_xor
+$L$SEH_end_chacha20_poly1305_seal:
+
+
+
+
+ALIGN 64
+chacha20_poly1305_open_avx2:
+
+
+
+
+
+
+
+
+
+
+
+
+ vzeroupper
+ vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
+ vbroadcasti128 ymm4,XMMWORD[r9]
+ vbroadcasti128 ymm8,XMMWORD[16+r9]
+ vbroadcasti128 ymm12,XMMWORD[32+r9]
+ vpaddd ymm12,ymm12,YMMWORD[$L$avx2_init]
+ cmp rbx,6*32
+ jbe NEAR $L$open_avx2_192
+ cmp rbx,10*32
+ jbe NEAR $L$open_avx2_320
+
+ vmovdqa YMMWORD[(160+64)+rbp],ymm4
+ vmovdqa YMMWORD[(160+96)+rbp],ymm8
+ vmovdqa YMMWORD[(160+160)+rbp],ymm12
+ mov r10,10
+$L$open_avx2_init_rounds:
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,12
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,4
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,4
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,12
+
+ dec r10
+ jne NEAR $L$open_avx2_init_rounds
+ vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
+ vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
+ vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
+
+ vperm2i128 ymm3,ymm4,ymm0,0x02
+
+ vpand ymm3,ymm3,YMMWORD[$L$clamp]
+ vmovdqa YMMWORD[(160+0)+rbp],ymm3
+
+ vperm2i128 ymm0,ymm4,ymm0,0x13
+ vperm2i128 ymm4,ymm12,ymm8,0x13
+
+ mov r8,r8
+ call poly_hash_ad_internal
+
+ xor rcx,rcx
+$L$open_avx2_init_hash:
+ add r10,QWORD[((0+0))+rcx*1+rsi]
+ adc r11,QWORD[((8+0))+rcx*1+rsi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ add rcx,16
+ cmp rcx,2*32
+ jne NEAR $L$open_avx2_init_hash
+
+ vpxor ymm0,ymm0,YMMWORD[rsi]
+ vpxor ymm4,ymm4,YMMWORD[32+rsi]
+
+ vmovdqu YMMWORD[rdi],ymm0
+ vmovdqu YMMWORD[32+rdi],ymm4
+ lea rsi,[64+rsi]
+ lea rdi,[64+rdi]
+ sub rbx,2*32
+$L$open_avx2_main_loop:
+
+ cmp rbx,16*32
+ jb NEAR $L$open_avx2_main_loop_done
+ vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
+ vmovdqa ymm4,YMMWORD[((160+64))+rbp]
+ vmovdqa ymm8,YMMWORD[((160+96))+rbp]
+ vmovdqa ymm1,ymm0
+ vmovdqa ymm5,ymm4
+ vmovdqa ymm9,ymm8
+ vmovdqa ymm2,ymm0
+ vmovdqa ymm6,ymm4
+ vmovdqa ymm10,ymm8
+ vmovdqa ymm3,ymm0
+ vmovdqa ymm7,ymm4
+ vmovdqa ymm11,ymm8
+ vmovdqa ymm12,YMMWORD[$L$avx2_inc]
+ vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp]
+ vpaddd ymm14,ymm12,ymm15
+ vpaddd ymm13,ymm12,ymm14
+ vpaddd ymm12,ymm12,ymm13
+ vmovdqa YMMWORD[(160+256)+rbp],ymm15
+ vmovdqa YMMWORD[(160+224)+rbp],ymm14
+ vmovdqa YMMWORD[(160+192)+rbp],ymm13
+ vmovdqa YMMWORD[(160+160)+rbp],ymm12
+
+ xor rcx,rcx
+$L$open_avx2_main_loop_rounds:
+ add r10,QWORD[((0+0))+rcx*1+rsi]
+ adc r11,QWORD[((8+0))+rcx*1+rsi]
+ adc r12,1
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol16]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,20
+ vpslld ymm7,ymm7,32-20
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,20
+ vpslld ymm6,ymm6,32-20
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,20
+ vpslld ymm5,ymm5,32-20
+ add r15,rax
+ adc r9,rdx
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,20
+ vpslld ymm4,ymm4,32-20
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol8]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ add r10,QWORD[((0+16))+rcx*1+rsi]
+ adc r11,QWORD[((8+16))+rcx*1+rsi]
+ adc r12,1
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,25
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ vpslld ymm7,ymm7,32-25
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,25
+ vpslld ymm6,ymm6,32-25
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,25
+ vpslld ymm5,ymm5,32-25
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,25
+ vpslld ymm4,ymm4,32-25
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[((160+128))+rbp]
+ vpalignr ymm7,ymm7,ymm7,4
+ vpalignr ymm11,ymm11,ymm11,8
+ vpalignr ymm15,ymm15,ymm15,12
+ vpalignr ymm6,ymm6,ymm6,4
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm14,ymm14,ymm14,12
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ vpalignr ymm5,ymm5,ymm5,4
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm13,ymm13,ymm13,12
+ vpalignr ymm4,ymm4,ymm4,4
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm12,ymm12,ymm12,12
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol16]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ add r15,rax
+ adc r9,rdx
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,20
+ vpslld ymm7,ymm7,32-20
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,20
+ vpslld ymm6,ymm6,32-20
+ vpxor ymm6,ymm6,ymm8
+ add r10,QWORD[((0+32))+rcx*1+rsi]
+ adc r11,QWORD[((8+32))+rcx*1+rsi]
+ adc r12,1
+
+ lea rcx,[48+rcx]
+ vpsrld ymm8,ymm5,20
+ vpslld ymm5,ymm5,32-20
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,20
+ vpslld ymm4,ymm4,32-20
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol8]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,25
+ vpslld ymm7,ymm7,32-25
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,25
+ vpslld ymm6,ymm6,32-25
+ vpxor ymm6,ymm6,ymm8
+ add r15,rax
+ adc r9,rdx
+ vpsrld ymm8,ymm5,25
+ vpslld ymm5,ymm5,32-25
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,25
+ vpslld ymm4,ymm4,32-25
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[((160+128))+rbp]
+ vpalignr ymm7,ymm7,ymm7,12
+ vpalignr ymm11,ymm11,ymm11,8
+ vpalignr ymm15,ymm15,ymm15,4
+ vpalignr ymm6,ymm6,ymm6,12
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm14,ymm14,ymm14,4
+ vpalignr ymm5,ymm5,ymm5,12
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm13,ymm13,ymm13,4
+ vpalignr ymm4,ymm4,ymm4,12
+ vpalignr ymm8,ymm8,ymm8,8
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ vpalignr ymm12,ymm12,ymm12,4
+
+ cmp rcx,10*6*8
+ jne NEAR $L$open_avx2_main_loop_rounds
+ vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp]
+ vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp]
+ vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp]
+ vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp]
+ vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp]
+ vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp]
+ vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp]
+ vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp]
+ vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
+ vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
+ vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
+ vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
+
+ vmovdqa YMMWORD[(160+128)+rbp],ymm0
+ add r10,QWORD[((0+480))+rsi]
+ adc r11,QWORD[((8+480))+rsi]
+ adc r12,1
+ vperm2i128 ymm0,ymm7,ymm3,0x02
+ vperm2i128 ymm7,ymm7,ymm3,0x13
+ vperm2i128 ymm3,ymm15,ymm11,0x02
+ vperm2i128 ymm11,ymm15,ymm11,0x13
+ vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi]
+ vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi]
+ vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi]
+ vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi]
+ vmovdqu YMMWORD[(0+0)+rdi],ymm0
+ vmovdqu YMMWORD[(32+0)+rdi],ymm3
+ vmovdqu YMMWORD[(64+0)+rdi],ymm7
+ vmovdqu YMMWORD[(96+0)+rdi],ymm11
+
+ vmovdqa ymm0,YMMWORD[((160+128))+rbp]
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ vperm2i128 ymm3,ymm6,ymm2,0x02
+ vperm2i128 ymm6,ymm6,ymm2,0x13
+ vperm2i128 ymm2,ymm14,ymm10,0x02
+ vperm2i128 ymm10,ymm14,ymm10,0x13
+ vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi]
+ vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi]
+ vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi]
+ vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi]
+ vmovdqu YMMWORD[(0+128)+rdi],ymm3
+ vmovdqu YMMWORD[(32+128)+rdi],ymm2
+ vmovdqu YMMWORD[(64+128)+rdi],ymm6
+ vmovdqu YMMWORD[(96+128)+rdi],ymm10
+ add r10,QWORD[((0+480+16))+rsi]
+ adc r11,QWORD[((8+480+16))+rsi]
+ adc r12,1
+ vperm2i128 ymm3,ymm5,ymm1,0x02
+ vperm2i128 ymm5,ymm5,ymm1,0x13
+ vperm2i128 ymm1,ymm13,ymm9,0x02
+ vperm2i128 ymm9,ymm13,ymm9,0x13
+ vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi]
+ vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi]
+ vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi]
+ vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi]
+ vmovdqu YMMWORD[(0+256)+rdi],ymm3
+ vmovdqu YMMWORD[(32+256)+rdi],ymm1
+ vmovdqu YMMWORD[(64+256)+rdi],ymm5
+ vmovdqu YMMWORD[(96+256)+rdi],ymm9
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ vperm2i128 ymm3,ymm4,ymm0,0x02
+ vperm2i128 ymm4,ymm4,ymm0,0x13
+ vperm2i128 ymm0,ymm12,ymm8,0x02
+ vperm2i128 ymm8,ymm12,ymm8,0x13
+ vpxor ymm3,ymm3,YMMWORD[((0+384))+rsi]
+ vpxor ymm0,ymm0,YMMWORD[((32+384))+rsi]
+ vpxor ymm4,ymm4,YMMWORD[((64+384))+rsi]
+ vpxor ymm8,ymm8,YMMWORD[((96+384))+rsi]
+ vmovdqu YMMWORD[(0+384)+rdi],ymm3
+ vmovdqu YMMWORD[(32+384)+rdi],ymm0
+ vmovdqu YMMWORD[(64+384)+rdi],ymm4
+ vmovdqu YMMWORD[(96+384)+rdi],ymm8
+
+ lea rsi,[512+rsi]
+ lea rdi,[512+rdi]
+ sub rbx,16*32
+ jmp NEAR $L$open_avx2_main_loop
+$L$open_avx2_main_loop_done:
+ test rbx,rbx
+ vzeroupper
+ je NEAR $L$open_sse_finalize
+
+ cmp rbx,12*32
+ ja NEAR $L$open_avx2_tail_512
+ cmp rbx,8*32
+ ja NEAR $L$open_avx2_tail_384
+ cmp rbx,4*32
+ ja NEAR $L$open_avx2_tail_256
+ vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
+ vmovdqa ymm4,YMMWORD[((160+64))+rbp]
+ vmovdqa ymm8,YMMWORD[((160+96))+rbp]
+ vmovdqa ymm12,YMMWORD[$L$avx2_inc]
+ vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
+ vmovdqa YMMWORD[(160+160)+rbp],ymm12
+
+ xor r8,r8
+ mov rcx,rbx
+ and rcx,-16
+ test rcx,rcx
+ je NEAR $L$open_avx2_tail_128_rounds
+$L$open_avx2_tail_128_rounds_and_x1hash:
+ add r10,QWORD[((0+0))+r8*1+rsi]
+ adc r11,QWORD[((8+0))+r8*1+rsi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+$L$open_avx2_tail_128_rounds:
+ add r8,16
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,12
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,4
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,4
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,12
+
+ cmp r8,rcx
+ jb NEAR $L$open_avx2_tail_128_rounds_and_x1hash
+ cmp r8,160
+ jne NEAR $L$open_avx2_tail_128_rounds
+ vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
+ vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
+ vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
+ vperm2i128 ymm3,ymm4,ymm0,0x13
+ vperm2i128 ymm0,ymm4,ymm0,0x02
+ vperm2i128 ymm4,ymm12,ymm8,0x02
+ vperm2i128 ymm12,ymm12,ymm8,0x13
+ vmovdqa ymm8,ymm3
+
+ jmp NEAR $L$open_avx2_tail_128_xor
+
+$L$open_avx2_tail_256:
+ vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
+ vmovdqa ymm4,YMMWORD[((160+64))+rbp]
+ vmovdqa ymm8,YMMWORD[((160+96))+rbp]
+ vmovdqa ymm1,ymm0
+ vmovdqa ymm5,ymm4
+ vmovdqa ymm9,ymm8
+ vmovdqa ymm12,YMMWORD[$L$avx2_inc]
+ vpaddd ymm13,ymm12,YMMWORD[((160+160))+rbp]
+ vpaddd ymm12,ymm12,ymm13
+ vmovdqa YMMWORD[(160+160)+rbp],ymm12
+ vmovdqa YMMWORD[(160+192)+rbp],ymm13
+
+ mov QWORD[((160+128))+rbp],rbx
+ mov rcx,rbx
+ sub rcx,4*32
+ shr rcx,4
+ mov r8,10
+ cmp rcx,10
+ cmovg rcx,r8
+ mov rbx,rsi
+ xor r8,r8
+$L$open_avx2_tail_256_rounds_and_x1hash:
+ add r10,QWORD[((0+0))+rbx]
+ adc r11,QWORD[((8+0))+rbx]
+ adc r12,1
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ add r15,rax
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rbx,[16+rbx]
+$L$open_avx2_tail_256_rounds:
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,12
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,4
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpsrld ymm3,ymm5,20
+ vpslld ymm5,ymm5,12
+ vpxor ymm5,ymm5,ymm3
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpslld ymm3,ymm5,7
+ vpsrld ymm5,ymm5,25
+ vpxor ymm5,ymm5,ymm3
+ vpalignr ymm13,ymm13,ymm13,12
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm5,ymm5,ymm5,4
+
+ inc r8
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,4
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,12
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpsrld ymm3,ymm5,20
+ vpslld ymm5,ymm5,12
+ vpxor ymm5,ymm5,ymm3
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpslld ymm3,ymm5,7
+ vpsrld ymm5,ymm5,25
+ vpxor ymm5,ymm5,ymm3
+ vpalignr ymm13,ymm13,ymm13,4
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm5,ymm5,ymm5,12
+ vpaddd ymm2,ymm2,ymm6
+ vpxor ymm14,ymm14,ymm2
+ vpshufb ymm14,ymm14,YMMWORD[$L$rol16]
+ vpaddd ymm10,ymm10,ymm14
+ vpxor ymm6,ymm6,ymm10
+ vpsrld ymm3,ymm6,20
+ vpslld ymm6,ymm6,12
+ vpxor ymm6,ymm6,ymm3
+ vpaddd ymm2,ymm2,ymm6
+ vpxor ymm14,ymm14,ymm2
+ vpshufb ymm14,ymm14,YMMWORD[$L$rol8]
+ vpaddd ymm10,ymm10,ymm14
+ vpxor ymm6,ymm6,ymm10
+ vpslld ymm3,ymm6,7
+ vpsrld ymm6,ymm6,25
+ vpxor ymm6,ymm6,ymm3
+ vpalignr ymm14,ymm14,ymm14,4
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm6,ymm6,ymm6,12
+
+ cmp r8,rcx
+ jb NEAR $L$open_avx2_tail_256_rounds_and_x1hash
+ cmp r8,10
+ jne NEAR $L$open_avx2_tail_256_rounds
+ mov r8,rbx
+ sub rbx,rsi
+ mov rcx,rbx
+ mov rbx,QWORD[((160+128))+rbp]
+$L$open_avx2_tail_256_hash:
+ add rcx,16
+ cmp rcx,rbx
+ jg NEAR $L$open_avx2_tail_256_done
+ add r10,QWORD[((0+0))+r8]
+ adc r11,QWORD[((8+0))+r8]
+ adc r12,1
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ add r15,rax
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea r8,[16+r8]
+ jmp NEAR $L$open_avx2_tail_256_hash
+$L$open_avx2_tail_256_done:
+ vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp]
+ vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp]
+ vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
+ vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
+ vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
+ vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
+ vperm2i128 ymm3,ymm5,ymm1,0x02
+ vperm2i128 ymm5,ymm5,ymm1,0x13
+ vperm2i128 ymm1,ymm13,ymm9,0x02
+ vperm2i128 ymm9,ymm13,ymm9,0x13
+ vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi]
+ vpxor ymm1,ymm1,YMMWORD[((32+0))+rsi]
+ vpxor ymm5,ymm5,YMMWORD[((64+0))+rsi]
+ vpxor ymm9,ymm9,YMMWORD[((96+0))+rsi]
+ vmovdqu YMMWORD[(0+0)+rdi],ymm3
+ vmovdqu YMMWORD[(32+0)+rdi],ymm1
+ vmovdqu YMMWORD[(64+0)+rdi],ymm5
+ vmovdqu YMMWORD[(96+0)+rdi],ymm9
+ vperm2i128 ymm3,ymm4,ymm0,0x13
+ vperm2i128 ymm0,ymm4,ymm0,0x02
+ vperm2i128 ymm4,ymm12,ymm8,0x02
+ vperm2i128 ymm12,ymm12,ymm8,0x13
+ vmovdqa ymm8,ymm3
+
+ lea rsi,[128+rsi]
+ lea rdi,[128+rdi]
+ sub rbx,4*32
+ jmp NEAR $L$open_avx2_tail_128_xor
+
+$L$open_avx2_tail_384:
+ vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
+ vmovdqa ymm4,YMMWORD[((160+64))+rbp]
+ vmovdqa ymm8,YMMWORD[((160+96))+rbp]
+ vmovdqa ymm1,ymm0
+ vmovdqa ymm5,ymm4
+ vmovdqa ymm9,ymm8
+ vmovdqa ymm2,ymm0
+ vmovdqa ymm6,ymm4
+ vmovdqa ymm10,ymm8
+ vmovdqa ymm12,YMMWORD[$L$avx2_inc]
+ vpaddd ymm14,ymm12,YMMWORD[((160+160))+rbp]
+ vpaddd ymm13,ymm12,ymm14
+ vpaddd ymm12,ymm12,ymm13
+ vmovdqa YMMWORD[(160+160)+rbp],ymm12
+ vmovdqa YMMWORD[(160+192)+rbp],ymm13
+ vmovdqa YMMWORD[(160+224)+rbp],ymm14
+
+ mov QWORD[((160+128))+rbp],rbx
+ mov rcx,rbx
+ sub rcx,8*32
+ shr rcx,4
+ add rcx,6
+ mov r8,10
+ cmp rcx,10
+ cmovg rcx,r8
+ mov rbx,rsi
+ xor r8,r8
+$L$open_avx2_tail_384_rounds_and_x2hash:
+ add r10,QWORD[((0+0))+rbx]
+ adc r11,QWORD[((8+0))+rbx]
+ adc r12,1
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ add r15,rax
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rbx,[16+rbx]
+$L$open_avx2_tail_384_rounds_and_x1hash:
+ vpaddd ymm2,ymm2,ymm6
+ vpxor ymm14,ymm14,ymm2
+ vpshufb ymm14,ymm14,YMMWORD[$L$rol16]
+ vpaddd ymm10,ymm10,ymm14
+ vpxor ymm6,ymm6,ymm10
+ vpsrld ymm3,ymm6,20
+ vpslld ymm6,ymm6,12
+ vpxor ymm6,ymm6,ymm3
+ vpaddd ymm2,ymm2,ymm6
+ vpxor ymm14,ymm14,ymm2
+ vpshufb ymm14,ymm14,YMMWORD[$L$rol8]
+ vpaddd ymm10,ymm10,ymm14
+ vpxor ymm6,ymm6,ymm10
+ vpslld ymm3,ymm6,7
+ vpsrld ymm6,ymm6,25
+ vpxor ymm6,ymm6,ymm3
+ vpalignr ymm14,ymm14,ymm14,12
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm6,ymm6,ymm6,4
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpsrld ymm3,ymm5,20
+ vpslld ymm5,ymm5,12
+ vpxor ymm5,ymm5,ymm3
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpslld ymm3,ymm5,7
+ vpsrld ymm5,ymm5,25
+ vpxor ymm5,ymm5,ymm3
+ vpalignr ymm13,ymm13,ymm13,12
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm5,ymm5,ymm5,4
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,12
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,4
+ add r10,QWORD[((0+0))+rbx]
+ adc r11,QWORD[((8+0))+rbx]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rbx,[16+rbx]
+ inc r8
+ vpaddd ymm2,ymm2,ymm6
+ vpxor ymm14,ymm14,ymm2
+ vpshufb ymm14,ymm14,YMMWORD[$L$rol16]
+ vpaddd ymm10,ymm10,ymm14
+ vpxor ymm6,ymm6,ymm10
+ vpsrld ymm3,ymm6,20
+ vpslld ymm6,ymm6,12
+ vpxor ymm6,ymm6,ymm3
+ vpaddd ymm2,ymm2,ymm6
+ vpxor ymm14,ymm14,ymm2
+ vpshufb ymm14,ymm14,YMMWORD[$L$rol8]
+ vpaddd ymm10,ymm10,ymm14
+ vpxor ymm6,ymm6,ymm10
+ vpslld ymm3,ymm6,7
+ vpsrld ymm6,ymm6,25
+ vpxor ymm6,ymm6,ymm3
+ vpalignr ymm14,ymm14,ymm14,4
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm6,ymm6,ymm6,12
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpsrld ymm3,ymm5,20
+ vpslld ymm5,ymm5,12
+ vpxor ymm5,ymm5,ymm3
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpslld ymm3,ymm5,7
+ vpsrld ymm5,ymm5,25
+ vpxor ymm5,ymm5,ymm3
+ vpalignr ymm13,ymm13,ymm13,4
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm5,ymm5,ymm5,12
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,4
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,12
+
+ cmp r8,rcx
+ jb NEAR $L$open_avx2_tail_384_rounds_and_x2hash
+ cmp r8,10
+ jne NEAR $L$open_avx2_tail_384_rounds_and_x1hash
+ mov r8,rbx
+ sub rbx,rsi
+ mov rcx,rbx
+ mov rbx,QWORD[((160+128))+rbp]
+$L$open_avx2_384_tail_hash:
+ add rcx,16
+ cmp rcx,rbx
+ jg NEAR $L$open_avx2_384_tail_done
+ add r10,QWORD[((0+0))+r8]
+ adc r11,QWORD[((8+0))+r8]
+ adc r12,1
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ add r15,rax
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea r8,[16+r8]
+ jmp NEAR $L$open_avx2_384_tail_hash
+$L$open_avx2_384_tail_done:
+ vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp]
+ vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp]
+ vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp]
+ vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp]
+ vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp]
+ vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
+ vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
+ vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
+ vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
+ vperm2i128 ymm3,ymm6,ymm2,0x02
+ vperm2i128 ymm6,ymm6,ymm2,0x13
+ vperm2i128 ymm2,ymm14,ymm10,0x02
+ vperm2i128 ymm10,ymm14,ymm10,0x13
+ vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi]
+ vpxor ymm2,ymm2,YMMWORD[((32+0))+rsi]
+ vpxor ymm6,ymm6,YMMWORD[((64+0))+rsi]
+ vpxor ymm10,ymm10,YMMWORD[((96+0))+rsi]
+ vmovdqu YMMWORD[(0+0)+rdi],ymm3
+ vmovdqu YMMWORD[(32+0)+rdi],ymm2
+ vmovdqu YMMWORD[(64+0)+rdi],ymm6
+ vmovdqu YMMWORD[(96+0)+rdi],ymm10
+ vperm2i128 ymm3,ymm5,ymm1,0x02
+ vperm2i128 ymm5,ymm5,ymm1,0x13
+ vperm2i128 ymm1,ymm13,ymm9,0x02
+ vperm2i128 ymm9,ymm13,ymm9,0x13
+ vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi]
+ vpxor ymm1,ymm1,YMMWORD[((32+128))+rsi]
+ vpxor ymm5,ymm5,YMMWORD[((64+128))+rsi]
+ vpxor ymm9,ymm9,YMMWORD[((96+128))+rsi]
+ vmovdqu YMMWORD[(0+128)+rdi],ymm3
+ vmovdqu YMMWORD[(32+128)+rdi],ymm1
+ vmovdqu YMMWORD[(64+128)+rdi],ymm5
+ vmovdqu YMMWORD[(96+128)+rdi],ymm9
+ vperm2i128 ymm3,ymm4,ymm0,0x13
+ vperm2i128 ymm0,ymm4,ymm0,0x02
+ vperm2i128 ymm4,ymm12,ymm8,0x02
+ vperm2i128 ymm12,ymm12,ymm8,0x13
+ vmovdqa ymm8,ymm3
+
+ lea rsi,[256+rsi]
+ lea rdi,[256+rdi]
+ sub rbx,8*32
+ jmp NEAR $L$open_avx2_tail_128_xor
+
+$L$open_avx2_tail_512:
+ vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
+ vmovdqa ymm4,YMMWORD[((160+64))+rbp]
+ vmovdqa ymm8,YMMWORD[((160+96))+rbp]
+ vmovdqa ymm1,ymm0
+ vmovdqa ymm5,ymm4
+ vmovdqa ymm9,ymm8
+ vmovdqa ymm2,ymm0
+ vmovdqa ymm6,ymm4
+ vmovdqa ymm10,ymm8
+ vmovdqa ymm3,ymm0
+ vmovdqa ymm7,ymm4
+ vmovdqa ymm11,ymm8
+ vmovdqa ymm12,YMMWORD[$L$avx2_inc]
+ vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp]
+ vpaddd ymm14,ymm12,ymm15
+ vpaddd ymm13,ymm12,ymm14
+ vpaddd ymm12,ymm12,ymm13
+ vmovdqa YMMWORD[(160+256)+rbp],ymm15
+ vmovdqa YMMWORD[(160+224)+rbp],ymm14
+ vmovdqa YMMWORD[(160+192)+rbp],ymm13
+ vmovdqa YMMWORD[(160+160)+rbp],ymm12
+
+ xor rcx,rcx
+ mov r8,rsi
+$L$open_avx2_tail_512_rounds_and_x2hash:
+ add r10,QWORD[((0+0))+r8]
+ adc r11,QWORD[((8+0))+r8]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea r8,[16+r8]
+$L$open_avx2_tail_512_rounds_and_x1hash:
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol16]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,20
+ vpslld ymm7,ymm7,32-20
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,20
+ vpslld ymm6,ymm6,32-20
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,20
+ vpslld ymm5,ymm5,32-20
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,20
+ vpslld ymm4,ymm4,32-20
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol8]
+ vpaddd ymm3,ymm3,ymm7
+ add r10,QWORD[((0+0))+r8]
+ adc r11,QWORD[((8+0))+r8]
+ adc r12,1
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ add r15,rax
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,25
+ vpslld ymm7,ymm7,32-25
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,25
+ vpslld ymm6,ymm6,32-25
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,25
+ vpslld ymm5,ymm5,32-25
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,25
+ vpslld ymm4,ymm4,32-25
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[((160+128))+rbp]
+ vpalignr ymm7,ymm7,ymm7,4
+ vpalignr ymm11,ymm11,ymm11,8
+ vpalignr ymm15,ymm15,ymm15,12
+ vpalignr ymm6,ymm6,ymm6,4
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm14,ymm14,ymm14,12
+ vpalignr ymm5,ymm5,ymm5,4
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm13,ymm13,ymm13,12
+ vpalignr ymm4,ymm4,ymm4,4
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm12,ymm12,ymm12,12
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol16]
+ vpaddd ymm3,ymm3,ymm7
+ add r10,QWORD[((0+16))+r8]
+ adc r11,QWORD[((8+16))+r8]
+ adc r12,1
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ add r15,rax
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea r8,[32+r8]
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,20
+ vpslld ymm7,ymm7,32-20
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,20
+ vpslld ymm6,ymm6,32-20
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,20
+ vpslld ymm5,ymm5,32-20
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,20
+ vpslld ymm4,ymm4,32-20
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol8]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,25
+ vpslld ymm7,ymm7,32-25
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,25
+ vpslld ymm6,ymm6,32-25
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,25
+ vpslld ymm5,ymm5,32-25
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,25
+ vpslld ymm4,ymm4,32-25
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[((160+128))+rbp]
+ vpalignr ymm7,ymm7,ymm7,12
+ vpalignr ymm11,ymm11,ymm11,8
+ vpalignr ymm15,ymm15,ymm15,4
+ vpalignr ymm6,ymm6,ymm6,12
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm14,ymm14,ymm14,4
+ vpalignr ymm5,ymm5,ymm5,12
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm13,ymm13,ymm13,4
+ vpalignr ymm4,ymm4,ymm4,12
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm12,ymm12,ymm12,4
+
+ inc rcx
+ cmp rcx,4
+ jl NEAR $L$open_avx2_tail_512_rounds_and_x2hash
+ cmp rcx,10
+ jne NEAR $L$open_avx2_tail_512_rounds_and_x1hash
+ mov rcx,rbx
+ sub rcx,12*32
+ and rcx,-16
+$L$open_avx2_tail_512_hash:
+ test rcx,rcx
+ je NEAR $L$open_avx2_tail_512_done
+ add r10,QWORD[((0+0))+r8]
+ adc r11,QWORD[((8+0))+r8]
+ adc r12,1
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ add r15,rax
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea r8,[16+r8]
+ sub rcx,2*8
+ jmp NEAR $L$open_avx2_tail_512_hash
+$L$open_avx2_tail_512_done:
+ vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp]
+ vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp]
+ vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp]
+ vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp]
+ vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp]
+ vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp]
+ vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp]
+ vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp]
+ vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
+ vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
+ vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
+ vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
+
+ vmovdqa YMMWORD[(160+128)+rbp],ymm0
+ vperm2i128 ymm0,ymm7,ymm3,0x02
+ vperm2i128 ymm7,ymm7,ymm3,0x13
+ vperm2i128 ymm3,ymm15,ymm11,0x02
+ vperm2i128 ymm11,ymm15,ymm11,0x13
+ vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi]
+ vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi]
+ vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi]
+ vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi]
+ vmovdqu YMMWORD[(0+0)+rdi],ymm0
+ vmovdqu YMMWORD[(32+0)+rdi],ymm3
+ vmovdqu YMMWORD[(64+0)+rdi],ymm7
+ vmovdqu YMMWORD[(96+0)+rdi],ymm11
+
+ vmovdqa ymm0,YMMWORD[((160+128))+rbp]
+ vperm2i128 ymm3,ymm6,ymm2,0x02
+ vperm2i128 ymm6,ymm6,ymm2,0x13
+ vperm2i128 ymm2,ymm14,ymm10,0x02
+ vperm2i128 ymm10,ymm14,ymm10,0x13
+ vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi]
+ vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi]
+ vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi]
+ vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi]
+ vmovdqu YMMWORD[(0+128)+rdi],ymm3
+ vmovdqu YMMWORD[(32+128)+rdi],ymm2
+ vmovdqu YMMWORD[(64+128)+rdi],ymm6
+ vmovdqu YMMWORD[(96+128)+rdi],ymm10
+ vperm2i128 ymm3,ymm5,ymm1,0x02
+ vperm2i128 ymm5,ymm5,ymm1,0x13
+ vperm2i128 ymm1,ymm13,ymm9,0x02
+ vperm2i128 ymm9,ymm13,ymm9,0x13
+ vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi]
+ vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi]
+ vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi]
+ vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi]
+ vmovdqu YMMWORD[(0+256)+rdi],ymm3
+ vmovdqu YMMWORD[(32+256)+rdi],ymm1
+ vmovdqu YMMWORD[(64+256)+rdi],ymm5
+ vmovdqu YMMWORD[(96+256)+rdi],ymm9
+ vperm2i128 ymm3,ymm4,ymm0,0x13
+ vperm2i128 ymm0,ymm4,ymm0,0x02
+ vperm2i128 ymm4,ymm12,ymm8,0x02
+ vperm2i128 ymm12,ymm12,ymm8,0x13
+ vmovdqa ymm8,ymm3
+
+ lea rsi,[384+rsi]
+ lea rdi,[384+rdi]
+ sub rbx,12*32
+$L$open_avx2_tail_128_xor:
+ cmp rbx,32
+ jb NEAR $L$open_avx2_tail_32_xor
+ sub rbx,32
+ vpxor ymm0,ymm0,YMMWORD[rsi]
+ vmovdqu YMMWORD[rdi],ymm0
+ lea rsi,[32+rsi]
+ lea rdi,[32+rdi]
+ vmovdqa ymm0,ymm4
+ vmovdqa ymm4,ymm8
+ vmovdqa ymm8,ymm12
+ jmp NEAR $L$open_avx2_tail_128_xor
+$L$open_avx2_tail_32_xor:
+ cmp rbx,16
+ vmovdqa xmm1,xmm0
+ jb NEAR $L$open_avx2_exit
+ sub rbx,16
+
+ vpxor xmm1,xmm0,XMMWORD[rsi]
+ vmovdqu XMMWORD[rdi],xmm1
+ lea rsi,[16+rsi]
+ lea rdi,[16+rdi]
+ vperm2i128 ymm0,ymm0,ymm0,0x11
+ vmovdqa xmm1,xmm0
+$L$open_avx2_exit:
+ vzeroupper
+ jmp NEAR $L$open_sse_tail_16
+
+$L$open_avx2_192:
+ vmovdqa ymm1,ymm0
+ vmovdqa ymm2,ymm0
+ vmovdqa ymm5,ymm4
+ vmovdqa ymm6,ymm4
+ vmovdqa ymm9,ymm8
+ vmovdqa ymm10,ymm8
+ vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc]
+ vmovdqa ymm11,ymm12
+ vmovdqa ymm15,ymm13
+ mov r10,10
+$L$open_avx2_192_rounds:
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,12
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,4
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpsrld ymm3,ymm5,20
+ vpslld ymm5,ymm5,12
+ vpxor ymm5,ymm5,ymm3
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpslld ymm3,ymm5,7
+ vpsrld ymm5,ymm5,25
+ vpxor ymm5,ymm5,ymm3
+ vpalignr ymm13,ymm13,ymm13,12
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm5,ymm5,ymm5,4
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,4
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,12
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpsrld ymm3,ymm5,20
+ vpslld ymm5,ymm5,12
+ vpxor ymm5,ymm5,ymm3
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpslld ymm3,ymm5,7
+ vpsrld ymm5,ymm5,25
+ vpxor ymm5,ymm5,ymm3
+ vpalignr ymm13,ymm13,ymm13,4
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm5,ymm5,ymm5,12
+
+ dec r10
+ jne NEAR $L$open_avx2_192_rounds
+ vpaddd ymm0,ymm0,ymm2
+ vpaddd ymm1,ymm1,ymm2
+ vpaddd ymm4,ymm4,ymm6
+ vpaddd ymm5,ymm5,ymm6
+ vpaddd ymm8,ymm8,ymm10
+ vpaddd ymm9,ymm9,ymm10
+ vpaddd ymm12,ymm12,ymm11
+ vpaddd ymm13,ymm13,ymm15
+ vperm2i128 ymm3,ymm4,ymm0,0x02
+
+ vpand ymm3,ymm3,YMMWORD[$L$clamp]
+ vmovdqa YMMWORD[(160+0)+rbp],ymm3
+
+ vperm2i128 ymm0,ymm4,ymm0,0x13
+ vperm2i128 ymm4,ymm12,ymm8,0x13
+ vperm2i128 ymm8,ymm5,ymm1,0x02
+ vperm2i128 ymm12,ymm13,ymm9,0x02
+ vperm2i128 ymm1,ymm5,ymm1,0x13
+ vperm2i128 ymm5,ymm13,ymm9,0x13
+$L$open_avx2_short:
+ mov r8,r8
+ call poly_hash_ad_internal
+$L$open_avx2_short_hash_and_xor_loop:
+ cmp rbx,32
+ jb NEAR $L$open_avx2_short_tail_32
+ sub rbx,32
+ add r10,QWORD[((0+0))+rsi]
+ adc r11,QWORD[((8+0))+rsi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ add r10,QWORD[((0+16))+rsi]
+ adc r11,QWORD[((8+16))+rsi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+
+ vpxor ymm0,ymm0,YMMWORD[rsi]
+ vmovdqu YMMWORD[rdi],ymm0
+ lea rsi,[32+rsi]
+ lea rdi,[32+rdi]
+
+ vmovdqa ymm0,ymm4
+ vmovdqa ymm4,ymm8
+ vmovdqa ymm8,ymm12
+ vmovdqa ymm12,ymm1
+ vmovdqa ymm1,ymm5
+ vmovdqa ymm5,ymm9
+ vmovdqa ymm9,ymm13
+ vmovdqa ymm13,ymm2
+ vmovdqa ymm2,ymm6
+ jmp NEAR $L$open_avx2_short_hash_and_xor_loop
+$L$open_avx2_short_tail_32:
+ cmp rbx,16
+ vmovdqa xmm1,xmm0
+ jb NEAR $L$open_avx2_short_tail_32_exit
+ sub rbx,16
+ add r10,QWORD[((0+0))+rsi]
+ adc r11,QWORD[((8+0))+rsi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ vpxor xmm3,xmm0,XMMWORD[rsi]
+ vmovdqu XMMWORD[rdi],xmm3
+ lea rsi,[16+rsi]
+ lea rdi,[16+rdi]
+ vextracti128 xmm1,ymm0,1
+$L$open_avx2_short_tail_32_exit:
+ vzeroupper
+ jmp NEAR $L$open_sse_tail_16
+
+$L$open_avx2_320:
+ vmovdqa ymm1,ymm0
+ vmovdqa ymm2,ymm0
+ vmovdqa ymm5,ymm4
+ vmovdqa ymm6,ymm4
+ vmovdqa ymm9,ymm8
+ vmovdqa ymm10,ymm8
+ vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc]
+ vpaddd ymm14,ymm13,YMMWORD[$L$avx2_inc]
+ vmovdqa ymm7,ymm4
+ vmovdqa ymm11,ymm8
+ vmovdqa YMMWORD[(160+160)+rbp],ymm12
+ vmovdqa YMMWORD[(160+192)+rbp],ymm13
+ vmovdqa YMMWORD[(160+224)+rbp],ymm14
+ mov r10,10
+$L$open_avx2_320_rounds:
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,12
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,4
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpsrld ymm3,ymm5,20
+ vpslld ymm5,ymm5,12
+ vpxor ymm5,ymm5,ymm3
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpslld ymm3,ymm5,7
+ vpsrld ymm5,ymm5,25
+ vpxor ymm5,ymm5,ymm3
+ vpalignr ymm13,ymm13,ymm13,12
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm5,ymm5,ymm5,4
+ vpaddd ymm2,ymm2,ymm6
+ vpxor ymm14,ymm14,ymm2
+ vpshufb ymm14,ymm14,YMMWORD[$L$rol16]
+ vpaddd ymm10,ymm10,ymm14
+ vpxor ymm6,ymm6,ymm10
+ vpsrld ymm3,ymm6,20
+ vpslld ymm6,ymm6,12
+ vpxor ymm6,ymm6,ymm3
+ vpaddd ymm2,ymm2,ymm6
+ vpxor ymm14,ymm14,ymm2
+ vpshufb ymm14,ymm14,YMMWORD[$L$rol8]
+ vpaddd ymm10,ymm10,ymm14
+ vpxor ymm6,ymm6,ymm10
+ vpslld ymm3,ymm6,7
+ vpsrld ymm6,ymm6,25
+ vpxor ymm6,ymm6,ymm3
+ vpalignr ymm14,ymm14,ymm14,12
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm6,ymm6,ymm6,4
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,4
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,12
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpsrld ymm3,ymm5,20
+ vpslld ymm5,ymm5,12
+ vpxor ymm5,ymm5,ymm3
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpslld ymm3,ymm5,7
+ vpsrld ymm5,ymm5,25
+ vpxor ymm5,ymm5,ymm3
+ vpalignr ymm13,ymm13,ymm13,4
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm5,ymm5,ymm5,12
+ vpaddd ymm2,ymm2,ymm6
+ vpxor ymm14,ymm14,ymm2
+ vpshufb ymm14,ymm14,YMMWORD[$L$rol16]
+ vpaddd ymm10,ymm10,ymm14
+ vpxor ymm6,ymm6,ymm10
+ vpsrld ymm3,ymm6,20
+ vpslld ymm6,ymm6,12
+ vpxor ymm6,ymm6,ymm3
+ vpaddd ymm2,ymm2,ymm6
+ vpxor ymm14,ymm14,ymm2
+ vpshufb ymm14,ymm14,YMMWORD[$L$rol8]
+ vpaddd ymm10,ymm10,ymm14
+ vpxor ymm6,ymm6,ymm10
+ vpslld ymm3,ymm6,7
+ vpsrld ymm6,ymm6,25
+ vpxor ymm6,ymm6,ymm3
+ vpalignr ymm14,ymm14,ymm14,4
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm6,ymm6,ymm6,12
+
+ dec r10
+ jne NEAR $L$open_avx2_320_rounds
+ vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm4,ymm4,ymm7
+ vpaddd ymm5,ymm5,ymm7
+ vpaddd ymm6,ymm6,ymm7
+ vpaddd ymm8,ymm8,ymm11
+ vpaddd ymm9,ymm9,ymm11
+ vpaddd ymm10,ymm10,ymm11
+ vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
+ vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
+ vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp]
+ vperm2i128 ymm3,ymm4,ymm0,0x02
+
+ vpand ymm3,ymm3,YMMWORD[$L$clamp]
+ vmovdqa YMMWORD[(160+0)+rbp],ymm3
+
+ vperm2i128 ymm0,ymm4,ymm0,0x13
+ vperm2i128 ymm4,ymm12,ymm8,0x13
+ vperm2i128 ymm8,ymm5,ymm1,0x02
+ vperm2i128 ymm12,ymm13,ymm9,0x02
+ vperm2i128 ymm1,ymm5,ymm1,0x13
+ vperm2i128 ymm5,ymm13,ymm9,0x13
+ vperm2i128 ymm9,ymm6,ymm2,0x02
+ vperm2i128 ymm13,ymm14,ymm10,0x02
+ vperm2i128 ymm2,ymm6,ymm2,0x13
+ vperm2i128 ymm6,ymm14,ymm10,0x13
+ jmp NEAR $L$open_avx2_short
+
+
+
+
+
+ALIGN 64
+chacha20_poly1305_seal_avx2:
+
+
+
+
+
+
+
+
+
+
+
+
+ vzeroupper
+ vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
+ vbroadcasti128 ymm4,XMMWORD[r9]
+ vbroadcasti128 ymm8,XMMWORD[16+r9]
+ vbroadcasti128 ymm12,XMMWORD[32+r9]
+ vpaddd ymm12,ymm12,YMMWORD[$L$avx2_init]
+ cmp rbx,6*32
+ jbe NEAR $L$seal_avx2_192
+ cmp rbx,10*32
+ jbe NEAR $L$seal_avx2_320
+ vmovdqa ymm1,ymm0
+ vmovdqa ymm2,ymm0
+ vmovdqa ymm3,ymm0
+ vmovdqa ymm5,ymm4
+ vmovdqa ymm6,ymm4
+ vmovdqa ymm7,ymm4
+ vmovdqa YMMWORD[(160+64)+rbp],ymm4
+ vmovdqa ymm9,ymm8
+ vmovdqa ymm10,ymm8
+ vmovdqa ymm11,ymm8
+ vmovdqa YMMWORD[(160+96)+rbp],ymm8
+ vmovdqa ymm15,ymm12
+ vpaddd ymm14,ymm15,YMMWORD[$L$avx2_inc]
+ vpaddd ymm13,ymm14,YMMWORD[$L$avx2_inc]
+ vpaddd ymm12,ymm13,YMMWORD[$L$avx2_inc]
+ vmovdqa YMMWORD[(160+160)+rbp],ymm12
+ vmovdqa YMMWORD[(160+192)+rbp],ymm13
+ vmovdqa YMMWORD[(160+224)+rbp],ymm14
+ vmovdqa YMMWORD[(160+256)+rbp],ymm15
+ mov r10,10
+$L$seal_avx2_init_rounds:
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol16]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,20
+ vpslld ymm7,ymm7,32-20
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,20
+ vpslld ymm6,ymm6,32-20
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,20
+ vpslld ymm5,ymm5,32-20
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,20
+ vpslld ymm4,ymm4,32-20
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol8]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,25
+ vpslld ymm7,ymm7,32-25
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,25
+ vpslld ymm6,ymm6,32-25
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,25
+ vpslld ymm5,ymm5,32-25
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,25
+ vpslld ymm4,ymm4,32-25
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[((160+128))+rbp]
+ vpalignr ymm7,ymm7,ymm7,4
+ vpalignr ymm11,ymm11,ymm11,8
+ vpalignr ymm15,ymm15,ymm15,12
+ vpalignr ymm6,ymm6,ymm6,4
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm14,ymm14,ymm14,12
+ vpalignr ymm5,ymm5,ymm5,4
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm13,ymm13,ymm13,12
+ vpalignr ymm4,ymm4,ymm4,4
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm12,ymm12,ymm12,12
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol16]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,20
+ vpslld ymm7,ymm7,32-20
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,20
+ vpslld ymm6,ymm6,32-20
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,20
+ vpslld ymm5,ymm5,32-20
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,20
+ vpslld ymm4,ymm4,32-20
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol8]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,25
+ vpslld ymm7,ymm7,32-25
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,25
+ vpslld ymm6,ymm6,32-25
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,25
+ vpslld ymm5,ymm5,32-25
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,25
+ vpslld ymm4,ymm4,32-25
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[((160+128))+rbp]
+ vpalignr ymm7,ymm7,ymm7,12
+ vpalignr ymm11,ymm11,ymm11,8
+ vpalignr ymm15,ymm15,ymm15,4
+ vpalignr ymm6,ymm6,ymm6,12
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm14,ymm14,ymm14,4
+ vpalignr ymm5,ymm5,ymm5,12
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm13,ymm13,ymm13,4
+ vpalignr ymm4,ymm4,ymm4,12
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm12,ymm12,ymm12,4
+
+ dec r10
+ jnz NEAR $L$seal_avx2_init_rounds
+ vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp]
+ vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp]
+ vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp]
+ vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp]
+ vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp]
+ vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp]
+ vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp]
+ vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp]
+ vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
+ vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
+ vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
+ vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
+
+ vperm2i128 ymm11,ymm15,ymm11,0x13
+ vperm2i128 ymm15,ymm7,ymm3,0x02
+ vperm2i128 ymm3,ymm7,ymm3,0x13
+ vpand ymm15,ymm15,YMMWORD[$L$clamp]
+ vmovdqa YMMWORD[(160+0)+rbp],ymm15
+ mov r8,r8
+ call poly_hash_ad_internal
+
+ vpxor ymm3,ymm3,YMMWORD[rsi]
+ vpxor ymm11,ymm11,YMMWORD[32+rsi]
+ vmovdqu YMMWORD[rdi],ymm3
+ vmovdqu YMMWORD[32+rdi],ymm11
+ vperm2i128 ymm15,ymm6,ymm2,0x02
+ vperm2i128 ymm6,ymm6,ymm2,0x13
+ vperm2i128 ymm2,ymm14,ymm10,0x02
+ vperm2i128 ymm10,ymm14,ymm10,0x13
+ vpxor ymm15,ymm15,YMMWORD[((0+64))+rsi]
+ vpxor ymm2,ymm2,YMMWORD[((32+64))+rsi]
+ vpxor ymm6,ymm6,YMMWORD[((64+64))+rsi]
+ vpxor ymm10,ymm10,YMMWORD[((96+64))+rsi]
+ vmovdqu YMMWORD[(0+64)+rdi],ymm15
+ vmovdqu YMMWORD[(32+64)+rdi],ymm2
+ vmovdqu YMMWORD[(64+64)+rdi],ymm6
+ vmovdqu YMMWORD[(96+64)+rdi],ymm10
+ vperm2i128 ymm15,ymm5,ymm1,0x02
+ vperm2i128 ymm5,ymm5,ymm1,0x13
+ vperm2i128 ymm1,ymm13,ymm9,0x02
+ vperm2i128 ymm9,ymm13,ymm9,0x13
+ vpxor ymm15,ymm15,YMMWORD[((0+192))+rsi]
+ vpxor ymm1,ymm1,YMMWORD[((32+192))+rsi]
+ vpxor ymm5,ymm5,YMMWORD[((64+192))+rsi]
+ vpxor ymm9,ymm9,YMMWORD[((96+192))+rsi]
+ vmovdqu YMMWORD[(0+192)+rdi],ymm15
+ vmovdqu YMMWORD[(32+192)+rdi],ymm1
+ vmovdqu YMMWORD[(64+192)+rdi],ymm5
+ vmovdqu YMMWORD[(96+192)+rdi],ymm9
+ vperm2i128 ymm15,ymm4,ymm0,0x13
+ vperm2i128 ymm0,ymm4,ymm0,0x02
+ vperm2i128 ymm4,ymm12,ymm8,0x02
+ vperm2i128 ymm12,ymm12,ymm8,0x13
+ vmovdqa ymm8,ymm15
+
+ lea rsi,[320+rsi]
+ sub rbx,10*32
+ mov rcx,10*32
+ cmp rbx,4*32
+ jbe NEAR $L$seal_avx2_short_hash_remainder
+ vpxor ymm0,ymm0,YMMWORD[rsi]
+ vpxor ymm4,ymm4,YMMWORD[32+rsi]
+ vpxor ymm8,ymm8,YMMWORD[64+rsi]
+ vpxor ymm12,ymm12,YMMWORD[96+rsi]
+ vmovdqu YMMWORD[320+rdi],ymm0
+ vmovdqu YMMWORD[352+rdi],ymm4
+ vmovdqu YMMWORD[384+rdi],ymm8
+ vmovdqu YMMWORD[416+rdi],ymm12
+ lea rsi,[128+rsi]
+ sub rbx,4*32
+ mov rcx,8
+ mov r8,2
+ cmp rbx,4*32
+ jbe NEAR $L$seal_avx2_tail_128
+ cmp rbx,8*32
+ jbe NEAR $L$seal_avx2_tail_256
+ cmp rbx,12*32
+ jbe NEAR $L$seal_avx2_tail_384
+ cmp rbx,16*32
+ jbe NEAR $L$seal_avx2_tail_512
+ vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
+ vmovdqa ymm4,YMMWORD[((160+64))+rbp]
+ vmovdqa ymm8,YMMWORD[((160+96))+rbp]
+ vmovdqa ymm1,ymm0
+ vmovdqa ymm5,ymm4
+ vmovdqa ymm9,ymm8
+ vmovdqa ymm2,ymm0
+ vmovdqa ymm6,ymm4
+ vmovdqa ymm10,ymm8
+ vmovdqa ymm3,ymm0
+ vmovdqa ymm7,ymm4
+ vmovdqa ymm11,ymm8
+ vmovdqa ymm12,YMMWORD[$L$avx2_inc]
+ vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp]
+ vpaddd ymm14,ymm12,ymm15
+ vpaddd ymm13,ymm12,ymm14
+ vpaddd ymm12,ymm12,ymm13
+ vmovdqa YMMWORD[(160+256)+rbp],ymm15
+ vmovdqa YMMWORD[(160+224)+rbp],ymm14
+ vmovdqa YMMWORD[(160+192)+rbp],ymm13
+ vmovdqa YMMWORD[(160+160)+rbp],ymm12
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol16]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,20
+ vpslld ymm7,ymm7,32-20
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,20
+ vpslld ymm6,ymm6,32-20
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,20
+ vpslld ymm5,ymm5,32-20
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,20
+ vpslld ymm4,ymm4,32-20
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol8]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,25
+ vpslld ymm7,ymm7,32-25
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,25
+ vpslld ymm6,ymm6,32-25
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,25
+ vpslld ymm5,ymm5,32-25
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,25
+ vpslld ymm4,ymm4,32-25
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[((160+128))+rbp]
+ vpalignr ymm7,ymm7,ymm7,4
+ vpalignr ymm11,ymm11,ymm11,8
+ vpalignr ymm15,ymm15,ymm15,12
+ vpalignr ymm6,ymm6,ymm6,4
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm14,ymm14,ymm14,12
+ vpalignr ymm5,ymm5,ymm5,4
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm13,ymm13,ymm13,12
+ vpalignr ymm4,ymm4,ymm4,4
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm12,ymm12,ymm12,12
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol16]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,20
+ vpslld ymm7,ymm7,32-20
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,20
+ vpslld ymm6,ymm6,32-20
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,20
+ vpslld ymm5,ymm5,32-20
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,20
+ vpslld ymm4,ymm4,32-20
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol8]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,25
+ vpslld ymm7,ymm7,32-25
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,25
+ vpslld ymm6,ymm6,32-25
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,25
+ vpslld ymm5,ymm5,32-25
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,25
+ vpslld ymm4,ymm4,32-25
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[((160+128))+rbp]
+ vpalignr ymm7,ymm7,ymm7,12
+ vpalignr ymm11,ymm11,ymm11,8
+ vpalignr ymm15,ymm15,ymm15,4
+ vpalignr ymm6,ymm6,ymm6,12
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm14,ymm14,ymm14,4
+ vpalignr ymm5,ymm5,ymm5,12
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm13,ymm13,ymm13,4
+ vpalignr ymm4,ymm4,ymm4,12
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm12,ymm12,ymm12,4
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol16]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,20
+ vpslld ymm7,ymm7,32-20
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,20
+ vpslld ymm6,ymm6,32-20
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,20
+ vpslld ymm5,ymm5,32-20
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,20
+ vpslld ymm4,ymm4,32-20
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol8]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+
+ sub rdi,16
+ mov rcx,9
+ jmp NEAR $L$seal_avx2_main_loop_rounds_entry
+ALIGN 32
+$L$seal_avx2_main_loop:
+ vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
+ vmovdqa ymm4,YMMWORD[((160+64))+rbp]
+ vmovdqa ymm8,YMMWORD[((160+96))+rbp]
+ vmovdqa ymm1,ymm0
+ vmovdqa ymm5,ymm4
+ vmovdqa ymm9,ymm8
+ vmovdqa ymm2,ymm0
+ vmovdqa ymm6,ymm4
+ vmovdqa ymm10,ymm8
+ vmovdqa ymm3,ymm0
+ vmovdqa ymm7,ymm4
+ vmovdqa ymm11,ymm8
+ vmovdqa ymm12,YMMWORD[$L$avx2_inc]
+ vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp]
+ vpaddd ymm14,ymm12,ymm15
+ vpaddd ymm13,ymm12,ymm14
+ vpaddd ymm12,ymm12,ymm13
+ vmovdqa YMMWORD[(160+256)+rbp],ymm15
+ vmovdqa YMMWORD[(160+224)+rbp],ymm14
+ vmovdqa YMMWORD[(160+192)+rbp],ymm13
+ vmovdqa YMMWORD[(160+160)+rbp],ymm12
+
+ mov rcx,10
+ALIGN 32
+$L$seal_avx2_main_loop_rounds:
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol16]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,20
+ vpslld ymm7,ymm7,32-20
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,20
+ vpslld ymm6,ymm6,32-20
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,20
+ vpslld ymm5,ymm5,32-20
+ add r15,rax
+ adc r9,rdx
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,20
+ vpslld ymm4,ymm4,32-20
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol8]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+$L$seal_avx2_main_loop_rounds_entry:
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ add r10,QWORD[((0+16))+rdi]
+ adc r11,QWORD[((8+16))+rdi]
+ adc r12,1
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,25
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ vpslld ymm7,ymm7,32-25
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,25
+ vpslld ymm6,ymm6,32-25
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,25
+ vpslld ymm5,ymm5,32-25
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,25
+ vpslld ymm4,ymm4,32-25
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[((160+128))+rbp]
+ vpalignr ymm7,ymm7,ymm7,4
+ vpalignr ymm11,ymm11,ymm11,8
+ vpalignr ymm15,ymm15,ymm15,12
+ vpalignr ymm6,ymm6,ymm6,4
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm14,ymm14,ymm14,12
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ vpalignr ymm5,ymm5,ymm5,4
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm13,ymm13,ymm13,12
+ vpalignr ymm4,ymm4,ymm4,4
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm12,ymm12,ymm12,12
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol16]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ add r15,rax
+ adc r9,rdx
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,20
+ vpslld ymm7,ymm7,32-20
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,20
+ vpslld ymm6,ymm6,32-20
+ vpxor ymm6,ymm6,ymm8
+ add r10,QWORD[((0+32))+rdi]
+ adc r11,QWORD[((8+32))+rdi]
+ adc r12,1
+
+ lea rdi,[48+rdi]
+ vpsrld ymm8,ymm5,20
+ vpslld ymm5,ymm5,32-20
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,20
+ vpslld ymm4,ymm4,32-20
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol8]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,25
+ vpslld ymm7,ymm7,32-25
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,25
+ vpslld ymm6,ymm6,32-25
+ vpxor ymm6,ymm6,ymm8
+ add r15,rax
+ adc r9,rdx
+ vpsrld ymm8,ymm5,25
+ vpslld ymm5,ymm5,32-25
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,25
+ vpslld ymm4,ymm4,32-25
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[((160+128))+rbp]
+ vpalignr ymm7,ymm7,ymm7,12
+ vpalignr ymm11,ymm11,ymm11,8
+ vpalignr ymm15,ymm15,ymm15,4
+ vpalignr ymm6,ymm6,ymm6,12
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm14,ymm14,ymm14,4
+ vpalignr ymm5,ymm5,ymm5,12
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm13,ymm13,ymm13,4
+ vpalignr ymm4,ymm4,ymm4,12
+ vpalignr ymm8,ymm8,ymm8,8
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ vpalignr ymm12,ymm12,ymm12,4
+
+ dec rcx
+ jne NEAR $L$seal_avx2_main_loop_rounds
+ vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp]
+ vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp]
+ vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp]
+ vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp]
+ vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp]
+ vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp]
+ vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp]
+ vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp]
+ vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
+ vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
+ vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
+ vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
+
+ vmovdqa YMMWORD[(160+128)+rbp],ymm0
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ add r15,rax
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ add r10,QWORD[((0+16))+rdi]
+ adc r11,QWORD[((8+16))+rdi]
+ adc r12,1
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ add r15,rax
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rdi,[32+rdi]
+ vperm2i128 ymm0,ymm7,ymm3,0x02
+ vperm2i128 ymm7,ymm7,ymm3,0x13
+ vperm2i128 ymm3,ymm15,ymm11,0x02
+ vperm2i128 ymm11,ymm15,ymm11,0x13
+ vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi]
+ vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi]
+ vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi]
+ vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi]
+ vmovdqu YMMWORD[(0+0)+rdi],ymm0
+ vmovdqu YMMWORD[(32+0)+rdi],ymm3
+ vmovdqu YMMWORD[(64+0)+rdi],ymm7
+ vmovdqu YMMWORD[(96+0)+rdi],ymm11
+
+ vmovdqa ymm0,YMMWORD[((160+128))+rbp]
+ vperm2i128 ymm3,ymm6,ymm2,0x02
+ vperm2i128 ymm6,ymm6,ymm2,0x13
+ vperm2i128 ymm2,ymm14,ymm10,0x02
+ vperm2i128 ymm10,ymm14,ymm10,0x13
+ vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi]
+ vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi]
+ vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi]
+ vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi]
+ vmovdqu YMMWORD[(0+128)+rdi],ymm3
+ vmovdqu YMMWORD[(32+128)+rdi],ymm2
+ vmovdqu YMMWORD[(64+128)+rdi],ymm6
+ vmovdqu YMMWORD[(96+128)+rdi],ymm10
+ vperm2i128 ymm3,ymm5,ymm1,0x02
+ vperm2i128 ymm5,ymm5,ymm1,0x13
+ vperm2i128 ymm1,ymm13,ymm9,0x02
+ vperm2i128 ymm9,ymm13,ymm9,0x13
+ vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi]
+ vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi]
+ vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi]
+ vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi]
+ vmovdqu YMMWORD[(0+256)+rdi],ymm3
+ vmovdqu YMMWORD[(32+256)+rdi],ymm1
+ vmovdqu YMMWORD[(64+256)+rdi],ymm5
+ vmovdqu YMMWORD[(96+256)+rdi],ymm9
+ vperm2i128 ymm3,ymm4,ymm0,0x02
+ vperm2i128 ymm4,ymm4,ymm0,0x13
+ vperm2i128 ymm0,ymm12,ymm8,0x02
+ vperm2i128 ymm8,ymm12,ymm8,0x13
+ vpxor ymm3,ymm3,YMMWORD[((0+384))+rsi]
+ vpxor ymm0,ymm0,YMMWORD[((32+384))+rsi]
+ vpxor ymm4,ymm4,YMMWORD[((64+384))+rsi]
+ vpxor ymm8,ymm8,YMMWORD[((96+384))+rsi]
+ vmovdqu YMMWORD[(0+384)+rdi],ymm3
+ vmovdqu YMMWORD[(32+384)+rdi],ymm0
+ vmovdqu YMMWORD[(64+384)+rdi],ymm4
+ vmovdqu YMMWORD[(96+384)+rdi],ymm8
+
+ lea rsi,[512+rsi]
+ sub rbx,16*32
+ cmp rbx,16*32
+ jg NEAR $L$seal_avx2_main_loop
+
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ add r15,rax
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ add r10,QWORD[((0+16))+rdi]
+ adc r11,QWORD[((8+16))+rdi]
+ adc r12,1
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ add r15,rax
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rdi,[32+rdi]
+ mov rcx,10
+ xor r8,r8
+
+ cmp rbx,12*32
+ ja NEAR $L$seal_avx2_tail_512
+ cmp rbx,8*32
+ ja NEAR $L$seal_avx2_tail_384
+ cmp rbx,4*32
+ ja NEAR $L$seal_avx2_tail_256
+
+$L$seal_avx2_tail_128:
+ vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
+ vmovdqa ymm4,YMMWORD[((160+64))+rbp]
+ vmovdqa ymm8,YMMWORD[((160+96))+rbp]
+ vmovdqa ymm12,YMMWORD[$L$avx2_inc]
+ vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
+ vmovdqa YMMWORD[(160+160)+rbp],ymm12
+
+$L$seal_avx2_tail_128_rounds_and_3xhash:
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ add r15,rax
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rdi,[16+rdi]
+$L$seal_avx2_tail_128_rounds_and_2xhash:
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,12
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,4
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ add r15,rax
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,4
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,12
+ add r10,QWORD[((0+16))+rdi]
+ adc r11,QWORD[((8+16))+rdi]
+ adc r12,1
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ add r15,rax
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rdi,[32+rdi]
+ dec rcx
+ jg NEAR $L$seal_avx2_tail_128_rounds_and_3xhash
+ dec r8
+ jge NEAR $L$seal_avx2_tail_128_rounds_and_2xhash
+ vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
+ vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
+ vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
+ vperm2i128 ymm3,ymm4,ymm0,0x13
+ vperm2i128 ymm0,ymm4,ymm0,0x02
+ vperm2i128 ymm4,ymm12,ymm8,0x02
+ vperm2i128 ymm12,ymm12,ymm8,0x13
+ vmovdqa ymm8,ymm3
+
+ jmp NEAR $L$seal_avx2_short_loop
+
+$L$seal_avx2_tail_256:
+ vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
+ vmovdqa ymm4,YMMWORD[((160+64))+rbp]
+ vmovdqa ymm8,YMMWORD[((160+96))+rbp]
+ vmovdqa ymm1,ymm0
+ vmovdqa ymm5,ymm4
+ vmovdqa ymm9,ymm8
+ vmovdqa ymm12,YMMWORD[$L$avx2_inc]
+ vpaddd ymm13,ymm12,YMMWORD[((160+160))+rbp]
+ vpaddd ymm12,ymm12,ymm13
+ vmovdqa YMMWORD[(160+160)+rbp],ymm12
+ vmovdqa YMMWORD[(160+192)+rbp],ymm13
+
+$L$seal_avx2_tail_256_rounds_and_3xhash:
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rdi,[16+rdi]
+$L$seal_avx2_tail_256_rounds_and_2xhash:
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,12
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,4
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpsrld ymm3,ymm5,20
+ vpslld ymm5,ymm5,12
+ vpxor ymm5,ymm5,ymm3
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpslld ymm3,ymm5,7
+ vpsrld ymm5,ymm5,25
+ vpxor ymm5,ymm5,ymm3
+ vpalignr ymm13,ymm13,ymm13,12
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm5,ymm5,ymm5,4
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,4
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,12
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpsrld ymm3,ymm5,20
+ vpslld ymm5,ymm5,12
+ vpxor ymm5,ymm5,ymm3
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpslld ymm3,ymm5,7
+ vpsrld ymm5,ymm5,25
+ vpxor ymm5,ymm5,ymm3
+ vpalignr ymm13,ymm13,ymm13,4
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm5,ymm5,ymm5,12
+ add r10,QWORD[((0+16))+rdi]
+ adc r11,QWORD[((8+16))+rdi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rdi,[32+rdi]
+ dec rcx
+ jg NEAR $L$seal_avx2_tail_256_rounds_and_3xhash
+ dec r8
+ jge NEAR $L$seal_avx2_tail_256_rounds_and_2xhash
+ vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp]
+ vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp]
+ vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
+ vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
+ vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
+ vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
+ vperm2i128 ymm3,ymm5,ymm1,0x02
+ vperm2i128 ymm5,ymm5,ymm1,0x13
+ vperm2i128 ymm1,ymm13,ymm9,0x02
+ vperm2i128 ymm9,ymm13,ymm9,0x13
+ vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi]
+ vpxor ymm1,ymm1,YMMWORD[((32+0))+rsi]
+ vpxor ymm5,ymm5,YMMWORD[((64+0))+rsi]
+ vpxor ymm9,ymm9,YMMWORD[((96+0))+rsi]
+ vmovdqu YMMWORD[(0+0)+rdi],ymm3
+ vmovdqu YMMWORD[(32+0)+rdi],ymm1
+ vmovdqu YMMWORD[(64+0)+rdi],ymm5
+ vmovdqu YMMWORD[(96+0)+rdi],ymm9
+ vperm2i128 ymm3,ymm4,ymm0,0x13
+ vperm2i128 ymm0,ymm4,ymm0,0x02
+ vperm2i128 ymm4,ymm12,ymm8,0x02
+ vperm2i128 ymm12,ymm12,ymm8,0x13
+ vmovdqa ymm8,ymm3
+
+ mov rcx,4*32
+ lea rsi,[128+rsi]
+ sub rbx,4*32
+ jmp NEAR $L$seal_avx2_short_hash_remainder
+
+$L$seal_avx2_tail_384:
+ vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
+ vmovdqa ymm4,YMMWORD[((160+64))+rbp]
+ vmovdqa ymm8,YMMWORD[((160+96))+rbp]
+ vmovdqa ymm1,ymm0
+ vmovdqa ymm5,ymm4
+ vmovdqa ymm9,ymm8
+ vmovdqa ymm2,ymm0
+ vmovdqa ymm6,ymm4
+ vmovdqa ymm10,ymm8
+ vmovdqa ymm12,YMMWORD[$L$avx2_inc]
+ vpaddd ymm14,ymm12,YMMWORD[((160+160))+rbp]
+ vpaddd ymm13,ymm12,ymm14
+ vpaddd ymm12,ymm12,ymm13
+ vmovdqa YMMWORD[(160+160)+rbp],ymm12
+ vmovdqa YMMWORD[(160+192)+rbp],ymm13
+ vmovdqa YMMWORD[(160+224)+rbp],ymm14
+
+$L$seal_avx2_tail_384_rounds_and_3xhash:
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rdi,[16+rdi]
+$L$seal_avx2_tail_384_rounds_and_2xhash:
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,12
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,4
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpsrld ymm3,ymm5,20
+ vpslld ymm5,ymm5,12
+ vpxor ymm5,ymm5,ymm3
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpslld ymm3,ymm5,7
+ vpsrld ymm5,ymm5,25
+ vpxor ymm5,ymm5,ymm3
+ vpalignr ymm13,ymm13,ymm13,12
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm5,ymm5,ymm5,4
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ vpaddd ymm2,ymm2,ymm6
+ vpxor ymm14,ymm14,ymm2
+ vpshufb ymm14,ymm14,YMMWORD[$L$rol16]
+ vpaddd ymm10,ymm10,ymm14
+ vpxor ymm6,ymm6,ymm10
+ vpsrld ymm3,ymm6,20
+ vpslld ymm6,ymm6,12
+ vpxor ymm6,ymm6,ymm3
+ vpaddd ymm2,ymm2,ymm6
+ vpxor ymm14,ymm14,ymm2
+ vpshufb ymm14,ymm14,YMMWORD[$L$rol8]
+ vpaddd ymm10,ymm10,ymm14
+ vpxor ymm6,ymm6,ymm10
+ vpslld ymm3,ymm6,7
+ vpsrld ymm6,ymm6,25
+ vpxor ymm6,ymm6,ymm3
+ vpalignr ymm14,ymm14,ymm14,12
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm6,ymm6,ymm6,4
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,4
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,12
+ add r10,QWORD[((0+16))+rdi]
+ adc r11,QWORD[((8+16))+rdi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpsrld ymm3,ymm5,20
+ vpslld ymm5,ymm5,12
+ vpxor ymm5,ymm5,ymm3
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpslld ymm3,ymm5,7
+ vpsrld ymm5,ymm5,25
+ vpxor ymm5,ymm5,ymm3
+ vpalignr ymm13,ymm13,ymm13,4
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm5,ymm5,ymm5,12
+ vpaddd ymm2,ymm2,ymm6
+ vpxor ymm14,ymm14,ymm2
+ vpshufb ymm14,ymm14,YMMWORD[$L$rol16]
+ vpaddd ymm10,ymm10,ymm14
+ vpxor ymm6,ymm6,ymm10
+ vpsrld ymm3,ymm6,20
+ vpslld ymm6,ymm6,12
+ vpxor ymm6,ymm6,ymm3
+ vpaddd ymm2,ymm2,ymm6
+ vpxor ymm14,ymm14,ymm2
+ vpshufb ymm14,ymm14,YMMWORD[$L$rol8]
+ vpaddd ymm10,ymm10,ymm14
+ vpxor ymm6,ymm6,ymm10
+ vpslld ymm3,ymm6,7
+ vpsrld ymm6,ymm6,25
+ vpxor ymm6,ymm6,ymm3
+ vpalignr ymm14,ymm14,ymm14,4
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm6,ymm6,ymm6,12
+
+ lea rdi,[32+rdi]
+ dec rcx
+ jg NEAR $L$seal_avx2_tail_384_rounds_and_3xhash
+ dec r8
+ jge NEAR $L$seal_avx2_tail_384_rounds_and_2xhash
+ vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp]
+ vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp]
+ vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp]
+ vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp]
+ vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp]
+ vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
+ vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
+ vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
+ vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
+ vperm2i128 ymm3,ymm6,ymm2,0x02
+ vperm2i128 ymm6,ymm6,ymm2,0x13
+ vperm2i128 ymm2,ymm14,ymm10,0x02
+ vperm2i128 ymm10,ymm14,ymm10,0x13
+ vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi]
+ vpxor ymm2,ymm2,YMMWORD[((32+0))+rsi]
+ vpxor ymm6,ymm6,YMMWORD[((64+0))+rsi]
+ vpxor ymm10,ymm10,YMMWORD[((96+0))+rsi]
+ vmovdqu YMMWORD[(0+0)+rdi],ymm3
+ vmovdqu YMMWORD[(32+0)+rdi],ymm2
+ vmovdqu YMMWORD[(64+0)+rdi],ymm6
+ vmovdqu YMMWORD[(96+0)+rdi],ymm10
+ vperm2i128 ymm3,ymm5,ymm1,0x02
+ vperm2i128 ymm5,ymm5,ymm1,0x13
+ vperm2i128 ymm1,ymm13,ymm9,0x02
+ vperm2i128 ymm9,ymm13,ymm9,0x13
+ vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi]
+ vpxor ymm1,ymm1,YMMWORD[((32+128))+rsi]
+ vpxor ymm5,ymm5,YMMWORD[((64+128))+rsi]
+ vpxor ymm9,ymm9,YMMWORD[((96+128))+rsi]
+ vmovdqu YMMWORD[(0+128)+rdi],ymm3
+ vmovdqu YMMWORD[(32+128)+rdi],ymm1
+ vmovdqu YMMWORD[(64+128)+rdi],ymm5
+ vmovdqu YMMWORD[(96+128)+rdi],ymm9
+ vperm2i128 ymm3,ymm4,ymm0,0x13
+ vperm2i128 ymm0,ymm4,ymm0,0x02
+ vperm2i128 ymm4,ymm12,ymm8,0x02
+ vperm2i128 ymm12,ymm12,ymm8,0x13
+ vmovdqa ymm8,ymm3
+
+ mov rcx,8*32
+ lea rsi,[256+rsi]
+ sub rbx,8*32
+ jmp NEAR $L$seal_avx2_short_hash_remainder
+
+$L$seal_avx2_tail_512:
+ vmovdqa ymm0,YMMWORD[$L$chacha20_consts]
+ vmovdqa ymm4,YMMWORD[((160+64))+rbp]
+ vmovdqa ymm8,YMMWORD[((160+96))+rbp]
+ vmovdqa ymm1,ymm0
+ vmovdqa ymm5,ymm4
+ vmovdqa ymm9,ymm8
+ vmovdqa ymm2,ymm0
+ vmovdqa ymm6,ymm4
+ vmovdqa ymm10,ymm8
+ vmovdqa ymm3,ymm0
+ vmovdqa ymm7,ymm4
+ vmovdqa ymm11,ymm8
+ vmovdqa ymm12,YMMWORD[$L$avx2_inc]
+ vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp]
+ vpaddd ymm14,ymm12,ymm15
+ vpaddd ymm13,ymm12,ymm14
+ vpaddd ymm12,ymm12,ymm13
+ vmovdqa YMMWORD[(160+256)+rbp],ymm15
+ vmovdqa YMMWORD[(160+224)+rbp],ymm14
+ vmovdqa YMMWORD[(160+192)+rbp],ymm13
+ vmovdqa YMMWORD[(160+160)+rbp],ymm12
+
+$L$seal_avx2_tail_512_rounds_and_3xhash:
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ add r15,rax
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rdi,[16+rdi]
+$L$seal_avx2_tail_512_rounds_and_2xhash:
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol16]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,20
+ vpslld ymm7,ymm7,32-20
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,20
+ vpslld ymm6,ymm6,32-20
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,20
+ vpslld ymm5,ymm5,32-20
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,20
+ vpslld ymm4,ymm4,32-20
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol8]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,25
+ vpslld ymm7,ymm7,32-25
+ vpxor ymm7,ymm7,ymm8
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ vpsrld ymm8,ymm6,25
+ vpslld ymm6,ymm6,32-25
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,25
+ vpslld ymm5,ymm5,32-25
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,25
+ vpslld ymm4,ymm4,32-25
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[((160+128))+rbp]
+ vpalignr ymm7,ymm7,ymm7,4
+ vpalignr ymm11,ymm11,ymm11,8
+ vpalignr ymm15,ymm15,ymm15,12
+ vpalignr ymm6,ymm6,ymm6,4
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm14,ymm14,ymm14,12
+ vpalignr ymm5,ymm5,ymm5,4
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm13,ymm13,ymm13,12
+ vpalignr ymm4,ymm4,ymm4,4
+ add r15,rax
+ adc r9,rdx
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm12,ymm12,ymm12,12
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol16]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,20
+ vpslld ymm7,ymm7,32-20
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,20
+ vpslld ymm6,ymm6,32-20
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,20
+ vpslld ymm5,ymm5,32-20
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,20
+ vpslld ymm4,ymm4,32-20
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[$L$rol8]
+ vpaddd ymm3,ymm3,ymm7
+ vpaddd ymm2,ymm2,ymm6
+ add r10,QWORD[((0+16))+rdi]
+ adc r11,QWORD[((8+16))+rdi]
+ adc r12,1
+ vpaddd ymm1,ymm1,ymm5
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm15,ymm15,ymm3
+ vpxor ymm14,ymm14,ymm2
+ vpxor ymm13,ymm13,ymm1
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm15,ymm15,ymm8
+ vpshufb ymm14,ymm14,ymm8
+ vpshufb ymm13,ymm13,ymm8
+ vpshufb ymm12,ymm12,ymm8
+ vpaddd ymm11,ymm11,ymm15
+ vpaddd ymm10,ymm10,ymm14
+ vpaddd ymm9,ymm9,ymm13
+ vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp]
+ vpxor ymm7,ymm7,ymm11
+ vpxor ymm6,ymm6,ymm10
+ vpxor ymm5,ymm5,ymm9
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8
+ vpsrld ymm8,ymm7,25
+ mov rdx,QWORD[((0+160+0))+rbp]
+ mov r15,rdx
+ mulx r14,r13,r10
+ mulx rdx,rax,r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ vpslld ymm7,ymm7,32-25
+ vpxor ymm7,ymm7,ymm8
+ vpsrld ymm8,ymm6,25
+ vpslld ymm6,ymm6,32-25
+ vpxor ymm6,ymm6,ymm8
+ vpsrld ymm8,ymm5,25
+ vpslld ymm5,ymm5,32-25
+ vpxor ymm5,ymm5,ymm8
+ vpsrld ymm8,ymm4,25
+ vpslld ymm4,ymm4,32-25
+ vpxor ymm4,ymm4,ymm8
+ vmovdqa ymm8,YMMWORD[((160+128))+rbp]
+ vpalignr ymm7,ymm7,ymm7,12
+ vpalignr ymm11,ymm11,ymm11,8
+ vpalignr ymm15,ymm15,ymm15,4
+ vpalignr ymm6,ymm6,ymm6,12
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm14,ymm14,ymm14,4
+ vpalignr ymm5,ymm5,ymm5,12
+ vpalignr ymm9,ymm9,ymm9,8
+ mov rdx,QWORD[((8+160+0))+rbp]
+ mulx rax,r10,r10
+ add r14,r10
+ mulx r9,r11,r11
+ adc r15,r11
+ adc r9,0
+ imul rdx,r12
+ vpalignr ymm13,ymm13,ymm13,4
+ vpalignr ymm4,ymm4,ymm4,12
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm12,ymm12,ymm12,4
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ add r15,rax
+ adc r9,rdx
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rdi,[32+rdi]
+ dec rcx
+ jg NEAR $L$seal_avx2_tail_512_rounds_and_3xhash
+ dec r8
+ jge NEAR $L$seal_avx2_tail_512_rounds_and_2xhash
+ vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp]
+ vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp]
+ vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp]
+ vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp]
+ vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp]
+ vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp]
+ vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp]
+ vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp]
+ vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
+ vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp]
+ vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp]
+ vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
+
+ vmovdqa YMMWORD[(160+128)+rbp],ymm0
+ vperm2i128 ymm0,ymm7,ymm3,0x02
+ vperm2i128 ymm7,ymm7,ymm3,0x13
+ vperm2i128 ymm3,ymm15,ymm11,0x02
+ vperm2i128 ymm11,ymm15,ymm11,0x13
+ vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi]
+ vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi]
+ vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi]
+ vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi]
+ vmovdqu YMMWORD[(0+0)+rdi],ymm0
+ vmovdqu YMMWORD[(32+0)+rdi],ymm3
+ vmovdqu YMMWORD[(64+0)+rdi],ymm7
+ vmovdqu YMMWORD[(96+0)+rdi],ymm11
+
+ vmovdqa ymm0,YMMWORD[((160+128))+rbp]
+ vperm2i128 ymm3,ymm6,ymm2,0x02
+ vperm2i128 ymm6,ymm6,ymm2,0x13
+ vperm2i128 ymm2,ymm14,ymm10,0x02
+ vperm2i128 ymm10,ymm14,ymm10,0x13
+ vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi]
+ vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi]
+ vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi]
+ vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi]
+ vmovdqu YMMWORD[(0+128)+rdi],ymm3
+ vmovdqu YMMWORD[(32+128)+rdi],ymm2
+ vmovdqu YMMWORD[(64+128)+rdi],ymm6
+ vmovdqu YMMWORD[(96+128)+rdi],ymm10
+ vperm2i128 ymm3,ymm5,ymm1,0x02
+ vperm2i128 ymm5,ymm5,ymm1,0x13
+ vperm2i128 ymm1,ymm13,ymm9,0x02
+ vperm2i128 ymm9,ymm13,ymm9,0x13
+ vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi]
+ vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi]
+ vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi]
+ vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi]
+ vmovdqu YMMWORD[(0+256)+rdi],ymm3
+ vmovdqu YMMWORD[(32+256)+rdi],ymm1
+ vmovdqu YMMWORD[(64+256)+rdi],ymm5
+ vmovdqu YMMWORD[(96+256)+rdi],ymm9
+ vperm2i128 ymm3,ymm4,ymm0,0x13
+ vperm2i128 ymm0,ymm4,ymm0,0x02
+ vperm2i128 ymm4,ymm12,ymm8,0x02
+ vperm2i128 ymm12,ymm12,ymm8,0x13
+ vmovdqa ymm8,ymm3
+
+ mov rcx,12*32
+ lea rsi,[384+rsi]
+ sub rbx,12*32
+ jmp NEAR $L$seal_avx2_short_hash_remainder
+
+$L$seal_avx2_320:
+ vmovdqa ymm1,ymm0
+ vmovdqa ymm2,ymm0
+ vmovdqa ymm5,ymm4
+ vmovdqa ymm6,ymm4
+ vmovdqa ymm9,ymm8
+ vmovdqa ymm10,ymm8
+ vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc]
+ vpaddd ymm14,ymm13,YMMWORD[$L$avx2_inc]
+ vmovdqa ymm7,ymm4
+ vmovdqa ymm11,ymm8
+ vmovdqa YMMWORD[(160+160)+rbp],ymm12
+ vmovdqa YMMWORD[(160+192)+rbp],ymm13
+ vmovdqa YMMWORD[(160+224)+rbp],ymm14
+ mov r10,10
+$L$seal_avx2_320_rounds:
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,12
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,4
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpsrld ymm3,ymm5,20
+ vpslld ymm5,ymm5,12
+ vpxor ymm5,ymm5,ymm3
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpslld ymm3,ymm5,7
+ vpsrld ymm5,ymm5,25
+ vpxor ymm5,ymm5,ymm3
+ vpalignr ymm13,ymm13,ymm13,12
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm5,ymm5,ymm5,4
+ vpaddd ymm2,ymm2,ymm6
+ vpxor ymm14,ymm14,ymm2
+ vpshufb ymm14,ymm14,YMMWORD[$L$rol16]
+ vpaddd ymm10,ymm10,ymm14
+ vpxor ymm6,ymm6,ymm10
+ vpsrld ymm3,ymm6,20
+ vpslld ymm6,ymm6,12
+ vpxor ymm6,ymm6,ymm3
+ vpaddd ymm2,ymm2,ymm6
+ vpxor ymm14,ymm14,ymm2
+ vpshufb ymm14,ymm14,YMMWORD[$L$rol8]
+ vpaddd ymm10,ymm10,ymm14
+ vpxor ymm6,ymm6,ymm10
+ vpslld ymm3,ymm6,7
+ vpsrld ymm6,ymm6,25
+ vpxor ymm6,ymm6,ymm3
+ vpalignr ymm14,ymm14,ymm14,12
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm6,ymm6,ymm6,4
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,4
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,12
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpsrld ymm3,ymm5,20
+ vpslld ymm5,ymm5,12
+ vpxor ymm5,ymm5,ymm3
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpslld ymm3,ymm5,7
+ vpsrld ymm5,ymm5,25
+ vpxor ymm5,ymm5,ymm3
+ vpalignr ymm13,ymm13,ymm13,4
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm5,ymm5,ymm5,12
+ vpaddd ymm2,ymm2,ymm6
+ vpxor ymm14,ymm14,ymm2
+ vpshufb ymm14,ymm14,YMMWORD[$L$rol16]
+ vpaddd ymm10,ymm10,ymm14
+ vpxor ymm6,ymm6,ymm10
+ vpsrld ymm3,ymm6,20
+ vpslld ymm6,ymm6,12
+ vpxor ymm6,ymm6,ymm3
+ vpaddd ymm2,ymm2,ymm6
+ vpxor ymm14,ymm14,ymm2
+ vpshufb ymm14,ymm14,YMMWORD[$L$rol8]
+ vpaddd ymm10,ymm10,ymm14
+ vpxor ymm6,ymm6,ymm10
+ vpslld ymm3,ymm6,7
+ vpsrld ymm6,ymm6,25
+ vpxor ymm6,ymm6,ymm3
+ vpalignr ymm14,ymm14,ymm14,4
+ vpalignr ymm10,ymm10,ymm10,8
+ vpalignr ymm6,ymm6,ymm6,12
+
+ dec r10
+ jne NEAR $L$seal_avx2_320_rounds
+ vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts]
+ vpaddd ymm4,ymm4,ymm7
+ vpaddd ymm5,ymm5,ymm7
+ vpaddd ymm6,ymm6,ymm7
+ vpaddd ymm8,ymm8,ymm11
+ vpaddd ymm9,ymm9,ymm11
+ vpaddd ymm10,ymm10,ymm11
+ vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp]
+ vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp]
+ vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp]
+ vperm2i128 ymm3,ymm4,ymm0,0x02
+
+ vpand ymm3,ymm3,YMMWORD[$L$clamp]
+ vmovdqa YMMWORD[(160+0)+rbp],ymm3
+
+ vperm2i128 ymm0,ymm4,ymm0,0x13
+ vperm2i128 ymm4,ymm12,ymm8,0x13
+ vperm2i128 ymm8,ymm5,ymm1,0x02
+ vperm2i128 ymm12,ymm13,ymm9,0x02
+ vperm2i128 ymm1,ymm5,ymm1,0x13
+ vperm2i128 ymm5,ymm13,ymm9,0x13
+ vperm2i128 ymm9,ymm6,ymm2,0x02
+ vperm2i128 ymm13,ymm14,ymm10,0x02
+ vperm2i128 ymm2,ymm6,ymm2,0x13
+ vperm2i128 ymm6,ymm14,ymm10,0x13
+ jmp NEAR $L$seal_avx2_short
+
+$L$seal_avx2_192:
+ vmovdqa ymm1,ymm0
+ vmovdqa ymm2,ymm0
+ vmovdqa ymm5,ymm4
+ vmovdqa ymm6,ymm4
+ vmovdqa ymm9,ymm8
+ vmovdqa ymm10,ymm8
+ vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc]
+ vmovdqa ymm11,ymm12
+ vmovdqa ymm15,ymm13
+ mov r10,10
+$L$seal_avx2_192_rounds:
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,12
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,4
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpsrld ymm3,ymm5,20
+ vpslld ymm5,ymm5,12
+ vpxor ymm5,ymm5,ymm3
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpslld ymm3,ymm5,7
+ vpsrld ymm5,ymm5,25
+ vpxor ymm5,ymm5,ymm3
+ vpalignr ymm13,ymm13,ymm13,12
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm5,ymm5,ymm5,4
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol16]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpsrld ymm3,ymm4,20
+ vpslld ymm4,ymm4,12
+ vpxor ymm4,ymm4,ymm3
+ vpaddd ymm0,ymm0,ymm4
+ vpxor ymm12,ymm12,ymm0
+ vpshufb ymm12,ymm12,YMMWORD[$L$rol8]
+ vpaddd ymm8,ymm8,ymm12
+ vpxor ymm4,ymm4,ymm8
+ vpslld ymm3,ymm4,7
+ vpsrld ymm4,ymm4,25
+ vpxor ymm4,ymm4,ymm3
+ vpalignr ymm12,ymm12,ymm12,4
+ vpalignr ymm8,ymm8,ymm8,8
+ vpalignr ymm4,ymm4,ymm4,12
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol16]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpsrld ymm3,ymm5,20
+ vpslld ymm5,ymm5,12
+ vpxor ymm5,ymm5,ymm3
+ vpaddd ymm1,ymm1,ymm5
+ vpxor ymm13,ymm13,ymm1
+ vpshufb ymm13,ymm13,YMMWORD[$L$rol8]
+ vpaddd ymm9,ymm9,ymm13
+ vpxor ymm5,ymm5,ymm9
+ vpslld ymm3,ymm5,7
+ vpsrld ymm5,ymm5,25
+ vpxor ymm5,ymm5,ymm3
+ vpalignr ymm13,ymm13,ymm13,4
+ vpalignr ymm9,ymm9,ymm9,8
+ vpalignr ymm5,ymm5,ymm5,12
+
+ dec r10
+ jne NEAR $L$seal_avx2_192_rounds
+ vpaddd ymm0,ymm0,ymm2
+ vpaddd ymm1,ymm1,ymm2
+ vpaddd ymm4,ymm4,ymm6
+ vpaddd ymm5,ymm5,ymm6
+ vpaddd ymm8,ymm8,ymm10
+ vpaddd ymm9,ymm9,ymm10
+ vpaddd ymm12,ymm12,ymm11
+ vpaddd ymm13,ymm13,ymm15
+ vperm2i128 ymm3,ymm4,ymm0,0x02
+
+ vpand ymm3,ymm3,YMMWORD[$L$clamp]
+ vmovdqa YMMWORD[(160+0)+rbp],ymm3
+
+ vperm2i128 ymm0,ymm4,ymm0,0x13
+ vperm2i128 ymm4,ymm12,ymm8,0x13
+ vperm2i128 ymm8,ymm5,ymm1,0x02
+ vperm2i128 ymm12,ymm13,ymm9,0x02
+ vperm2i128 ymm1,ymm5,ymm1,0x13
+ vperm2i128 ymm5,ymm13,ymm9,0x13
+$L$seal_avx2_short:
+ mov r8,r8
+ call poly_hash_ad_internal
+ xor rcx,rcx
+$L$seal_avx2_short_hash_remainder:
+ cmp rcx,16
+ jb NEAR $L$seal_avx2_short_loop
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ sub rcx,16
+ add rdi,16
+ jmp NEAR $L$seal_avx2_short_hash_remainder
+$L$seal_avx2_short_loop:
+ cmp rbx,32
+ jb NEAR $L$seal_avx2_short_tail
+ sub rbx,32
+
+ vpxor ymm0,ymm0,YMMWORD[rsi]
+ vmovdqu YMMWORD[rdi],ymm0
+ lea rsi,[32+rsi]
+
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+ add r10,QWORD[((0+16))+rdi]
+ adc r11,QWORD[((8+16))+rdi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rdi,[32+rdi]
+
+ vmovdqa ymm0,ymm4
+ vmovdqa ymm4,ymm8
+ vmovdqa ymm8,ymm12
+ vmovdqa ymm12,ymm1
+ vmovdqa ymm1,ymm5
+ vmovdqa ymm5,ymm9
+ vmovdqa ymm9,ymm13
+ vmovdqa ymm13,ymm2
+ vmovdqa ymm2,ymm6
+ jmp NEAR $L$seal_avx2_short_loop
+$L$seal_avx2_short_tail:
+ cmp rbx,16
+ jb NEAR $L$seal_avx2_exit
+ sub rbx,16
+ vpxor xmm3,xmm0,XMMWORD[rsi]
+ vmovdqu XMMWORD[rdi],xmm3
+ lea rsi,[16+rsi]
+ add r10,QWORD[((0+0))+rdi]
+ adc r11,QWORD[((8+0))+rdi]
+ adc r12,1
+ mov rax,QWORD[((0+160+0))+rbp]
+ mov r15,rax
+ mul r10
+ mov r13,rax
+ mov r14,rdx
+ mov rax,QWORD[((0+160+0))+rbp]
+ mul r11
+ imul r15,r12
+ add r14,rax
+ adc r15,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mov r9,rax
+ mul r10
+ add r14,rax
+ adc rdx,0
+ mov r10,rdx
+ mov rax,QWORD[((8+160+0))+rbp]
+ mul r11
+ add r15,rax
+ adc rdx,0
+ imul r9,r12
+ add r15,r10
+ adc r9,rdx
+ mov r10,r13
+ mov r11,r14
+ mov r12,r15
+ and r12,3
+ mov r13,r15
+ and r13,-4
+ mov r14,r9
+ shrd r15,r9,2
+ shr r9,2
+ add r15,r13
+ adc r9,r14
+ add r10,r15
+ adc r11,r9
+ adc r12,0
+
+ lea rdi,[16+rdi]
+ vextracti128 xmm0,ymm0,1
+$L$seal_avx2_exit:
+ vzeroupper
+ jmp NEAR $L$seal_sse_tail_16
+
+
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/crypto/err_data.c b/gen/crypto/err_data.c
new file mode 100644
index 0000000..898825a
--- /dev/null
+++ b/gen/crypto/err_data.c
@@ -0,0 +1,1512 @@
+/* Copyright (c) 2015, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+ /* This file was generated by go run ./util/pregenerate. */
+
+#include <openssl/base.h>
+#include <openssl/err.h>
+
+#include <assert.h>
+
+static_assert(ERR_LIB_NONE == 1, "library value changed");
+static_assert(ERR_LIB_SYS == 2, "library value changed");
+static_assert(ERR_LIB_BN == 3, "library value changed");
+static_assert(ERR_LIB_RSA == 4, "library value changed");
+static_assert(ERR_LIB_DH == 5, "library value changed");
+static_assert(ERR_LIB_EVP == 6, "library value changed");
+static_assert(ERR_LIB_BUF == 7, "library value changed");
+static_assert(ERR_LIB_OBJ == 8, "library value changed");
+static_assert(ERR_LIB_PEM == 9, "library value changed");
+static_assert(ERR_LIB_DSA == 10, "library value changed");
+static_assert(ERR_LIB_X509 == 11, "library value changed");
+static_assert(ERR_LIB_ASN1 == 12, "library value changed");
+static_assert(ERR_LIB_CONF == 13, "library value changed");
+static_assert(ERR_LIB_CRYPTO == 14, "library value changed");
+static_assert(ERR_LIB_EC == 15, "library value changed");
+static_assert(ERR_LIB_SSL == 16, "library value changed");
+static_assert(ERR_LIB_BIO == 17, "library value changed");
+static_assert(ERR_LIB_PKCS7 == 18, "library value changed");
+static_assert(ERR_LIB_PKCS8 == 19, "library value changed");
+static_assert(ERR_LIB_X509V3 == 20, "library value changed");
+static_assert(ERR_LIB_RAND == 21, "library value changed");
+static_assert(ERR_LIB_ENGINE == 22, "library value changed");
+static_assert(ERR_LIB_OCSP == 23, "library value changed");
+static_assert(ERR_LIB_UI == 24, "library value changed");
+static_assert(ERR_LIB_COMP == 25, "library value changed");
+static_assert(ERR_LIB_ECDSA == 26, "library value changed");
+static_assert(ERR_LIB_ECDH == 27, "library value changed");
+static_assert(ERR_LIB_HMAC == 28, "library value changed");
+static_assert(ERR_LIB_DIGEST == 29, "library value changed");
+static_assert(ERR_LIB_CIPHER == 30, "library value changed");
+static_assert(ERR_LIB_HKDF == 31, "library value changed");
+static_assert(ERR_LIB_TRUST_TOKEN == 32, "library value changed");
+static_assert(ERR_LIB_USER == 33, "library value changed");
+static_assert(ERR_NUM_LIBS == 34, "number of libraries changed");
+
+const uint32_t kOpenSSLReasonValues[] = {
+ 0xc320885,
+ 0xc32889f,
+ 0xc3308ae,
+ 0xc3388be,
+ 0xc3408cd,
+ 0xc3488e6,
+ 0xc3508f2,
+ 0xc35890f,
+ 0xc36092f,
+ 0xc36893d,
+ 0xc37094d,
+ 0xc37895a,
+ 0xc38096a,
+ 0xc388975,
+ 0xc39098b,
+ 0xc39899a,
+ 0xc3a09ae,
+ 0xc3a8892,
+ 0xc3b00f7,
+ 0xc3b8921,
+ 0x10320892,
+ 0x10329641,
+ 0x1033164d,
+ 0x10339666,
+ 0x10341679,
+ 0x10348d4e,
+ 0x10350cdf,
+ 0x1035968c,
+ 0x103616b6,
+ 0x103696c9,
+ 0x103716e8,
+ 0x10379701,
+ 0x10381716,
+ 0x10389734,
+ 0x10391743,
+ 0x1039975f,
+ 0x103a177a,
+ 0x103a9789,
+ 0x103b17a5,
+ 0x103b97c0,
+ 0x103c17e6,
+ 0x103c80f7,
+ 0x103d17f7,
+ 0x103d980b,
+ 0x103e182a,
+ 0x103e9839,
+ 0x103f1850,
+ 0x103f9863,
+ 0x10400ca3,
+ 0x10409876,
+ 0x10411894,
+ 0x104198a7,
+ 0x104218c1,
+ 0x104298d1,
+ 0x104318e5,
+ 0x104398fb,
+ 0x10441913,
+ 0x10449928,
+ 0x1045193c,
+ 0x1045994e,
+ 0x10460635,
+ 0x1046899a,
+ 0x10471963,
+ 0x1047997a,
+ 0x1048198f,
+ 0x1048999d,
+ 0x10490f57,
+ 0x104997d7,
+ 0x104a16a1,
+ 0x14320c73,
+ 0x14328c94,
+ 0x14330ca3,
+ 0x14338cb5,
+ 0x143400b9,
+ 0x143480f7,
+ 0x14350c81,
+ 0x18320090,
+ 0x18328fe9,
+ 0x183300b9,
+ 0x18338fff,
+ 0x18341013,
+ 0x183480f7,
+ 0x18351032,
+ 0x1835904a,
+ 0x1836105f,
+ 0x18369073,
+ 0x183710ab,
+ 0x183790c1,
+ 0x183810d5,
+ 0x183890e5,
+ 0x18390ac0,
+ 0x183990f5,
+ 0x183a111b,
+ 0x183a9141,
+ 0x183b0ceb,
+ 0x183b9190,
+ 0x183c11a2,
+ 0x183c91ad,
+ 0x183d11bd,
+ 0x183d91ce,
+ 0x183e11df,
+ 0x183e91f1,
+ 0x183f121a,
+ 0x183f9233,
+ 0x1840124b,
+ 0x1840870d,
+ 0x18411164,
+ 0x1841912f,
+ 0x1842114e,
+ 0x18428c81,
+ 0x1843110a,
+ 0x18439176,
+ 0x18441028,
+ 0x18449097,
+ 0x20321285,
+ 0x20329272,
+ 0x24321291,
+ 0x243289e0,
+ 0x243312a3,
+ 0x243392b0,
+ 0x243412bd,
+ 0x243492cf,
+ 0x243512de,
+ 0x243592fb,
+ 0x24361308,
+ 0x24369316,
+ 0x24371324,
+ 0x24379332,
+ 0x2438133b,
+ 0x24389348,
+ 0x2439135b,
+ 0x28320cd3,
+ 0x28328ceb,
+ 0x28330ca3,
+ 0x28338cfe,
+ 0x28340cdf,
+ 0x283480b9,
+ 0x283500f7,
+ 0x28358c81,
+ 0x2836099a,
+ 0x2c3232e7,
+ 0x2c329372,
+ 0x2c3332f5,
+ 0x2c33b307,
+ 0x2c34331b,
+ 0x2c34b32d,
+ 0x2c353348,
+ 0x2c35b35a,
+ 0x2c36338a,
+ 0x2c36833a,
+ 0x2c373397,
+ 0x2c37b3c3,
+ 0x2c383401,
+ 0x2c38b418,
+ 0x2c393436,
+ 0x2c39b446,
+ 0x2c3a3458,
+ 0x2c3ab46c,
+ 0x2c3b347d,
+ 0x2c3bb49c,
+ 0x2c3c1384,
+ 0x2c3c939a,
+ 0x2c3d34e1,
+ 0x2c3d93b3,
+ 0x2c3e350b,
+ 0x2c3eb519,
+ 0x2c3f3531,
+ 0x2c3fb549,
+ 0x2c403573,
+ 0x2c409285,
+ 0x2c413584,
+ 0x2c41b597,
+ 0x2c42124b,
+ 0x2c42b5a8,
+ 0x2c43076d,
+ 0x2c43b48e,
+ 0x2c4433d6,
+ 0x2c44b556,
+ 0x2c45336d,
+ 0x2c45b3a9,
+ 0x2c463426,
+ 0x2c46b4b0,
+ 0x2c4734c5,
+ 0x2c47b4fe,
+ 0x2c4833e8,
+ 0x30320000,
+ 0x30328015,
+ 0x3033001f,
+ 0x30338038,
+ 0x30340057,
+ 0x30348071,
+ 0x30350078,
+ 0x30358090,
+ 0x303600a1,
+ 0x303680b9,
+ 0x303700c6,
+ 0x303780d5,
+ 0x303800f7,
+ 0x30388104,
+ 0x30390117,
+ 0x30398132,
+ 0x303a0147,
+ 0x303a815b,
+ 0x303b016f,
+ 0x303b8180,
+ 0x303c0199,
+ 0x303c81b6,
+ 0x303d01c4,
+ 0x303d81d8,
+ 0x303e01e8,
+ 0x303e8201,
+ 0x303f0211,
+ 0x303f8224,
+ 0x30400233,
+ 0x3040823f,
+ 0x30410254,
+ 0x30418264,
+ 0x3042027b,
+ 0x30428288,
+ 0x3043029b,
+ 0x304382aa,
+ 0x304402bf,
+ 0x304482e0,
+ 0x304502f3,
+ 0x30458306,
+ 0x3046031f,
+ 0x3046833a,
+ 0x30470372,
+ 0x30478384,
+ 0x304803a2,
+ 0x304883b3,
+ 0x304903c2,
+ 0x304983da,
+ 0x304a03ec,
+ 0x304a8400,
+ 0x304b0418,
+ 0x304b842b,
+ 0x304c0436,
+ 0x304c8447,
+ 0x304d0453,
+ 0x304d8469,
+ 0x304e0477,
+ 0x304e848d,
+ 0x304f049f,
+ 0x304f84b1,
+ 0x305004d4,
+ 0x305084e7,
+ 0x305104f8,
+ 0x30518508,
+ 0x30520520,
+ 0x30528535,
+ 0x3053054d,
+ 0x30538561,
+ 0x30540579,
+ 0x30548592,
+ 0x305505ab,
+ 0x305585c8,
+ 0x305605d3,
+ 0x305685eb,
+ 0x305705fb,
+ 0x3057860c,
+ 0x3058061f,
+ 0x30588635,
+ 0x3059063e,
+ 0x30598653,
+ 0x305a0666,
+ 0x305a8675,
+ 0x305b0695,
+ 0x305b86a4,
+ 0x305c06c5,
+ 0x305c86e1,
+ 0x305d06ed,
+ 0x305d870d,
+ 0x305e0729,
+ 0x305e874d,
+ 0x305f0763,
+ 0x305f876d,
+ 0x306004c4,
+ 0x3060804a,
+ 0x30610357,
+ 0x3061873a,
+ 0x30620392,
+ 0x34320bb0,
+ 0x34328bc4,
+ 0x34330be1,
+ 0x34338bf4,
+ 0x34340c03,
+ 0x34348c5d,
+ 0x34350c41,
+ 0x34358c20,
+ 0x3c320090,
+ 0x3c328da0,
+ 0x3c330db9,
+ 0x3c338dd4,
+ 0x3c340df1,
+ 0x3c348e1b,
+ 0x3c350e36,
+ 0x3c358e5c,
+ 0x3c360e75,
+ 0x3c368e8d,
+ 0x3c370e9e,
+ 0x3c378eac,
+ 0x3c380eb9,
+ 0x3c388ecd,
+ 0x3c390ceb,
+ 0x3c398ef0,
+ 0x3c3a0f04,
+ 0x3c3a895a,
+ 0x3c3b0f14,
+ 0x3c3b8f2f,
+ 0x3c3c0f41,
+ 0x3c3c8f74,
+ 0x3c3d0f7e,
+ 0x3c3d8f92,
+ 0x3c3e0fa0,
+ 0x3c3e8fc5,
+ 0x3c3f0d8c,
+ 0x3c3f8fae,
+ 0x3c4000b9,
+ 0x3c4080f7,
+ 0x3c410e0c,
+ 0x3c418e4b,
+ 0x3c420f57,
+ 0x3c428ee1,
+ 0x40321a2f,
+ 0x40329a45,
+ 0x40331a73,
+ 0x40339a7d,
+ 0x40341a94,
+ 0x40349ab2,
+ 0x40351ac2,
+ 0x40359ad4,
+ 0x40361ae1,
+ 0x40369aed,
+ 0x40371b02,
+ 0x40379b14,
+ 0x40381b1f,
+ 0x40389b31,
+ 0x40390d4e,
+ 0x40399b41,
+ 0x403a1b54,
+ 0x403a9b75,
+ 0x403b1b86,
+ 0x403b9b96,
+ 0x403c0071,
+ 0x403c8090,
+ 0x403d1bf7,
+ 0x403d9c0d,
+ 0x403e1c1c,
+ 0x403e9c54,
+ 0x403f1c6e,
+ 0x403f9c96,
+ 0x40401cab,
+ 0x40409cbf,
+ 0x40411cfa,
+ 0x40419d15,
+ 0x40421d2e,
+ 0x40429d41,
+ 0x40431d55,
+ 0x40439d83,
+ 0x40441d9a,
+ 0x404480b9,
+ 0x40451daf,
+ 0x40459dc1,
+ 0x40461de5,
+ 0x40469e05,
+ 0x40471e13,
+ 0x40479e3a,
+ 0x40481eab,
+ 0x40489f65,
+ 0x40491f7c,
+ 0x40499f96,
+ 0x404a1fad,
+ 0x404a9fcb,
+ 0x404b1fe3,
+ 0x404ba010,
+ 0x404c2026,
+ 0x404ca038,
+ 0x404d2059,
+ 0x404da092,
+ 0x404e20a6,
+ 0x404ea0b3,
+ 0x404f2164,
+ 0x404fa1da,
+ 0x40502249,
+ 0x4050a25d,
+ 0x40512290,
+ 0x405222a0,
+ 0x4052a2c4,
+ 0x405322dc,
+ 0x4053a2ef,
+ 0x40542304,
+ 0x4054a327,
+ 0x40552352,
+ 0x4055a38f,
+ 0x405623b4,
+ 0x4056a3cd,
+ 0x405723e5,
+ 0x4057a3f8,
+ 0x4058240d,
+ 0x4058a434,
+ 0x40592463,
+ 0x4059a490,
+ 0x405aa4a4,
+ 0x405b24bc,
+ 0x405ba4cd,
+ 0x405c24e0,
+ 0x405ca51f,
+ 0x405d252c,
+ 0x405da551,
+ 0x405e258f,
+ 0x405e8afe,
+ 0x405f25b0,
+ 0x405fa5bd,
+ 0x406025cb,
+ 0x4060a5ed,
+ 0x4061264e,
+ 0x4061a686,
+ 0x4062269d,
+ 0x4062a6ae,
+ 0x406326fb,
+ 0x4063a710,
+ 0x40642727,
+ 0x4064a753,
+ 0x4065276e,
+ 0x4065a785,
+ 0x4066279d,
+ 0x4066a7c7,
+ 0x406727f2,
+ 0x4067a837,
+ 0x4068287f,
+ 0x4068a8a0,
+ 0x406928d2,
+ 0x4069a900,
+ 0x406a2921,
+ 0x406aa941,
+ 0x406b2ac9,
+ 0x406baaec,
+ 0x406c2b02,
+ 0x406cae0c,
+ 0x406d2e3b,
+ 0x406dae63,
+ 0x406e2e91,
+ 0x406eaede,
+ 0x406f2f37,
+ 0x406faf6f,
+ 0x40702f82,
+ 0x4070af9f,
+ 0x4071084d,
+ 0x4071afb1,
+ 0x40722fc4,
+ 0x4072affa,
+ 0x40733012,
+ 0x4073959c,
+ 0x40743026,
+ 0x4074b040,
+ 0x40753051,
+ 0x4075b065,
+ 0x40763073,
+ 0x40769348,
+ 0x40773098,
+ 0x4077b0d8,
+ 0x407830f3,
+ 0x4078b12c,
+ 0x40793143,
+ 0x4079b159,
+ 0x407a3185,
+ 0x407ab198,
+ 0x407b31ad,
+ 0x407bb1bf,
+ 0x407c31f0,
+ 0x407cb1f9,
+ 0x407d28bb,
+ 0x407da202,
+ 0x407e3108,
+ 0x407ea444,
+ 0x407f1e27,
+ 0x407f9ffa,
+ 0x40802174,
+ 0x40809e4f,
+ 0x408122b2,
+ 0x4081a101,
+ 0x40822e7c,
+ 0x40829ba2,
+ 0x4083241f,
+ 0x4083a738,
+ 0x40841e63,
+ 0x4084a47c,
+ 0x408524f1,
+ 0x4085a615,
+ 0x40862571,
+ 0x4086a21c,
+ 0x40872ec2,
+ 0x4087a663,
+ 0x40881be0,
+ 0x4088a84a,
+ 0x40891c2f,
+ 0x40899bbc,
+ 0x408a2b3a,
+ 0x408a99b4,
+ 0x408b31d4,
+ 0x408baf4c,
+ 0x408c2501,
+ 0x408c99ec,
+ 0x408d1f4b,
+ 0x408d9e95,
+ 0x408e207b,
+ 0x408ea36f,
+ 0x408f285e,
+ 0x408fa631,
+ 0x40902813,
+ 0x4090a543,
+ 0x40912b22,
+ 0x40919a12,
+ 0x40921c7c,
+ 0x4092aefd,
+ 0x40932fdd,
+ 0x4093a22d,
+ 0x40941e77,
+ 0x4094ab53,
+ 0x409526bf,
+ 0x4095b165,
+ 0x40962ea9,
+ 0x4096a18d,
+ 0x40972278,
+ 0x4097a0ca,
+ 0x40981cdc,
+ 0x4098a6d3,
+ 0x40992f19,
+ 0x4099a39c,
+ 0x409a2335,
+ 0x409a99d0,
+ 0x409b1ed1,
+ 0x409b9efc,
+ 0x409c30ba,
+ 0x409c9f24,
+ 0x409d2149,
+ 0x409da117,
+ 0x409e1d6d,
+ 0x409ea1c2,
+ 0x409f21aa,
+ 0x409f9ec4,
+ 0x40a021ea,
+ 0x40a0a0e4,
+ 0x40a12132,
+ 0x41f429f4,
+ 0x41f92a86,
+ 0x41fe2979,
+ 0x41feac2f,
+ 0x41ff2d5d,
+ 0x42032a0d,
+ 0x42082a2f,
+ 0x4208aa6b,
+ 0x4209295d,
+ 0x4209aaa5,
+ 0x420a29b4,
+ 0x420aa994,
+ 0x420b29d4,
+ 0x420baa4d,
+ 0x420c2d79,
+ 0x420cab63,
+ 0x420d2c16,
+ 0x420dac4d,
+ 0x42122c80,
+ 0x42172d40,
+ 0x4217acc2,
+ 0x421c2ce4,
+ 0x421f2c9f,
+ 0x42212df1,
+ 0x42262d23,
+ 0x422b2dcf,
+ 0x422babf1,
+ 0x422c2db1,
+ 0x422caba4,
+ 0x422d2b7d,
+ 0x422dad90,
+ 0x422e2bd0,
+ 0x42302cff,
+ 0x4230ac67,
+ 0x44320778,
+ 0x44328787,
+ 0x44330793,
+ 0x443387a1,
+ 0x443407b4,
+ 0x443487c5,
+ 0x443507cc,
+ 0x443587d6,
+ 0x443607e9,
+ 0x443687ff,
+ 0x44370811,
+ 0x4437881e,
+ 0x4438082d,
+ 0x44388835,
+ 0x4439084d,
+ 0x4439885b,
+ 0x443a086e,
+ 0x48321372,
+ 0x48329384,
+ 0x4833139a,
+ 0x483393b3,
+ 0x4c3213f0,
+ 0x4c329400,
+ 0x4c331413,
+ 0x4c339433,
+ 0x4c3400b9,
+ 0x4c3480f7,
+ 0x4c35143f,
+ 0x4c35944d,
+ 0x4c361469,
+ 0x4c36948f,
+ 0x4c37149e,
+ 0x4c3794ac,
+ 0x4c3814c1,
+ 0x4c3894cd,
+ 0x4c3914ed,
+ 0x4c399517,
+ 0x4c3a1530,
+ 0x4c3a9549,
+ 0x4c3b0635,
+ 0x4c3b9562,
+ 0x4c3c1574,
+ 0x4c3c9583,
+ 0x4c3d159c,
+ 0x4c3d8cc6,
+ 0x4c3e1609,
+ 0x4c3e95ab,
+ 0x4c3f162b,
+ 0x4c3f9348,
+ 0x4c4015c1,
+ 0x4c4093dc,
+ 0x4c4115f9,
+ 0x4c41947c,
+ 0x4c4215e5,
+ 0x4c4293c4,
+ 0x503235ba,
+ 0x5032b5c9,
+ 0x503335d4,
+ 0x5033b5e4,
+ 0x503435fd,
+ 0x5034b617,
+ 0x50353625,
+ 0x5035b63b,
+ 0x5036364d,
+ 0x5036b663,
+ 0x5037367c,
+ 0x5037b68f,
+ 0x503836a7,
+ 0x5038b6b8,
+ 0x503936cd,
+ 0x5039b6e1,
+ 0x503a3701,
+ 0x503ab717,
+ 0x503b372f,
+ 0x503bb741,
+ 0x503c375d,
+ 0x503cb774,
+ 0x503d378d,
+ 0x503db7a3,
+ 0x503e37b0,
+ 0x503eb7c6,
+ 0x503f37d8,
+ 0x503f83b3,
+ 0x504037eb,
+ 0x5040b7fb,
+ 0x50413815,
+ 0x5041b824,
+ 0x5042383e,
+ 0x5042b85b,
+ 0x5043386b,
+ 0x5043b87b,
+ 0x50443898,
+ 0x50448469,
+ 0x504538ac,
+ 0x5045b8ca,
+ 0x504638dd,
+ 0x5046b8f3,
+ 0x50473905,
+ 0x5047b91a,
+ 0x50483940,
+ 0x5048b94e,
+ 0x50493961,
+ 0x5049b976,
+ 0x504a398c,
+ 0x504ab99c,
+ 0x504b39bc,
+ 0x504bb9cf,
+ 0x504c39f2,
+ 0x504cba20,
+ 0x504d3a4d,
+ 0x504dba6a,
+ 0x504e3a85,
+ 0x504ebaa1,
+ 0x504f3ab3,
+ 0x504fbaca,
+ 0x50503ad9,
+ 0x50508729,
+ 0x50513aec,
+ 0x5051b88a,
+ 0x50523a32,
+ 0x58320fd1,
+ 0x68320d4e,
+ 0x68328ceb,
+ 0x68330cfe,
+ 0x68338d5c,
+ 0x68340d6c,
+ 0x683480f7,
+ 0x6835099a,
+ 0x6c320d14,
+ 0x6c328cb5,
+ 0x6c330d1f,
+ 0x6c338d38,
+ 0x74320a66,
+ 0x743280b9,
+ 0x74330cc6,
+ 0x783209cb,
+ 0x783289e0,
+ 0x783309ec,
+ 0x78338090,
+ 0x783409fb,
+ 0x78348a10,
+ 0x78350a2f,
+ 0x78358a51,
+ 0x78360a66,
+ 0x78368a7c,
+ 0x78370a8c,
+ 0x78378aad,
+ 0x78380ac0,
+ 0x78388ad2,
+ 0x78390adf,
+ 0x78398afe,
+ 0x783a0b13,
+ 0x783a8b21,
+ 0x783b0b2b,
+ 0x783b8b3f,
+ 0x783c0b56,
+ 0x783c8b6b,
+ 0x783d0b82,
+ 0x783d8b97,
+ 0x783e0aed,
+ 0x783e8a9f,
+ 0x7c321261,
+ 0x8032148f,
+ 0x80328090,
+ 0x803332b6,
+ 0x803380b9,
+ 0x803432c5,
+ 0x8034b22d,
+ 0x8035324b,
+ 0x8035b2d9,
+ 0x8036328d,
+ 0x8036b23c,
+ 0x8037327f,
+ 0x8037b21a,
+ 0x803832a0,
+ 0x8038b25c,
+ 0x80393271,
+};
+
+const size_t kOpenSSLReasonValuesLen = sizeof(kOpenSSLReasonValues) / sizeof(kOpenSSLReasonValues[0]);
+
+const char kOpenSSLReasonStringData[] =
+ "ASN1_LENGTH_MISMATCH\0"
+ "AUX_ERROR\0"
+ "BAD_GET_ASN1_OBJECT_CALL\0"
+ "BAD_OBJECT_HEADER\0"
+ "BAD_TEMPLATE\0"
+ "BMPSTRING_IS_WRONG_LENGTH\0"
+ "BN_LIB\0"
+ "BOOLEAN_IS_WRONG_LENGTH\0"
+ "BUFFER_TOO_SMALL\0"
+ "CONTEXT_NOT_INITIALISED\0"
+ "DECODE_ERROR\0"
+ "DEPTH_EXCEEDED\0"
+ "DIGEST_AND_KEY_TYPE_NOT_SUPPORTED\0"
+ "ENCODE_ERROR\0"
+ "ERROR_GETTING_TIME\0"
+ "EXPECTING_AN_ASN1_SEQUENCE\0"
+ "EXPECTING_AN_INTEGER\0"
+ "EXPECTING_AN_OBJECT\0"
+ "EXPECTING_A_BOOLEAN\0"
+ "EXPECTING_A_TIME\0"
+ "EXPLICIT_LENGTH_MISMATCH\0"
+ "EXPLICIT_TAG_NOT_CONSTRUCTED\0"
+ "FIELD_MISSING\0"
+ "FIRST_NUM_TOO_LARGE\0"
+ "HEADER_TOO_LONG\0"
+ "ILLEGAL_BITSTRING_FORMAT\0"
+ "ILLEGAL_BOOLEAN\0"
+ "ILLEGAL_CHARACTERS\0"
+ "ILLEGAL_FORMAT\0"
+ "ILLEGAL_HEX\0"
+ "ILLEGAL_IMPLICIT_TAG\0"
+ "ILLEGAL_INTEGER\0"
+ "ILLEGAL_NESTED_TAGGING\0"
+ "ILLEGAL_NULL\0"
+ "ILLEGAL_NULL_VALUE\0"
+ "ILLEGAL_OBJECT\0"
+ "ILLEGAL_OPTIONAL_ANY\0"
+ "ILLEGAL_OPTIONS_ON_ITEM_TEMPLATE\0"
+ "ILLEGAL_TAGGED_ANY\0"
+ "ILLEGAL_TIME_VALUE\0"
+ "INTEGER_NOT_ASCII_FORMAT\0"
+ "INTEGER_TOO_LARGE_FOR_LONG\0"
+ "INVALID_BIT_STRING_BITS_LEFT\0"
+ "INVALID_BIT_STRING_PADDING\0"
+ "INVALID_BMPSTRING\0"
+ "INVALID_DIGIT\0"
+ "INVALID_INTEGER\0"
+ "INVALID_MODIFIER\0"
+ "INVALID_NUMBER\0"
+ "INVALID_OBJECT_ENCODING\0"
+ "INVALID_SEPARATOR\0"
+ "INVALID_TIME_FORMAT\0"
+ "INVALID_UNIVERSALSTRING\0"
+ "INVALID_UTF8STRING\0"
+ "LIST_ERROR\0"
+ "MISSING_ASN1_EOS\0"
+ "MISSING_EOC\0"
+ "MISSING_SECOND_NUMBER\0"
+ "MISSING_VALUE\0"
+ "MSTRING_NOT_UNIVERSAL\0"
+ "MSTRING_WRONG_TAG\0"
+ "NESTED_ASN1_ERROR\0"
+ "NESTED_ASN1_STRING\0"
+ "NESTED_TOO_DEEP\0"
+ "NON_HEX_CHARACTERS\0"
+ "NOT_ASCII_FORMAT\0"
+ "NOT_ENOUGH_DATA\0"
+ "NO_MATCHING_CHOICE_TYPE\0"
+ "NULL_IS_WRONG_LENGTH\0"
+ "OBJECT_NOT_ASCII_FORMAT\0"
+ "ODD_NUMBER_OF_CHARS\0"
+ "SECOND_NUMBER_TOO_LARGE\0"
+ "SEQUENCE_LENGTH_MISMATCH\0"
+ "SEQUENCE_NOT_CONSTRUCTED\0"
+ "SEQUENCE_OR_SET_NEEDS_CONFIG\0"
+ "SHORT_LINE\0"
+ "STREAMING_NOT_SUPPORTED\0"
+ "STRING_TOO_LONG\0"
+ "STRING_TOO_SHORT\0"
+ "TAG_VALUE_TOO_HIGH\0"
+ "TIME_NOT_ASCII_FORMAT\0"
+ "TOO_LONG\0"
+ "TYPE_NOT_CONSTRUCTED\0"
+ "TYPE_NOT_PRIMITIVE\0"
+ "UNEXPECTED_EOC\0"
+ "UNIVERSALSTRING_IS_WRONG_LENGTH\0"
+ "UNKNOWN_FORMAT\0"
+ "UNKNOWN_MESSAGE_DIGEST_ALGORITHM\0"
+ "UNKNOWN_SIGNATURE_ALGORITHM\0"
+ "UNKNOWN_TAG\0"
+ "UNSUPPORTED_ANY_DEFINED_BY_TYPE\0"
+ "UNSUPPORTED_PUBLIC_KEY_TYPE\0"
+ "UNSUPPORTED_TYPE\0"
+ "WRONG_INTEGER_TYPE\0"
+ "WRONG_PUBLIC_KEY_TYPE\0"
+ "WRONG_TAG\0"
+ "WRONG_TYPE\0"
+ "BAD_FOPEN_MODE\0"
+ "BROKEN_PIPE\0"
+ "CONNECT_ERROR\0"
+ "ERROR_SETTING_NBIO\0"
+ "INVALID_ARGUMENT\0"
+ "IN_USE\0"
+ "KEEPALIVE\0"
+ "NBIO_CONNECT_ERROR\0"
+ "NO_HOSTNAME_SPECIFIED\0"
+ "NO_PORT_SPECIFIED\0"
+ "NO_SUCH_FILE\0"
+ "NULL_PARAMETER\0"
+ "SYS_LIB\0"
+ "UNABLE_TO_CREATE_SOCKET\0"
+ "UNINITIALIZED\0"
+ "UNSUPPORTED_METHOD\0"
+ "WRITE_TO_READ_ONLY_BIO\0"
+ "ARG2_LT_ARG3\0"
+ "BAD_ENCODING\0"
+ "BAD_RECIPROCAL\0"
+ "BIGNUM_TOO_LONG\0"
+ "BITS_TOO_SMALL\0"
+ "CALLED_WITH_EVEN_MODULUS\0"
+ "DIV_BY_ZERO\0"
+ "EXPAND_ON_STATIC_BIGNUM_DATA\0"
+ "INPUT_NOT_REDUCED\0"
+ "INVALID_INPUT\0"
+ "INVALID_RANGE\0"
+ "NEGATIVE_NUMBER\0"
+ "NOT_A_SQUARE\0"
+ "NOT_INITIALIZED\0"
+ "NO_INVERSE\0"
+ "PRIVATE_KEY_TOO_LARGE\0"
+ "P_IS_NOT_PRIME\0"
+ "TOO_MANY_ITERATIONS\0"
+ "TOO_MANY_TEMPORARY_VARIABLES\0"
+ "AES_KEY_SETUP_FAILED\0"
+ "BAD_DECRYPT\0"
+ "BAD_KEY_LENGTH\0"
+ "CTRL_NOT_IMPLEMENTED\0"
+ "CTRL_OPERATION_NOT_IMPLEMENTED\0"
+ "DATA_NOT_MULTIPLE_OF_BLOCK_LENGTH\0"
+ "INITIALIZATION_ERROR\0"
+ "INPUT_NOT_INITIALIZED\0"
+ "INVALID_AD_SIZE\0"
+ "INVALID_KEY_LENGTH\0"
+ "INVALID_NONCE\0"
+ "INVALID_NONCE_SIZE\0"
+ "INVALID_OPERATION\0"
+ "IV_TOO_LARGE\0"
+ "NO_CIPHER_SET\0"
+ "NO_DIRECTION_SET\0"
+ "OUTPUT_ALIASES_INPUT\0"
+ "TAG_TOO_LARGE\0"
+ "TOO_LARGE\0"
+ "UNSUPPORTED_AD_SIZE\0"
+ "UNSUPPORTED_INPUT_SIZE\0"
+ "UNSUPPORTED_KEY_SIZE\0"
+ "UNSUPPORTED_NONCE_SIZE\0"
+ "UNSUPPORTED_TAG_SIZE\0"
+ "WRONG_FINAL_BLOCK_LENGTH\0"
+ "LIST_CANNOT_BE_NULL\0"
+ "MISSING_CLOSE_SQUARE_BRACKET\0"
+ "MISSING_EQUAL_SIGN\0"
+ "NO_CLOSE_BRACE\0"
+ "UNABLE_TO_CREATE_NEW_SECTION\0"
+ "VARIABLE_EXPANSION_NOT_SUPPORTED\0"
+ "VARIABLE_EXPANSION_TOO_LONG\0"
+ "VARIABLE_HAS_NO_VALUE\0"
+ "BAD_GENERATOR\0"
+ "INVALID_PARAMETERS\0"
+ "INVALID_PUBKEY\0"
+ "MODULUS_TOO_LARGE\0"
+ "NO_PRIVATE_VALUE\0"
+ "UNKNOWN_HASH\0"
+ "BAD_Q_VALUE\0"
+ "BAD_VERSION\0"
+ "MISSING_PARAMETERS\0"
+ "NEED_NEW_SETUP_VALUES\0"
+ "KDF_FAILED\0"
+ "POINT_ARITHMETIC_FAILURE\0"
+ "UNKNOWN_DIGEST_LENGTH\0"
+ "BAD_SIGNATURE\0"
+ "NOT_IMPLEMENTED\0"
+ "RANDOM_NUMBER_GENERATION_FAILED\0"
+ "BIGNUM_OUT_OF_RANGE\0"
+ "COORDINATES_OUT_OF_RANGE\0"
+ "D2I_ECPKPARAMETERS_FAILURE\0"
+ "EC_GROUP_NEW_BY_NAME_FAILURE\0"
+ "GROUP2PKPARAMETERS_FAILURE\0"
+ "GROUP_MISMATCH\0"
+ "I2D_ECPKPARAMETERS_FAILURE\0"
+ "INCOMPATIBLE_OBJECTS\0"
+ "INVALID_COFACTOR\0"
+ "INVALID_COMPRESSED_POINT\0"
+ "INVALID_COMPRESSION_BIT\0"
+ "INVALID_ENCODING\0"
+ "INVALID_FIELD\0"
+ "INVALID_FORM\0"
+ "INVALID_GROUP_ORDER\0"
+ "INVALID_PRIVATE_KEY\0"
+ "INVALID_SCALAR\0"
+ "MISSING_PRIVATE_KEY\0"
+ "NON_NAMED_CURVE\0"
+ "PKPARAMETERS2GROUP_FAILURE\0"
+ "POINT_AT_INFINITY\0"
+ "POINT_IS_NOT_ON_CURVE\0"
+ "PUBLIC_KEY_VALIDATION_FAILED\0"
+ "SLOT_FULL\0"
+ "UNDEFINED_GENERATOR\0"
+ "UNKNOWN_GROUP\0"
+ "UNKNOWN_ORDER\0"
+ "WRONG_CURVE_PARAMETERS\0"
+ "WRONG_ORDER\0"
+ "OPERATION_NOT_SUPPORTED\0"
+ "COMMAND_NOT_SUPPORTED\0"
+ "DIFFERENT_KEY_TYPES\0"
+ "DIFFERENT_PARAMETERS\0"
+ "EMPTY_PSK\0"
+ "EXPECTING_AN_EC_KEY_KEY\0"
+ "EXPECTING_AN_RSA_KEY\0"
+ "EXPECTING_A_DSA_KEY\0"
+ "ILLEGAL_OR_UNSUPPORTED_PADDING_MODE\0"
+ "INVALID_BUFFER_SIZE\0"
+ "INVALID_DIGEST_LENGTH\0"
+ "INVALID_DIGEST_TYPE\0"
+ "INVALID_KEYBITS\0"
+ "INVALID_MGF1_MD\0"
+ "INVALID_PADDING_MODE\0"
+ "INVALID_PEER_KEY\0"
+ "INVALID_PSS_SALTLEN\0"
+ "INVALID_SIGNATURE\0"
+ "KEYS_NOT_SET\0"
+ "MEMORY_LIMIT_EXCEEDED\0"
+ "NOT_A_PRIVATE_KEY\0"
+ "NOT_XOF_OR_INVALID_LENGTH\0"
+ "NO_DEFAULT_DIGEST\0"
+ "NO_KEY_SET\0"
+ "NO_MDC2_SUPPORT\0"
+ "NO_NID_FOR_CURVE\0"
+ "NO_OPERATION_SET\0"
+ "NO_PARAMETERS_SET\0"
+ "OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE\0"
+ "OPERATON_NOT_INITIALIZED\0"
+ "UNKNOWN_PUBLIC_KEY_TYPE\0"
+ "UNSUPPORTED_ALGORITHM\0"
+ "OUTPUT_TOO_LARGE\0"
+ "INVALID_OID_STRING\0"
+ "UNKNOWN_NID\0"
+ "BAD_BASE64_DECODE\0"
+ "BAD_END_LINE\0"
+ "BAD_IV_CHARS\0"
+ "BAD_PASSWORD_READ\0"
+ "CIPHER_IS_NULL\0"
+ "ERROR_CONVERTING_PRIVATE_KEY\0"
+ "NOT_DEK_INFO\0"
+ "NOT_ENCRYPTED\0"
+ "NOT_PROC_TYPE\0"
+ "NO_START_LINE\0"
+ "READ_KEY\0"
+ "SHORT_HEADER\0"
+ "UNSUPPORTED_CIPHER\0"
+ "UNSUPPORTED_ENCRYPTION\0"
+ "BAD_PKCS7_VERSION\0"
+ "NOT_PKCS7_SIGNED_DATA\0"
+ "NO_CERTIFICATES_INCLUDED\0"
+ "NO_CRLS_INCLUDED\0"
+ "AMBIGUOUS_FRIENDLY_NAME\0"
+ "BAD_ITERATION_COUNT\0"
+ "BAD_PKCS12_DATA\0"
+ "BAD_PKCS12_VERSION\0"
+ "CIPHER_HAS_NO_OBJECT_IDENTIFIER\0"
+ "CRYPT_ERROR\0"
+ "ENCRYPT_ERROR\0"
+ "ERROR_SETTING_CIPHER_PARAMS\0"
+ "INCORRECT_PASSWORD\0"
+ "INVALID_CHARACTERS\0"
+ "KEYGEN_FAILURE\0"
+ "KEY_GEN_ERROR\0"
+ "METHOD_NOT_SUPPORTED\0"
+ "MISSING_MAC\0"
+ "MULTIPLE_PRIVATE_KEYS_IN_PKCS12\0"
+ "PKCS12_PUBLIC_KEY_INTEGRITY_NOT_SUPPORTED\0"
+ "PKCS12_TOO_DEEPLY_NESTED\0"
+ "PRIVATE_KEY_DECODE_ERROR\0"
+ "PRIVATE_KEY_ENCODE_ERROR\0"
+ "UNKNOWN_ALGORITHM\0"
+ "UNKNOWN_CIPHER\0"
+ "UNKNOWN_CIPHER_ALGORITHM\0"
+ "UNKNOWN_DIGEST\0"
+ "UNSUPPORTED_KEYLENGTH\0"
+ "UNSUPPORTED_KEY_DERIVATION_FUNCTION\0"
+ "UNSUPPORTED_OPTIONS\0"
+ "UNSUPPORTED_PRF\0"
+ "UNSUPPORTED_PRIVATE_KEY_ALGORITHM\0"
+ "UNSUPPORTED_SALT_TYPE\0"
+ "BAD_E_VALUE\0"
+ "BAD_FIXED_HEADER_DECRYPT\0"
+ "BAD_PAD_BYTE_COUNT\0"
+ "BAD_RSA_PARAMETERS\0"
+ "BLOCK_TYPE_IS_NOT_01\0"
+ "BLOCK_TYPE_IS_NOT_02\0"
+ "BN_NOT_INITIALIZED\0"
+ "CANNOT_RECOVER_MULTI_PRIME_KEY\0"
+ "CRT_PARAMS_ALREADY_GIVEN\0"
+ "CRT_VALUES_INCORRECT\0"
+ "DATA_LEN_NOT_EQUAL_TO_MOD_LEN\0"
+ "DATA_TOO_LARGE\0"
+ "DATA_TOO_LARGE_FOR_KEY_SIZE\0"
+ "DATA_TOO_LARGE_FOR_MODULUS\0"
+ "DATA_TOO_SMALL\0"
+ "DATA_TOO_SMALL_FOR_KEY_SIZE\0"
+ "DIGEST_TOO_BIG_FOR_RSA_KEY\0"
+ "D_E_NOT_CONGRUENT_TO_1\0"
+ "D_OUT_OF_RANGE\0"
+ "EMPTY_PUBLIC_KEY\0"
+ "FIRST_OCTET_INVALID\0"
+ "INCONSISTENT_SET_OF_CRT_VALUES\0"
+ "INTERNAL_ERROR\0"
+ "INVALID_MESSAGE_LENGTH\0"
+ "KEY_SIZE_TOO_SMALL\0"
+ "LAST_OCTET_INVALID\0"
+ "MUST_HAVE_AT_LEAST_TWO_PRIMES\0"
+ "NO_PUBLIC_EXPONENT\0"
+ "NULL_BEFORE_BLOCK_MISSING\0"
+ "N_NOT_EQUAL_P_Q\0"
+ "OAEP_DECODING_ERROR\0"
+ "ONLY_ONE_OF_P_Q_GIVEN\0"
+ "OUTPUT_BUFFER_TOO_SMALL\0"
+ "PADDING_CHECK_FAILED\0"
+ "PKCS_DECODING_ERROR\0"
+ "SLEN_CHECK_FAILED\0"
+ "SLEN_RECOVERY_FAILED\0"
+ "UNKNOWN_ALGORITHM_TYPE\0"
+ "UNKNOWN_PADDING_TYPE\0"
+ "VALUE_MISSING\0"
+ "WRONG_SIGNATURE_LENGTH\0"
+ "ALPN_MISMATCH_ON_EARLY_DATA\0"
+ "ALPS_MISMATCH_ON_EARLY_DATA\0"
+ "APPLICATION_DATA_INSTEAD_OF_HANDSHAKE\0"
+ "APPLICATION_DATA_ON_SHUTDOWN\0"
+ "APP_DATA_IN_HANDSHAKE\0"
+ "ATTEMPT_TO_REUSE_SESSION_IN_DIFFERENT_CONTEXT\0"
+ "BAD_ALERT\0"
+ "BAD_CHANGE_CIPHER_SPEC\0"
+ "BAD_DATA_RETURNED_BY_CALLBACK\0"
+ "BAD_DH_P_LENGTH\0"
+ "BAD_DIGEST_LENGTH\0"
+ "BAD_ECC_CERT\0"
+ "BAD_ECPOINT\0"
+ "BAD_HANDSHAKE_RECORD\0"
+ "BAD_HELLO_REQUEST\0"
+ "BAD_LENGTH\0"
+ "BAD_PACKET_LENGTH\0"
+ "BAD_RSA_ENCRYPT\0"
+ "BAD_SRTP_MKI_VALUE\0"
+ "BAD_SRTP_PROTECTION_PROFILE_LIST\0"
+ "BAD_SSL_FILETYPE\0"
+ "BAD_WRITE_RETRY\0"
+ "BIO_NOT_SET\0"
+ "BLOCK_CIPHER_PAD_IS_WRONG\0"
+ "CANNOT_HAVE_BOTH_PRIVKEY_AND_METHOD\0"
+ "CANNOT_PARSE_LEAF_CERT\0"
+ "CA_DN_LENGTH_MISMATCH\0"
+ "CA_DN_TOO_LONG\0"
+ "CCS_RECEIVED_EARLY\0"
+ "CERTIFICATE_AND_PRIVATE_KEY_MISMATCH\0"
+ "CERTIFICATE_VERIFY_FAILED\0"
+ "CERT_CB_ERROR\0"
+ "CERT_DECOMPRESSION_FAILED\0"
+ "CERT_LENGTH_MISMATCH\0"
+ "CHANNEL_ID_NOT_P256\0"
+ "CHANNEL_ID_SIGNATURE_INVALID\0"
+ "CIPHER_MISMATCH_ON_EARLY_DATA\0"
+ "CIPHER_OR_HASH_UNAVAILABLE\0"
+ "CLIENTHELLO_PARSE_FAILED\0"
+ "CLIENTHELLO_TLSEXT\0"
+ "CONNECTION_REJECTED\0"
+ "CONNECTION_TYPE_NOT_SET\0"
+ "COULD_NOT_PARSE_HINTS\0"
+ "CUSTOM_EXTENSION_ERROR\0"
+ "DATA_LENGTH_TOO_LONG\0"
+ "DECRYPTION_FAILED\0"
+ "DECRYPTION_FAILED_OR_BAD_RECORD_MAC\0"
+ "DH_PUBLIC_VALUE_LENGTH_IS_WRONG\0"
+ "DH_P_TOO_LONG\0"
+ "DIGEST_CHECK_FAILED\0"
+ "DOWNGRADE_DETECTED\0"
+ "DTLS_MESSAGE_TOO_BIG\0"
+ "DUPLICATE_EXTENSION\0"
+ "DUPLICATE_KEY_SHARE\0"
+ "DUPLICATE_SIGNATURE_ALGORITHM\0"
+ "EARLY_DATA_NOT_IN_USE\0"
+ "ECC_CERT_NOT_FOR_SIGNING\0"
+ "ECH_REJECTED\0"
+ "ECH_SERVER_CONFIG_AND_PRIVATE_KEY_MISMATCH\0"
+ "ECH_SERVER_CONFIG_UNSUPPORTED_EXTENSION\0"
+ "ECH_SERVER_WOULD_HAVE_NO_RETRY_CONFIGS\0"
+ "EMPTY_HELLO_RETRY_REQUEST\0"
+ "EMS_STATE_INCONSISTENT\0"
+ "ENCRYPTED_LENGTH_TOO_LONG\0"
+ "ERROR_ADDING_EXTENSION\0"
+ "ERROR_IN_RECEIVED_CIPHER_LIST\0"
+ "ERROR_PARSING_EXTENSION\0"
+ "EXCESSIVE_MESSAGE_SIZE\0"
+ "EXCESS_HANDSHAKE_DATA\0"
+ "EXTRA_DATA_IN_MESSAGE\0"
+ "FRAGMENT_MISMATCH\0"
+ "GOT_NEXT_PROTO_WITHOUT_EXTENSION\0"
+ "HANDSHAKE_FAILURE_ON_CLIENT_HELLO\0"
+ "HANDSHAKE_NOT_COMPLETE\0"
+ "HTTPS_PROXY_REQUEST\0"
+ "HTTP_REQUEST\0"
+ "INAPPROPRIATE_FALLBACK\0"
+ "INCONSISTENT_CLIENT_HELLO\0"
+ "INCONSISTENT_ECH_NEGOTIATION\0"
+ "INVALID_ALPN_PROTOCOL\0"
+ "INVALID_ALPN_PROTOCOL_LIST\0"
+ "INVALID_ALPS_CODEPOINT\0"
+ "INVALID_CLIENT_HELLO_INNER\0"
+ "INVALID_COMMAND\0"
+ "INVALID_COMPRESSION_LIST\0"
+ "INVALID_DELEGATED_CREDENTIAL\0"
+ "INVALID_ECH_CONFIG_LIST\0"
+ "INVALID_ECH_PUBLIC_NAME\0"
+ "INVALID_MESSAGE\0"
+ "INVALID_OUTER_EXTENSION\0"
+ "INVALID_OUTER_RECORD_TYPE\0"
+ "INVALID_SCT_LIST\0"
+ "INVALID_SIGNATURE_ALGORITHM\0"
+ "INVALID_SSL_SESSION\0"
+ "INVALID_TICKET_KEYS_LENGTH\0"
+ "KEY_USAGE_BIT_INCORRECT\0"
+ "LENGTH_MISMATCH\0"
+ "MISSING_EXTENSION\0"
+ "MISSING_KEY_SHARE\0"
+ "MISSING_RSA_CERTIFICATE\0"
+ "MISSING_TMP_DH_KEY\0"
+ "MISSING_TMP_ECDH_KEY\0"
+ "MIXED_SPECIAL_OPERATOR_WITH_GROUPS\0"
+ "MTU_TOO_SMALL\0"
+ "NEGOTIATED_ALPS_WITHOUT_ALPN\0"
+ "NEGOTIATED_BOTH_NPN_AND_ALPN\0"
+ "NEGOTIATED_TB_WITHOUT_EMS_OR_RI\0"
+ "NESTED_GROUP\0"
+ "NO_APPLICATION_PROTOCOL\0"
+ "NO_CERTIFICATES_RETURNED\0"
+ "NO_CERTIFICATE_ASSIGNED\0"
+ "NO_CERTIFICATE_SET\0"
+ "NO_CIPHERS_AVAILABLE\0"
+ "NO_CIPHERS_PASSED\0"
+ "NO_CIPHERS_SPECIFIED\0"
+ "NO_CIPHER_MATCH\0"
+ "NO_COMMON_SIGNATURE_ALGORITHMS\0"
+ "NO_COMPRESSION_SPECIFIED\0"
+ "NO_GROUPS_SPECIFIED\0"
+ "NO_METHOD_SPECIFIED\0"
+ "NO_PRIVATE_KEY_ASSIGNED\0"
+ "NO_RENEGOTIATION\0"
+ "NO_REQUIRED_DIGEST\0"
+ "NO_SHARED_CIPHER\0"
+ "NO_SHARED_GROUP\0"
+ "NO_SUPPORTED_VERSIONS_ENABLED\0"
+ "NULL_SSL_CTX\0"
+ "NULL_SSL_METHOD_PASSED\0"
+ "OCSP_CB_ERROR\0"
+ "OLD_SESSION_CIPHER_NOT_RETURNED\0"
+ "OLD_SESSION_PRF_HASH_MISMATCH\0"
+ "OLD_SESSION_VERSION_NOT_RETURNED\0"
+ "PARSE_TLSEXT\0"
+ "PATH_TOO_LONG\0"
+ "PEER_DID_NOT_RETURN_A_CERTIFICATE\0"
+ "PEER_ERROR_UNSUPPORTED_CERTIFICATE_TYPE\0"
+ "PRE_SHARED_KEY_MUST_BE_LAST\0"
+ "PRIVATE_KEY_OPERATION_FAILED\0"
+ "PROTOCOL_IS_SHUTDOWN\0"
+ "PSK_IDENTITY_BINDER_COUNT_MISMATCH\0"
+ "PSK_IDENTITY_NOT_FOUND\0"
+ "PSK_NO_CLIENT_CB\0"
+ "PSK_NO_SERVER_CB\0"
+ "QUIC_INTERNAL_ERROR\0"
+ "QUIC_TRANSPORT_PARAMETERS_MISCONFIGURED\0"
+ "READ_TIMEOUT_EXPIRED\0"
+ "RECORD_LENGTH_MISMATCH\0"
+ "RECORD_TOO_LARGE\0"
+ "RENEGOTIATION_EMS_MISMATCH\0"
+ "RENEGOTIATION_ENCODING_ERR\0"
+ "RENEGOTIATION_MISMATCH\0"
+ "REQUIRED_CIPHER_MISSING\0"
+ "RESUMED_EMS_SESSION_WITHOUT_EMS_EXTENSION\0"
+ "RESUMED_NON_EMS_SESSION_WITH_EMS_EXTENSION\0"
+ "SCSV_RECEIVED_WHEN_RENEGOTIATING\0"
+ "SECOND_SERVERHELLO_VERSION_MISMATCH\0"
+ "SERVERHELLO_TLSEXT\0"
+ "SERVER_CERT_CHANGED\0"
+ "SERVER_ECHOED_INVALID_SESSION_ID\0"
+ "SESSION_ID_CONTEXT_UNINITIALIZED\0"
+ "SESSION_MAY_NOT_BE_CREATED\0"
+ "SHUTDOWN_WHILE_IN_INIT\0"
+ "SIGNATURE_ALGORITHMS_EXTENSION_SENT_BY_SERVER\0"
+ "SRTP_COULD_NOT_ALLOCATE_PROFILES\0"
+ "SRTP_UNKNOWN_PROTECTION_PROFILE\0"
+ "SSL3_EXT_INVALID_SERVERNAME\0"
+ "SSLV3_ALERT_BAD_CERTIFICATE\0"
+ "SSLV3_ALERT_BAD_RECORD_MAC\0"
+ "SSLV3_ALERT_CERTIFICATE_EXPIRED\0"
+ "SSLV3_ALERT_CERTIFICATE_REVOKED\0"
+ "SSLV3_ALERT_CERTIFICATE_UNKNOWN\0"
+ "SSLV3_ALERT_CLOSE_NOTIFY\0"
+ "SSLV3_ALERT_DECOMPRESSION_FAILURE\0"
+ "SSLV3_ALERT_HANDSHAKE_FAILURE\0"
+ "SSLV3_ALERT_ILLEGAL_PARAMETER\0"
+ "SSLV3_ALERT_NO_CERTIFICATE\0"
+ "SSLV3_ALERT_UNEXPECTED_MESSAGE\0"
+ "SSLV3_ALERT_UNSUPPORTED_CERTIFICATE\0"
+ "SSL_CTX_HAS_NO_DEFAULT_SSL_VERSION\0"
+ "SSL_HANDSHAKE_FAILURE\0"
+ "SSL_SESSION_ID_CONTEXT_TOO_LONG\0"
+ "SSL_SESSION_ID_TOO_LONG\0"
+ "TICKET_ENCRYPTION_FAILED\0"
+ "TLS13_DOWNGRADE\0"
+ "TLSV1_ALERT_ACCESS_DENIED\0"
+ "TLSV1_ALERT_BAD_CERTIFICATE_HASH_VALUE\0"
+ "TLSV1_ALERT_BAD_CERTIFICATE_STATUS_RESPONSE\0"
+ "TLSV1_ALERT_CERTIFICATE_REQUIRED\0"
+ "TLSV1_ALERT_CERTIFICATE_UNOBTAINABLE\0"
+ "TLSV1_ALERT_DECODE_ERROR\0"
+ "TLSV1_ALERT_DECRYPTION_FAILED\0"
+ "TLSV1_ALERT_DECRYPT_ERROR\0"
+ "TLSV1_ALERT_ECH_REQUIRED\0"
+ "TLSV1_ALERT_EXPORT_RESTRICTION\0"
+ "TLSV1_ALERT_INAPPROPRIATE_FALLBACK\0"
+ "TLSV1_ALERT_INSUFFICIENT_SECURITY\0"
+ "TLSV1_ALERT_INTERNAL_ERROR\0"
+ "TLSV1_ALERT_NO_APPLICATION_PROTOCOL\0"
+ "TLSV1_ALERT_NO_RENEGOTIATION\0"
+ "TLSV1_ALERT_PROTOCOL_VERSION\0"
+ "TLSV1_ALERT_RECORD_OVERFLOW\0"
+ "TLSV1_ALERT_UNKNOWN_CA\0"
+ "TLSV1_ALERT_UNKNOWN_PSK_IDENTITY\0"
+ "TLSV1_ALERT_UNRECOGNIZED_NAME\0"
+ "TLSV1_ALERT_UNSUPPORTED_EXTENSION\0"
+ "TLSV1_ALERT_USER_CANCELLED\0"
+ "TLS_PEER_DID_NOT_RESPOND_WITH_CERTIFICATE_LIST\0"
+ "TLS_RSA_ENCRYPTED_VALUE_LENGTH_IS_WRONG\0"
+ "TOO_MANY_EMPTY_FRAGMENTS\0"
+ "TOO_MANY_KEY_UPDATES\0"
+ "TOO_MANY_WARNING_ALERTS\0"
+ "TOO_MUCH_READ_EARLY_DATA\0"
+ "TOO_MUCH_SKIPPED_EARLY_DATA\0"
+ "UNABLE_TO_FIND_ECDH_PARAMETERS\0"
+ "UNCOMPRESSED_CERT_TOO_LARGE\0"
+ "UNEXPECTED_COMPATIBILITY_MODE\0"
+ "UNEXPECTED_EXTENSION\0"
+ "UNEXPECTED_EXTENSION_ON_EARLY_DATA\0"
+ "UNEXPECTED_MESSAGE\0"
+ "UNEXPECTED_OPERATOR_IN_GROUP\0"
+ "UNEXPECTED_RECORD\0"
+ "UNKNOWN_ALERT_TYPE\0"
+ "UNKNOWN_CERTIFICATE_TYPE\0"
+ "UNKNOWN_CERT_COMPRESSION_ALG\0"
+ "UNKNOWN_CIPHER_RETURNED\0"
+ "UNKNOWN_CIPHER_TYPE\0"
+ "UNKNOWN_KEY_EXCHANGE_TYPE\0"
+ "UNKNOWN_PROTOCOL\0"
+ "UNKNOWN_SSL_VERSION\0"
+ "UNKNOWN_STATE\0"
+ "UNSAFE_LEGACY_RENEGOTIATION_DISABLED\0"
+ "UNSUPPORTED_COMPRESSION_ALGORITHM\0"
+ "UNSUPPORTED_ECH_SERVER_CONFIG\0"
+ "UNSUPPORTED_ELLIPTIC_CURVE\0"
+ "UNSUPPORTED_PROTOCOL\0"
+ "UNSUPPORTED_PROTOCOL_FOR_CUSTOM_KEY\0"
+ "WRONG_CERTIFICATE_TYPE\0"
+ "WRONG_CIPHER_RETURNED\0"
+ "WRONG_CURVE\0"
+ "WRONG_ENCRYPTION_LEVEL_RECEIVED\0"
+ "WRONG_MESSAGE_TYPE\0"
+ "WRONG_SIGNATURE_TYPE\0"
+ "WRONG_SSL_VERSION\0"
+ "WRONG_VERSION_NUMBER\0"
+ "WRONG_VERSION_ON_EARLY_DATA\0"
+ "X509_LIB\0"
+ "X509_VERIFICATION_SETUP_PROBLEMS\0"
+ "BAD_VALIDITY_CHECK\0"
+ "DECODE_FAILURE\0"
+ "INVALID_KEY_ID\0"
+ "INVALID_METADATA\0"
+ "INVALID_METADATA_KEY\0"
+ "INVALID_PROOF\0"
+ "INVALID_TOKEN\0"
+ "NO_KEYS_CONFIGURED\0"
+ "NO_SRR_KEY_CONFIGURED\0"
+ "OVER_BATCHSIZE\0"
+ "SRR_SIGNATURE_ERROR\0"
+ "TOO_MANY_KEYS\0"
+ "AKID_MISMATCH\0"
+ "BAD_X509_FILETYPE\0"
+ "BASE64_DECODE_ERROR\0"
+ "CANT_CHECK_DH_KEY\0"
+ "CERT_ALREADY_IN_HASH_TABLE\0"
+ "CRL_ALREADY_DELTA\0"
+ "CRL_VERIFY_FAILURE\0"
+ "DELTA_CRL_WITHOUT_CRL_NUMBER\0"
+ "IDP_MISMATCH\0"
+ "INVALID_DIRECTORY\0"
+ "INVALID_FIELD_FOR_VERSION\0"
+ "INVALID_FIELD_NAME\0"
+ "INVALID_PARAMETER\0"
+ "INVALID_POLICY_EXTENSION\0"
+ "INVALID_PSS_PARAMETERS\0"
+ "INVALID_TRUST\0"
+ "INVALID_VERSION\0"
+ "ISSUER_MISMATCH\0"
+ "KEY_TYPE_MISMATCH\0"
+ "KEY_VALUES_MISMATCH\0"
+ "LOADING_CERT_DIR\0"
+ "LOADING_DEFAULTS\0"
+ "NAME_TOO_LONG\0"
+ "NEWER_CRL_NOT_NEWER\0"
+ "NO_CERTIFICATE_FOUND\0"
+ "NO_CERTIFICATE_OR_CRL_FOUND\0"
+ "NO_CERT_SET_FOR_US_TO_VERIFY\0"
+ "NO_CRL_FOUND\0"
+ "NO_CRL_NUMBER\0"
+ "PUBLIC_KEY_DECODE_ERROR\0"
+ "PUBLIC_KEY_ENCODE_ERROR\0"
+ "SHOULD_RETRY\0"
+ "SIGNATURE_ALGORITHM_MISMATCH\0"
+ "UNKNOWN_KEY_TYPE\0"
+ "UNKNOWN_PURPOSE_ID\0"
+ "UNKNOWN_TRUST_ID\0"
+ "WRONG_LOOKUP_TYPE\0"
+ "BAD_IP_ADDRESS\0"
+ "BAD_OBJECT\0"
+ "BN_DEC2BN_ERROR\0"
+ "BN_TO_ASN1_INTEGER_ERROR\0"
+ "CANNOT_FIND_FREE_FUNCTION\0"
+ "DIRNAME_ERROR\0"
+ "DISTPOINT_ALREADY_SET\0"
+ "DUPLICATE_ZONE_ID\0"
+ "ERROR_CONVERTING_ZONE\0"
+ "ERROR_CREATING_EXTENSION\0"
+ "ERROR_IN_EXTENSION\0"
+ "EXPECTED_A_SECTION_NAME\0"
+ "EXTENSION_EXISTS\0"
+ "EXTENSION_NAME_ERROR\0"
+ "EXTENSION_NOT_FOUND\0"
+ "EXTENSION_SETTING_NOT_SUPPORTED\0"
+ "EXTENSION_VALUE_ERROR\0"
+ "ILLEGAL_EMPTY_EXTENSION\0"
+ "ILLEGAL_HEX_DIGIT\0"
+ "INCORRECT_POLICY_SYNTAX_TAG\0"
+ "INVALID_BOOLEAN_STRING\0"
+ "INVALID_EXTENSION_STRING\0"
+ "INVALID_MULTIPLE_RDNS\0"
+ "INVALID_NAME\0"
+ "INVALID_NULL_ARGUMENT\0"
+ "INVALID_NULL_NAME\0"
+ "INVALID_NULL_VALUE\0"
+ "INVALID_NUMBERS\0"
+ "INVALID_OBJECT_IDENTIFIER\0"
+ "INVALID_OPTION\0"
+ "INVALID_POLICY_IDENTIFIER\0"
+ "INVALID_PROXY_POLICY_SETTING\0"
+ "INVALID_PURPOSE\0"
+ "INVALID_SECTION\0"
+ "INVALID_SYNTAX\0"
+ "INVALID_VALUE\0"
+ "ISSUER_DECODE_ERROR\0"
+ "NEED_ORGANIZATION_AND_NUMBERS\0"
+ "NO_CONFIG_DATABASE\0"
+ "NO_ISSUER_CERTIFICATE\0"
+ "NO_ISSUER_DETAILS\0"
+ "NO_POLICY_IDENTIFIER\0"
+ "NO_PROXY_CERT_POLICY_LANGUAGE_DEFINED\0"
+ "NO_PUBLIC_KEY\0"
+ "NO_SUBJECT_DETAILS\0"
+ "ODD_NUMBER_OF_DIGITS\0"
+ "OPERATION_NOT_DEFINED\0"
+ "OTHERNAME_ERROR\0"
+ "POLICY_LANGUAGE_ALREADY_DEFINED\0"
+ "POLICY_PATH_LENGTH\0"
+ "POLICY_PATH_LENGTH_ALREADY_DEFINED\0"
+ "POLICY_WHEN_PROXY_LANGUAGE_REQUIRES_NO_POLICY\0"
+ "SECTION_NOT_FOUND\0"
+ "TRAILING_DATA_IN_EXTENSION\0"
+ "UNABLE_TO_GET_ISSUER_DETAILS\0"
+ "UNABLE_TO_GET_ISSUER_KEYID\0"
+ "UNKNOWN_BIT_STRING_ARGUMENT\0"
+ "UNKNOWN_EXTENSION\0"
+ "UNKNOWN_EXTENSION_NAME\0"
+ "UNKNOWN_OPTION\0"
+ "UNSUPPORTED_OPTION\0"
+ "USER_TOO_LONG\0"
+ "";
+
diff --git a/gen/sources.cmake b/gen/sources.cmake
new file mode 100644
index 0000000..03c5d3a
--- /dev/null
+++ b/gen/sources.cmake
@@ -0,0 +1,210 @@
+# Copyright (c) 2024, Google Inc.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+#
+# Generated by go run ./util/pregenerate. Do not edit manually.
+
+set(
+ BCM_SOURCES_ASM
+
+ gen/bcm/aesni-gcm-x86_64-apple.S
+ gen/bcm/aesni-gcm-x86_64-linux.S
+ gen/bcm/aesni-x86-apple.S
+ gen/bcm/aesni-x86-linux.S
+ gen/bcm/aesni-x86_64-apple.S
+ gen/bcm/aesni-x86_64-linux.S
+ gen/bcm/aesv8-armv7-linux.S
+ gen/bcm/aesv8-armv8-apple.S
+ gen/bcm/aesv8-armv8-linux.S
+ gen/bcm/aesv8-armv8-win.S
+ gen/bcm/aesv8-gcm-armv8-apple.S
+ gen/bcm/aesv8-gcm-armv8-linux.S
+ gen/bcm/aesv8-gcm-armv8-win.S
+ gen/bcm/armv4-mont-linux.S
+ gen/bcm/armv8-mont-apple.S
+ gen/bcm/armv8-mont-linux.S
+ gen/bcm/armv8-mont-win.S
+ gen/bcm/bn-586-apple.S
+ gen/bcm/bn-586-linux.S
+ gen/bcm/bn-armv8-apple.S
+ gen/bcm/bn-armv8-linux.S
+ gen/bcm/bn-armv8-win.S
+ gen/bcm/bsaes-armv7-linux.S
+ gen/bcm/co-586-apple.S
+ gen/bcm/co-586-linux.S
+ gen/bcm/ghash-armv4-linux.S
+ gen/bcm/ghash-neon-armv8-apple.S
+ gen/bcm/ghash-neon-armv8-linux.S
+ gen/bcm/ghash-neon-armv8-win.S
+ gen/bcm/ghash-ssse3-x86-apple.S
+ gen/bcm/ghash-ssse3-x86-linux.S
+ gen/bcm/ghash-ssse3-x86_64-apple.S
+ gen/bcm/ghash-ssse3-x86_64-linux.S
+ gen/bcm/ghash-x86-apple.S
+ gen/bcm/ghash-x86-linux.S
+ gen/bcm/ghash-x86_64-apple.S
+ gen/bcm/ghash-x86_64-linux.S
+ gen/bcm/ghashv8-armv7-linux.S
+ gen/bcm/ghashv8-armv8-apple.S
+ gen/bcm/ghashv8-armv8-linux.S
+ gen/bcm/ghashv8-armv8-win.S
+ gen/bcm/md5-586-apple.S
+ gen/bcm/md5-586-linux.S
+ gen/bcm/md5-x86_64-apple.S
+ gen/bcm/md5-x86_64-linux.S
+ gen/bcm/p256-armv8-asm-apple.S
+ gen/bcm/p256-armv8-asm-linux.S
+ gen/bcm/p256-armv8-asm-win.S
+ gen/bcm/p256-x86_64-asm-apple.S
+ gen/bcm/p256-x86_64-asm-linux.S
+ gen/bcm/p256_beeu-armv8-asm-apple.S
+ gen/bcm/p256_beeu-armv8-asm-linux.S
+ gen/bcm/p256_beeu-armv8-asm-win.S
+ gen/bcm/p256_beeu-x86_64-asm-apple.S
+ gen/bcm/p256_beeu-x86_64-asm-linux.S
+ gen/bcm/rdrand-x86_64-apple.S
+ gen/bcm/rdrand-x86_64-linux.S
+ gen/bcm/rsaz-avx2-apple.S
+ gen/bcm/rsaz-avx2-linux.S
+ gen/bcm/sha1-586-apple.S
+ gen/bcm/sha1-586-linux.S
+ gen/bcm/sha1-armv4-large-linux.S
+ gen/bcm/sha1-armv8-apple.S
+ gen/bcm/sha1-armv8-linux.S
+ gen/bcm/sha1-armv8-win.S
+ gen/bcm/sha1-x86_64-apple.S
+ gen/bcm/sha1-x86_64-linux.S
+ gen/bcm/sha256-586-apple.S
+ gen/bcm/sha256-586-linux.S
+ gen/bcm/sha256-armv4-linux.S
+ gen/bcm/sha256-armv8-apple.S
+ gen/bcm/sha256-armv8-linux.S
+ gen/bcm/sha256-armv8-win.S
+ gen/bcm/sha256-x86_64-apple.S
+ gen/bcm/sha256-x86_64-linux.S
+ gen/bcm/sha512-586-apple.S
+ gen/bcm/sha512-586-linux.S
+ gen/bcm/sha512-armv4-linux.S
+ gen/bcm/sha512-armv8-apple.S
+ gen/bcm/sha512-armv8-linux.S
+ gen/bcm/sha512-armv8-win.S
+ gen/bcm/sha512-x86_64-apple.S
+ gen/bcm/sha512-x86_64-linux.S
+ gen/bcm/vpaes-armv7-linux.S
+ gen/bcm/vpaes-armv8-apple.S
+ gen/bcm/vpaes-armv8-linux.S
+ gen/bcm/vpaes-armv8-win.S
+ gen/bcm/vpaes-x86-apple.S
+ gen/bcm/vpaes-x86-linux.S
+ gen/bcm/vpaes-x86_64-apple.S
+ gen/bcm/vpaes-x86_64-linux.S
+ gen/bcm/x86-mont-apple.S
+ gen/bcm/x86-mont-linux.S
+ gen/bcm/x86_64-mont-apple.S
+ gen/bcm/x86_64-mont-linux.S
+ gen/bcm/x86_64-mont5-apple.S
+ gen/bcm/x86_64-mont5-linux.S
+)
+
+set(
+ BCM_SOURCES_NASM
+
+ gen/bcm/aesni-gcm-x86_64-win.asm
+ gen/bcm/aesni-x86-win.asm
+ gen/bcm/aesni-x86_64-win.asm
+ gen/bcm/bn-586-win.asm
+ gen/bcm/co-586-win.asm
+ gen/bcm/ghash-ssse3-x86-win.asm
+ gen/bcm/ghash-ssse3-x86_64-win.asm
+ gen/bcm/ghash-x86-win.asm
+ gen/bcm/ghash-x86_64-win.asm
+ gen/bcm/md5-586-win.asm
+ gen/bcm/md5-x86_64-win.asm
+ gen/bcm/p256-x86_64-asm-win.asm
+ gen/bcm/p256_beeu-x86_64-asm-win.asm
+ gen/bcm/rdrand-x86_64-win.asm
+ gen/bcm/rsaz-avx2-win.asm
+ gen/bcm/sha1-586-win.asm
+ gen/bcm/sha1-x86_64-win.asm
+ gen/bcm/sha256-586-win.asm
+ gen/bcm/sha256-x86_64-win.asm
+ gen/bcm/sha512-586-win.asm
+ gen/bcm/sha512-x86_64-win.asm
+ gen/bcm/vpaes-x86-win.asm
+ gen/bcm/vpaes-x86_64-win.asm
+ gen/bcm/x86-mont-win.asm
+ gen/bcm/x86_64-mont-win.asm
+ gen/bcm/x86_64-mont5-win.asm
+)
+
+set(
+ CRYPTO_SOURCES
+
+ gen/crypto/err_data.c
+)
+
+set(
+ CRYPTO_SOURCES_ASM
+
+ crypto/curve25519/asm/x25519-asm-arm.S
+ crypto/hrss/asm/poly_rq_mul.S
+ crypto/poly1305/poly1305_arm_asm.S
+ gen/crypto/aes128gcmsiv-x86_64-apple.S
+ gen/crypto/aes128gcmsiv-x86_64-linux.S
+ gen/crypto/chacha-armv4-linux.S
+ gen/crypto/chacha-armv8-apple.S
+ gen/crypto/chacha-armv8-linux.S
+ gen/crypto/chacha-armv8-win.S
+ gen/crypto/chacha-x86-apple.S
+ gen/crypto/chacha-x86-linux.S
+ gen/crypto/chacha-x86_64-apple.S
+ gen/crypto/chacha-x86_64-linux.S
+ gen/crypto/chacha20_poly1305_armv8-apple.S
+ gen/crypto/chacha20_poly1305_armv8-linux.S
+ gen/crypto/chacha20_poly1305_armv8-win.S
+ gen/crypto/chacha20_poly1305_x86_64-apple.S
+ gen/crypto/chacha20_poly1305_x86_64-linux.S
+ third_party/fiat/asm/fiat_curve25519_adx_mul.S
+ third_party/fiat/asm/fiat_curve25519_adx_square.S
+ third_party/fiat/asm/fiat_p256_adx_mul.S
+ third_party/fiat/asm/fiat_p256_adx_sqr.S
+)
+
+set(
+ CRYPTO_SOURCES_NASM
+
+ gen/crypto/aes128gcmsiv-x86_64-win.asm
+ gen/crypto/chacha-x86-win.asm
+ gen/crypto/chacha-x86_64-win.asm
+ gen/crypto/chacha20_poly1305_x86_64-win.asm
+)
+
+set(
+ TEST_SUPPORT_SOURCES_ASM
+
+ gen/test_support/trampoline-armv4-linux.S
+ gen/test_support/trampoline-armv8-apple.S
+ gen/test_support/trampoline-armv8-linux.S
+ gen/test_support/trampoline-armv8-win.S
+ gen/test_support/trampoline-x86-apple.S
+ gen/test_support/trampoline-x86-linux.S
+ gen/test_support/trampoline-x86_64-apple.S
+ gen/test_support/trampoline-x86_64-linux.S
+)
+
+set(
+ TEST_SUPPORT_SOURCES_NASM
+
+ gen/test_support/trampoline-x86-win.asm
+ gen/test_support/trampoline-x86_64-win.asm
+)
diff --git a/gen/sources.json b/gen/sources.json
new file mode 100644
index 0000000..785ac73
--- /dev/null
+++ b/gen/sources.json
@@ -0,0 +1,182 @@
+{
+ "bcm": {
+ "asm": [
+ "gen/bcm/aesni-gcm-x86_64-apple.S",
+ "gen/bcm/aesni-gcm-x86_64-linux.S",
+ "gen/bcm/aesni-x86-apple.S",
+ "gen/bcm/aesni-x86-linux.S",
+ "gen/bcm/aesni-x86_64-apple.S",
+ "gen/bcm/aesni-x86_64-linux.S",
+ "gen/bcm/aesv8-armv7-linux.S",
+ "gen/bcm/aesv8-armv8-apple.S",
+ "gen/bcm/aesv8-armv8-linux.S",
+ "gen/bcm/aesv8-armv8-win.S",
+ "gen/bcm/aesv8-gcm-armv8-apple.S",
+ "gen/bcm/aesv8-gcm-armv8-linux.S",
+ "gen/bcm/aesv8-gcm-armv8-win.S",
+ "gen/bcm/armv4-mont-linux.S",
+ "gen/bcm/armv8-mont-apple.S",
+ "gen/bcm/armv8-mont-linux.S",
+ "gen/bcm/armv8-mont-win.S",
+ "gen/bcm/bn-586-apple.S",
+ "gen/bcm/bn-586-linux.S",
+ "gen/bcm/bn-armv8-apple.S",
+ "gen/bcm/bn-armv8-linux.S",
+ "gen/bcm/bn-armv8-win.S",
+ "gen/bcm/bsaes-armv7-linux.S",
+ "gen/bcm/co-586-apple.S",
+ "gen/bcm/co-586-linux.S",
+ "gen/bcm/ghash-armv4-linux.S",
+ "gen/bcm/ghash-neon-armv8-apple.S",
+ "gen/bcm/ghash-neon-armv8-linux.S",
+ "gen/bcm/ghash-neon-armv8-win.S",
+ "gen/bcm/ghash-ssse3-x86-apple.S",
+ "gen/bcm/ghash-ssse3-x86-linux.S",
+ "gen/bcm/ghash-ssse3-x86_64-apple.S",
+ "gen/bcm/ghash-ssse3-x86_64-linux.S",
+ "gen/bcm/ghash-x86-apple.S",
+ "gen/bcm/ghash-x86-linux.S",
+ "gen/bcm/ghash-x86_64-apple.S",
+ "gen/bcm/ghash-x86_64-linux.S",
+ "gen/bcm/ghashv8-armv7-linux.S",
+ "gen/bcm/ghashv8-armv8-apple.S",
+ "gen/bcm/ghashv8-armv8-linux.S",
+ "gen/bcm/ghashv8-armv8-win.S",
+ "gen/bcm/md5-586-apple.S",
+ "gen/bcm/md5-586-linux.S",
+ "gen/bcm/md5-x86_64-apple.S",
+ "gen/bcm/md5-x86_64-linux.S",
+ "gen/bcm/p256-armv8-asm-apple.S",
+ "gen/bcm/p256-armv8-asm-linux.S",
+ "gen/bcm/p256-armv8-asm-win.S",
+ "gen/bcm/p256-x86_64-asm-apple.S",
+ "gen/bcm/p256-x86_64-asm-linux.S",
+ "gen/bcm/p256_beeu-armv8-asm-apple.S",
+ "gen/bcm/p256_beeu-armv8-asm-linux.S",
+ "gen/bcm/p256_beeu-armv8-asm-win.S",
+ "gen/bcm/p256_beeu-x86_64-asm-apple.S",
+ "gen/bcm/p256_beeu-x86_64-asm-linux.S",
+ "gen/bcm/rdrand-x86_64-apple.S",
+ "gen/bcm/rdrand-x86_64-linux.S",
+ "gen/bcm/rsaz-avx2-apple.S",
+ "gen/bcm/rsaz-avx2-linux.S",
+ "gen/bcm/sha1-586-apple.S",
+ "gen/bcm/sha1-586-linux.S",
+ "gen/bcm/sha1-armv4-large-linux.S",
+ "gen/bcm/sha1-armv8-apple.S",
+ "gen/bcm/sha1-armv8-linux.S",
+ "gen/bcm/sha1-armv8-win.S",
+ "gen/bcm/sha1-x86_64-apple.S",
+ "gen/bcm/sha1-x86_64-linux.S",
+ "gen/bcm/sha256-586-apple.S",
+ "gen/bcm/sha256-586-linux.S",
+ "gen/bcm/sha256-armv4-linux.S",
+ "gen/bcm/sha256-armv8-apple.S",
+ "gen/bcm/sha256-armv8-linux.S",
+ "gen/bcm/sha256-armv8-win.S",
+ "gen/bcm/sha256-x86_64-apple.S",
+ "gen/bcm/sha256-x86_64-linux.S",
+ "gen/bcm/sha512-586-apple.S",
+ "gen/bcm/sha512-586-linux.S",
+ "gen/bcm/sha512-armv4-linux.S",
+ "gen/bcm/sha512-armv8-apple.S",
+ "gen/bcm/sha512-armv8-linux.S",
+ "gen/bcm/sha512-armv8-win.S",
+ "gen/bcm/sha512-x86_64-apple.S",
+ "gen/bcm/sha512-x86_64-linux.S",
+ "gen/bcm/vpaes-armv7-linux.S",
+ "gen/bcm/vpaes-armv8-apple.S",
+ "gen/bcm/vpaes-armv8-linux.S",
+ "gen/bcm/vpaes-armv8-win.S",
+ "gen/bcm/vpaes-x86-apple.S",
+ "gen/bcm/vpaes-x86-linux.S",
+ "gen/bcm/vpaes-x86_64-apple.S",
+ "gen/bcm/vpaes-x86_64-linux.S",
+ "gen/bcm/x86-mont-apple.S",
+ "gen/bcm/x86-mont-linux.S",
+ "gen/bcm/x86_64-mont-apple.S",
+ "gen/bcm/x86_64-mont-linux.S",
+ "gen/bcm/x86_64-mont5-apple.S",
+ "gen/bcm/x86_64-mont5-linux.S"
+ ],
+ "nasm": [
+ "gen/bcm/aesni-gcm-x86_64-win.asm",
+ "gen/bcm/aesni-x86-win.asm",
+ "gen/bcm/aesni-x86_64-win.asm",
+ "gen/bcm/bn-586-win.asm",
+ "gen/bcm/co-586-win.asm",
+ "gen/bcm/ghash-ssse3-x86-win.asm",
+ "gen/bcm/ghash-ssse3-x86_64-win.asm",
+ "gen/bcm/ghash-x86-win.asm",
+ "gen/bcm/ghash-x86_64-win.asm",
+ "gen/bcm/md5-586-win.asm",
+ "gen/bcm/md5-x86_64-win.asm",
+ "gen/bcm/p256-x86_64-asm-win.asm",
+ "gen/bcm/p256_beeu-x86_64-asm-win.asm",
+ "gen/bcm/rdrand-x86_64-win.asm",
+ "gen/bcm/rsaz-avx2-win.asm",
+ "gen/bcm/sha1-586-win.asm",
+ "gen/bcm/sha1-x86_64-win.asm",
+ "gen/bcm/sha256-586-win.asm",
+ "gen/bcm/sha256-x86_64-win.asm",
+ "gen/bcm/sha512-586-win.asm",
+ "gen/bcm/sha512-x86_64-win.asm",
+ "gen/bcm/vpaes-x86-win.asm",
+ "gen/bcm/vpaes-x86_64-win.asm",
+ "gen/bcm/x86-mont-win.asm",
+ "gen/bcm/x86_64-mont-win.asm",
+ "gen/bcm/x86_64-mont5-win.asm"
+ ]
+ },
+ "crypto": {
+ "srcs": [
+ "gen/crypto/err_data.c"
+ ],
+ "asm": [
+ "crypto/curve25519/asm/x25519-asm-arm.S",
+ "crypto/hrss/asm/poly_rq_mul.S",
+ "crypto/poly1305/poly1305_arm_asm.S",
+ "gen/crypto/aes128gcmsiv-x86_64-apple.S",
+ "gen/crypto/aes128gcmsiv-x86_64-linux.S",
+ "gen/crypto/chacha-armv4-linux.S",
+ "gen/crypto/chacha-armv8-apple.S",
+ "gen/crypto/chacha-armv8-linux.S",
+ "gen/crypto/chacha-armv8-win.S",
+ "gen/crypto/chacha-x86-apple.S",
+ "gen/crypto/chacha-x86-linux.S",
+ "gen/crypto/chacha-x86_64-apple.S",
+ "gen/crypto/chacha-x86_64-linux.S",
+ "gen/crypto/chacha20_poly1305_armv8-apple.S",
+ "gen/crypto/chacha20_poly1305_armv8-linux.S",
+ "gen/crypto/chacha20_poly1305_armv8-win.S",
+ "gen/crypto/chacha20_poly1305_x86_64-apple.S",
+ "gen/crypto/chacha20_poly1305_x86_64-linux.S",
+ "third_party/fiat/asm/fiat_curve25519_adx_mul.S",
+ "third_party/fiat/asm/fiat_curve25519_adx_square.S",
+ "third_party/fiat/asm/fiat_p256_adx_mul.S",
+ "third_party/fiat/asm/fiat_p256_adx_sqr.S"
+ ],
+ "nasm": [
+ "gen/crypto/aes128gcmsiv-x86_64-win.asm",
+ "gen/crypto/chacha-x86-win.asm",
+ "gen/crypto/chacha-x86_64-win.asm",
+ "gen/crypto/chacha20_poly1305_x86_64-win.asm"
+ ]
+ },
+ "test_support": {
+ "asm": [
+ "gen/test_support/trampoline-armv4-linux.S",
+ "gen/test_support/trampoline-armv8-apple.S",
+ "gen/test_support/trampoline-armv8-linux.S",
+ "gen/test_support/trampoline-armv8-win.S",
+ "gen/test_support/trampoline-x86-apple.S",
+ "gen/test_support/trampoline-x86-linux.S",
+ "gen/test_support/trampoline-x86_64-apple.S",
+ "gen/test_support/trampoline-x86_64-linux.S"
+ ],
+ "nasm": [
+ "gen/test_support/trampoline-x86-win.asm",
+ "gen/test_support/trampoline-x86_64-win.asm"
+ ]
+ }
+}
\ No newline at end of file
diff --git a/gen/test_support/trampoline-armv4-linux.S b/gen/test_support/trampoline-armv4-linux.S
new file mode 100644
index 0000000..34a2819
--- /dev/null
+++ b/gen/test_support/trampoline-armv4-linux.S
@@ -0,0 +1,368 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__)
+.syntax unified
+
+.arch armv7-a
+.fpu vfp
+
+.text
+
+@ abi_test_trampoline loads callee-saved registers from |state|, calls |func|
+@ with |argv|, then saves the callee-saved registers into |state|. It returns
+@ the result of |func|. The |unwind| argument is unused.
+@ uint32_t abi_test_trampoline(void (*func)(...), CallerState *state,
+@ const uint32_t *argv, size_t argc,
+@ int unwind);
+.type abi_test_trampoline, %function
+.globl abi_test_trampoline
+.hidden abi_test_trampoline
+.align 4
+abi_test_trampoline:
+ @ Save parameters and all callee-saved registers. For convenience, we
+ @ save r9 on iOS even though it's volatile.
+ vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+ stmdb sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
+
+ @ Reserve stack space for six (10-4) stack parameters, plus an extra 4
+ @ bytes to keep it 8-byte-aligned (see AAPCS, section 5.3).
+ sub sp, sp, #28
+
+ @ Every register in AAPCS is either non-volatile or a parameter (except
+ @ r9 on iOS), so this code, by the actual call, loses all its scratch
+ @ registers. First fill in stack parameters while there are registers
+ @ to spare.
+ cmp r3, #4
+ bls .Lstack_args_done
+ mov r4, sp @ r4 is the output pointer.
+ add r5, r2, r3, lsl #2 @ Set r5 to the end of argv.
+ add r2, r2, #16 @ Skip four arguments.
+.Lstack_args_loop:
+ ldr r6, [r2], #4
+ cmp r2, r5
+ str r6, [r4], #4
+ bne .Lstack_args_loop
+
+.Lstack_args_done:
+ @ Load registers from |r1|.
+ vldmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15}
+#if defined(__APPLE__)
+ @ r9 is not volatile on iOS.
+ ldmia r1!, {r4,r5,r6,r7,r8,r10-r11}
+#else
+ ldmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11}
+#endif
+
+ @ Load register parameters. This uses up our remaining registers, so we
+ @ repurpose lr as scratch space.
+ ldr r3, [sp, #40] @ Reload argc.
+ ldr lr, [sp, #36] @ Load argv into lr.
+ cmp r3, #3
+ bhi .Larg_r3
+ beq .Larg_r2
+ cmp r3, #1
+ bhi .Larg_r1
+ beq .Larg_r0
+ b .Largs_done
+
+.Larg_r3:
+ ldr r3, [lr, #12] @ argv[3]
+.Larg_r2:
+ ldr r2, [lr, #8] @ argv[2]
+.Larg_r1:
+ ldr r1, [lr, #4] @ argv[1]
+.Larg_r0:
+ ldr r0, [lr] @ argv[0]
+.Largs_done:
+
+ @ With every other register in use, load the function pointer into lr
+ @ and call the function.
+ ldr lr, [sp, #28]
+ blx lr
+
+ @ r1-r3 are free for use again. The trampoline only supports
+ @ single-return functions. Pass r4-r11 to the caller.
+ ldr r1, [sp, #32]
+ vstmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15}
+#if defined(__APPLE__)
+ @ r9 is not volatile on iOS.
+ stmia r1!, {r4,r5,r6,r7,r8,r10-r11}
+#else
+ stmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11}
+#endif
+
+ @ Unwind the stack and restore registers.
+ add sp, sp, #44 @ 44 = 28+16
+ ldmia sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr} @ Skip r0-r3 (see +16 above).
+ vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+ bx lr
+.size abi_test_trampoline,.-abi_test_trampoline
+.type abi_test_clobber_r0, %function
+.globl abi_test_clobber_r0
+.hidden abi_test_clobber_r0
+.align 4
+abi_test_clobber_r0:
+ mov r0, #0
+ bx lr
+.size abi_test_clobber_r0,.-abi_test_clobber_r0
+.type abi_test_clobber_r1, %function
+.globl abi_test_clobber_r1
+.hidden abi_test_clobber_r1
+.align 4
+abi_test_clobber_r1:
+ mov r1, #0
+ bx lr
+.size abi_test_clobber_r1,.-abi_test_clobber_r1
+.type abi_test_clobber_r2, %function
+.globl abi_test_clobber_r2
+.hidden abi_test_clobber_r2
+.align 4
+abi_test_clobber_r2:
+ mov r2, #0
+ bx lr
+.size abi_test_clobber_r2,.-abi_test_clobber_r2
+.type abi_test_clobber_r3, %function
+.globl abi_test_clobber_r3
+.hidden abi_test_clobber_r3
+.align 4
+abi_test_clobber_r3:
+ mov r3, #0
+ bx lr
+.size abi_test_clobber_r3,.-abi_test_clobber_r3
+.type abi_test_clobber_r4, %function
+.globl abi_test_clobber_r4
+.hidden abi_test_clobber_r4
+.align 4
+abi_test_clobber_r4:
+ mov r4, #0
+ bx lr
+.size abi_test_clobber_r4,.-abi_test_clobber_r4
+.type abi_test_clobber_r5, %function
+.globl abi_test_clobber_r5
+.hidden abi_test_clobber_r5
+.align 4
+abi_test_clobber_r5:
+ mov r5, #0
+ bx lr
+.size abi_test_clobber_r5,.-abi_test_clobber_r5
+.type abi_test_clobber_r6, %function
+.globl abi_test_clobber_r6
+.hidden abi_test_clobber_r6
+.align 4
+abi_test_clobber_r6:
+ mov r6, #0
+ bx lr
+.size abi_test_clobber_r6,.-abi_test_clobber_r6
+.type abi_test_clobber_r7, %function
+.globl abi_test_clobber_r7
+.hidden abi_test_clobber_r7
+.align 4
+abi_test_clobber_r7:
+ mov r7, #0
+ bx lr
+.size abi_test_clobber_r7,.-abi_test_clobber_r7
+.type abi_test_clobber_r8, %function
+.globl abi_test_clobber_r8
+.hidden abi_test_clobber_r8
+.align 4
+abi_test_clobber_r8:
+ mov r8, #0
+ bx lr
+.size abi_test_clobber_r8,.-abi_test_clobber_r8
+.type abi_test_clobber_r9, %function
+.globl abi_test_clobber_r9
+.hidden abi_test_clobber_r9
+.align 4
+abi_test_clobber_r9:
+ mov r9, #0
+ bx lr
+.size abi_test_clobber_r9,.-abi_test_clobber_r9
+.type abi_test_clobber_r10, %function
+.globl abi_test_clobber_r10
+.hidden abi_test_clobber_r10
+.align 4
+abi_test_clobber_r10:
+ mov r10, #0
+ bx lr
+.size abi_test_clobber_r10,.-abi_test_clobber_r10
+.type abi_test_clobber_r11, %function
+.globl abi_test_clobber_r11
+.hidden abi_test_clobber_r11
+.align 4
+abi_test_clobber_r11:
+ mov r11, #0
+ bx lr
+.size abi_test_clobber_r11,.-abi_test_clobber_r11
+.type abi_test_clobber_r12, %function
+.globl abi_test_clobber_r12
+.hidden abi_test_clobber_r12
+.align 4
+abi_test_clobber_r12:
+ mov r12, #0
+ bx lr
+.size abi_test_clobber_r12,.-abi_test_clobber_r12
+.type abi_test_clobber_d0, %function
+.globl abi_test_clobber_d0
+.hidden abi_test_clobber_d0
+.align 4
+abi_test_clobber_d0:
+ mov r0, #0
+ vmov s0, r0
+ vmov s1, r0
+ bx lr
+.size abi_test_clobber_d0,.-abi_test_clobber_d0
+.type abi_test_clobber_d1, %function
+.globl abi_test_clobber_d1
+.hidden abi_test_clobber_d1
+.align 4
+abi_test_clobber_d1:
+ mov r0, #0
+ vmov s2, r0
+ vmov s3, r0
+ bx lr
+.size abi_test_clobber_d1,.-abi_test_clobber_d1
+.type abi_test_clobber_d2, %function
+.globl abi_test_clobber_d2
+.hidden abi_test_clobber_d2
+.align 4
+abi_test_clobber_d2:
+ mov r0, #0
+ vmov s4, r0
+ vmov s5, r0
+ bx lr
+.size abi_test_clobber_d2,.-abi_test_clobber_d2
+.type abi_test_clobber_d3, %function
+.globl abi_test_clobber_d3
+.hidden abi_test_clobber_d3
+.align 4
+abi_test_clobber_d3:
+ mov r0, #0
+ vmov s6, r0
+ vmov s7, r0
+ bx lr
+.size abi_test_clobber_d3,.-abi_test_clobber_d3
+.type abi_test_clobber_d4, %function
+.globl abi_test_clobber_d4
+.hidden abi_test_clobber_d4
+.align 4
+abi_test_clobber_d4:
+ mov r0, #0
+ vmov s8, r0
+ vmov s9, r0
+ bx lr
+.size abi_test_clobber_d4,.-abi_test_clobber_d4
+.type abi_test_clobber_d5, %function
+.globl abi_test_clobber_d5
+.hidden abi_test_clobber_d5
+.align 4
+abi_test_clobber_d5:
+ mov r0, #0
+ vmov s10, r0
+ vmov s11, r0
+ bx lr
+.size abi_test_clobber_d5,.-abi_test_clobber_d5
+.type abi_test_clobber_d6, %function
+.globl abi_test_clobber_d6
+.hidden abi_test_clobber_d6
+.align 4
+abi_test_clobber_d6:
+ mov r0, #0
+ vmov s12, r0
+ vmov s13, r0
+ bx lr
+.size abi_test_clobber_d6,.-abi_test_clobber_d6
+.type abi_test_clobber_d7, %function
+.globl abi_test_clobber_d7
+.hidden abi_test_clobber_d7
+.align 4
+abi_test_clobber_d7:
+ mov r0, #0
+ vmov s14, r0
+ vmov s15, r0
+ bx lr
+.size abi_test_clobber_d7,.-abi_test_clobber_d7
+.type abi_test_clobber_d8, %function
+.globl abi_test_clobber_d8
+.hidden abi_test_clobber_d8
+.align 4
+abi_test_clobber_d8:
+ mov r0, #0
+ vmov s16, r0
+ vmov s17, r0
+ bx lr
+.size abi_test_clobber_d8,.-abi_test_clobber_d8
+.type abi_test_clobber_d9, %function
+.globl abi_test_clobber_d9
+.hidden abi_test_clobber_d9
+.align 4
+abi_test_clobber_d9:
+ mov r0, #0
+ vmov s18, r0
+ vmov s19, r0
+ bx lr
+.size abi_test_clobber_d9,.-abi_test_clobber_d9
+.type abi_test_clobber_d10, %function
+.globl abi_test_clobber_d10
+.hidden abi_test_clobber_d10
+.align 4
+abi_test_clobber_d10:
+ mov r0, #0
+ vmov s20, r0
+ vmov s21, r0
+ bx lr
+.size abi_test_clobber_d10,.-abi_test_clobber_d10
+.type abi_test_clobber_d11, %function
+.globl abi_test_clobber_d11
+.hidden abi_test_clobber_d11
+.align 4
+abi_test_clobber_d11:
+ mov r0, #0
+ vmov s22, r0
+ vmov s23, r0
+ bx lr
+.size abi_test_clobber_d11,.-abi_test_clobber_d11
+.type abi_test_clobber_d12, %function
+.globl abi_test_clobber_d12
+.hidden abi_test_clobber_d12
+.align 4
+abi_test_clobber_d12:
+ mov r0, #0
+ vmov s24, r0
+ vmov s25, r0
+ bx lr
+.size abi_test_clobber_d12,.-abi_test_clobber_d12
+.type abi_test_clobber_d13, %function
+.globl abi_test_clobber_d13
+.hidden abi_test_clobber_d13
+.align 4
+abi_test_clobber_d13:
+ mov r0, #0
+ vmov s26, r0
+ vmov s27, r0
+ bx lr
+.size abi_test_clobber_d13,.-abi_test_clobber_d13
+.type abi_test_clobber_d14, %function
+.globl abi_test_clobber_d14
+.hidden abi_test_clobber_d14
+.align 4
+abi_test_clobber_d14:
+ mov r0, #0
+ vmov s28, r0
+ vmov s29, r0
+ bx lr
+.size abi_test_clobber_d14,.-abi_test_clobber_d14
+.type abi_test_clobber_d15, %function
+.globl abi_test_clobber_d15
+.hidden abi_test_clobber_d15
+.align 4
+abi_test_clobber_d15:
+ mov r0, #0
+ vmov s30, r0
+ vmov s31, r0
+ bx lr
+.size abi_test_clobber_d15,.-abi_test_clobber_d15
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__)
diff --git a/gen/test_support/trampoline-armv8-apple.S b/gen/test_support/trampoline-armv8-apple.S
new file mode 100644
index 0000000..99055e0
--- /dev/null
+++ b/gen/test_support/trampoline-armv8-apple.S
@@ -0,0 +1,750 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
+#include <openssl/arm_arch.h>
+
+.text
+
+// abi_test_trampoline loads callee-saved registers from |state|, calls |func|
+// with |argv|, then saves the callee-saved registers into |state|. It returns
+// the result of |func|. The |unwind| argument is unused.
+// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state,
+// const uint64_t *argv, size_t argc,
+// uint64_t unwind);
+
+.globl _abi_test_trampoline
+.private_extern _abi_test_trampoline
+.align 4
+_abi_test_trampoline:
+Labi_test_trampoline_begin:
+ AARCH64_SIGN_LINK_REGISTER
+ // Stack layout (low to high addresses)
+ // x29,x30 (16 bytes)
+ // d8-d15 (64 bytes)
+ // x19-x28 (80 bytes)
+ // x1 (8 bytes)
+ // padding (8 bytes)
+ stp x29, x30, [sp, #-176]!
+ mov x29, sp
+
+ // Saved callee-saved registers and |state|.
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x19, x20, [sp, #80]
+ stp x21, x22, [sp, #96]
+ stp x23, x24, [sp, #112]
+ stp x25, x26, [sp, #128]
+ stp x27, x28, [sp, #144]
+ str x1, [sp, #160]
+
+ // Load registers from |state|, with the exception of x29. x29 is the
+ // frame pointer and also callee-saved, but AAPCS64 allows platforms to
+ // mandate that x29 always point to a frame. iOS64 does so, which means
+ // we cannot fill x29 with entropy without violating ABI rules
+ // ourselves. x29 is tested separately below.
+ ldp d8, d9, [x1], #16
+ ldp d10, d11, [x1], #16
+ ldp d12, d13, [x1], #16
+ ldp d14, d15, [x1], #16
+ ldp x19, x20, [x1], #16
+ ldp x21, x22, [x1], #16
+ ldp x23, x24, [x1], #16
+ ldp x25, x26, [x1], #16
+ ldp x27, x28, [x1], #16
+
+ // Move parameters into temporary registers.
+ mov x9, x0
+ mov x10, x2
+ mov x11, x3
+
+ // Load parameters into registers.
+ cbz x11, Largs_done
+ ldr x0, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x1, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x2, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x3, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x4, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x5, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x6, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x7, [x10], #8
+
+Largs_done:
+ blr x9
+
+ // Reload |state| and store registers.
+ ldr x1, [sp, #160]
+ stp d8, d9, [x1], #16
+ stp d10, d11, [x1], #16
+ stp d12, d13, [x1], #16
+ stp d14, d15, [x1], #16
+ stp x19, x20, [x1], #16
+ stp x21, x22, [x1], #16
+ stp x23, x24, [x1], #16
+ stp x25, x26, [x1], #16
+ stp x27, x28, [x1], #16
+
+ // |func| is required to preserve x29, the frame pointer. We cannot load
+ // random values into x29 (see comment above), so compare it against the
+ // expected value and zero the field of |state| if corrupted.
+ mov x9, sp
+ cmp x29, x9
+ b.eq Lx29_ok
+ str xzr, [x1]
+
+Lx29_ok:
+ // Restore callee-saved registers.
+ ldp d8, d9, [sp, #16]
+ ldp d10, d11, [sp, #32]
+ ldp d12, d13, [sp, #48]
+ ldp d14, d15, [sp, #64]
+ ldp x19, x20, [sp, #80]
+ ldp x21, x22, [sp, #96]
+ ldp x23, x24, [sp, #112]
+ ldp x25, x26, [sp, #128]
+ ldp x27, x28, [sp, #144]
+
+ ldp x29, x30, [sp], #176
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.globl _abi_test_clobber_x0
+.private_extern _abi_test_clobber_x0
+.align 4
+_abi_test_clobber_x0:
+ AARCH64_VALID_CALL_TARGET
+ mov x0, xzr
+ ret
+
+
+.globl _abi_test_clobber_x1
+.private_extern _abi_test_clobber_x1
+.align 4
+_abi_test_clobber_x1:
+ AARCH64_VALID_CALL_TARGET
+ mov x1, xzr
+ ret
+
+
+.globl _abi_test_clobber_x2
+.private_extern _abi_test_clobber_x2
+.align 4
+_abi_test_clobber_x2:
+ AARCH64_VALID_CALL_TARGET
+ mov x2, xzr
+ ret
+
+
+.globl _abi_test_clobber_x3
+.private_extern _abi_test_clobber_x3
+.align 4
+_abi_test_clobber_x3:
+ AARCH64_VALID_CALL_TARGET
+ mov x3, xzr
+ ret
+
+
+.globl _abi_test_clobber_x4
+.private_extern _abi_test_clobber_x4
+.align 4
+_abi_test_clobber_x4:
+ AARCH64_VALID_CALL_TARGET
+ mov x4, xzr
+ ret
+
+
+.globl _abi_test_clobber_x5
+.private_extern _abi_test_clobber_x5
+.align 4
+_abi_test_clobber_x5:
+ AARCH64_VALID_CALL_TARGET
+ mov x5, xzr
+ ret
+
+
+.globl _abi_test_clobber_x6
+.private_extern _abi_test_clobber_x6
+.align 4
+_abi_test_clobber_x6:
+ AARCH64_VALID_CALL_TARGET
+ mov x6, xzr
+ ret
+
+
+.globl _abi_test_clobber_x7
+.private_extern _abi_test_clobber_x7
+.align 4
+_abi_test_clobber_x7:
+ AARCH64_VALID_CALL_TARGET
+ mov x7, xzr
+ ret
+
+
+.globl _abi_test_clobber_x8
+.private_extern _abi_test_clobber_x8
+.align 4
+_abi_test_clobber_x8:
+ AARCH64_VALID_CALL_TARGET
+ mov x8, xzr
+ ret
+
+
+.globl _abi_test_clobber_x9
+.private_extern _abi_test_clobber_x9
+.align 4
+_abi_test_clobber_x9:
+ AARCH64_VALID_CALL_TARGET
+ mov x9, xzr
+ ret
+
+
+.globl _abi_test_clobber_x10
+.private_extern _abi_test_clobber_x10
+.align 4
+_abi_test_clobber_x10:
+ AARCH64_VALID_CALL_TARGET
+ mov x10, xzr
+ ret
+
+
+.globl _abi_test_clobber_x11
+.private_extern _abi_test_clobber_x11
+.align 4
+_abi_test_clobber_x11:
+ AARCH64_VALID_CALL_TARGET
+ mov x11, xzr
+ ret
+
+
+.globl _abi_test_clobber_x12
+.private_extern _abi_test_clobber_x12
+.align 4
+_abi_test_clobber_x12:
+ AARCH64_VALID_CALL_TARGET
+ mov x12, xzr
+ ret
+
+
+.globl _abi_test_clobber_x13
+.private_extern _abi_test_clobber_x13
+.align 4
+_abi_test_clobber_x13:
+ AARCH64_VALID_CALL_TARGET
+ mov x13, xzr
+ ret
+
+
+.globl _abi_test_clobber_x14
+.private_extern _abi_test_clobber_x14
+.align 4
+_abi_test_clobber_x14:
+ AARCH64_VALID_CALL_TARGET
+ mov x14, xzr
+ ret
+
+
+.globl _abi_test_clobber_x15
+.private_extern _abi_test_clobber_x15
+.align 4
+_abi_test_clobber_x15:
+ AARCH64_VALID_CALL_TARGET
+ mov x15, xzr
+ ret
+
+
+.globl _abi_test_clobber_x16
+.private_extern _abi_test_clobber_x16
+.align 4
+_abi_test_clobber_x16:
+ AARCH64_VALID_CALL_TARGET
+ mov x16, xzr
+ ret
+
+
+.globl _abi_test_clobber_x17
+.private_extern _abi_test_clobber_x17
+.align 4
+_abi_test_clobber_x17:
+ AARCH64_VALID_CALL_TARGET
+ mov x17, xzr
+ ret
+
+
+.globl _abi_test_clobber_x19
+.private_extern _abi_test_clobber_x19
+.align 4
+_abi_test_clobber_x19:
+ AARCH64_VALID_CALL_TARGET
+ mov x19, xzr
+ ret
+
+
+.globl _abi_test_clobber_x20
+.private_extern _abi_test_clobber_x20
+.align 4
+_abi_test_clobber_x20:
+ AARCH64_VALID_CALL_TARGET
+ mov x20, xzr
+ ret
+
+
+.globl _abi_test_clobber_x21
+.private_extern _abi_test_clobber_x21
+.align 4
+_abi_test_clobber_x21:
+ AARCH64_VALID_CALL_TARGET
+ mov x21, xzr
+ ret
+
+
+.globl _abi_test_clobber_x22
+.private_extern _abi_test_clobber_x22
+.align 4
+_abi_test_clobber_x22:
+ AARCH64_VALID_CALL_TARGET
+ mov x22, xzr
+ ret
+
+
+.globl _abi_test_clobber_x23
+.private_extern _abi_test_clobber_x23
+.align 4
+_abi_test_clobber_x23:
+ AARCH64_VALID_CALL_TARGET
+ mov x23, xzr
+ ret
+
+
+.globl _abi_test_clobber_x24
+.private_extern _abi_test_clobber_x24
+.align 4
+_abi_test_clobber_x24:
+ AARCH64_VALID_CALL_TARGET
+ mov x24, xzr
+ ret
+
+
+.globl _abi_test_clobber_x25
+.private_extern _abi_test_clobber_x25
+.align 4
+_abi_test_clobber_x25:
+ AARCH64_VALID_CALL_TARGET
+ mov x25, xzr
+ ret
+
+
+.globl _abi_test_clobber_x26
+.private_extern _abi_test_clobber_x26
+.align 4
+_abi_test_clobber_x26:
+ AARCH64_VALID_CALL_TARGET
+ mov x26, xzr
+ ret
+
+
+.globl _abi_test_clobber_x27
+.private_extern _abi_test_clobber_x27
+.align 4
+_abi_test_clobber_x27:
+ AARCH64_VALID_CALL_TARGET
+ mov x27, xzr
+ ret
+
+
+.globl _abi_test_clobber_x28
+.private_extern _abi_test_clobber_x28
+.align 4
+_abi_test_clobber_x28:
+ AARCH64_VALID_CALL_TARGET
+ mov x28, xzr
+ ret
+
+
+.globl _abi_test_clobber_x29
+.private_extern _abi_test_clobber_x29
+.align 4
+_abi_test_clobber_x29:
+ AARCH64_VALID_CALL_TARGET
+ mov x29, xzr
+ ret
+
+
+.globl _abi_test_clobber_d0
+.private_extern _abi_test_clobber_d0
+.align 4
+_abi_test_clobber_d0:
+ AARCH64_VALID_CALL_TARGET
+ fmov d0, xzr
+ ret
+
+
+.globl _abi_test_clobber_d1
+.private_extern _abi_test_clobber_d1
+.align 4
+_abi_test_clobber_d1:
+ AARCH64_VALID_CALL_TARGET
+ fmov d1, xzr
+ ret
+
+
+.globl _abi_test_clobber_d2
+.private_extern _abi_test_clobber_d2
+.align 4
+_abi_test_clobber_d2:
+ AARCH64_VALID_CALL_TARGET
+ fmov d2, xzr
+ ret
+
+
+.globl _abi_test_clobber_d3
+.private_extern _abi_test_clobber_d3
+.align 4
+_abi_test_clobber_d3:
+ AARCH64_VALID_CALL_TARGET
+ fmov d3, xzr
+ ret
+
+
+.globl _abi_test_clobber_d4
+.private_extern _abi_test_clobber_d4
+.align 4
+_abi_test_clobber_d4:
+ AARCH64_VALID_CALL_TARGET
+ fmov d4, xzr
+ ret
+
+
+.globl _abi_test_clobber_d5
+.private_extern _abi_test_clobber_d5
+.align 4
+_abi_test_clobber_d5:
+ AARCH64_VALID_CALL_TARGET
+ fmov d5, xzr
+ ret
+
+
+.globl _abi_test_clobber_d6
+.private_extern _abi_test_clobber_d6
+.align 4
+_abi_test_clobber_d6:
+ AARCH64_VALID_CALL_TARGET
+ fmov d6, xzr
+ ret
+
+
+.globl _abi_test_clobber_d7
+.private_extern _abi_test_clobber_d7
+.align 4
+_abi_test_clobber_d7:
+ AARCH64_VALID_CALL_TARGET
+ fmov d7, xzr
+ ret
+
+
+.globl _abi_test_clobber_d8
+.private_extern _abi_test_clobber_d8
+.align 4
+_abi_test_clobber_d8:
+ AARCH64_VALID_CALL_TARGET
+ fmov d8, xzr
+ ret
+
+
+.globl _abi_test_clobber_d9
+.private_extern _abi_test_clobber_d9
+.align 4
+_abi_test_clobber_d9:
+ AARCH64_VALID_CALL_TARGET
+ fmov d9, xzr
+ ret
+
+
+.globl _abi_test_clobber_d10
+.private_extern _abi_test_clobber_d10
+.align 4
+_abi_test_clobber_d10:
+ AARCH64_VALID_CALL_TARGET
+ fmov d10, xzr
+ ret
+
+
+.globl _abi_test_clobber_d11
+.private_extern _abi_test_clobber_d11
+.align 4
+_abi_test_clobber_d11:
+ AARCH64_VALID_CALL_TARGET
+ fmov d11, xzr
+ ret
+
+
+.globl _abi_test_clobber_d12
+.private_extern _abi_test_clobber_d12
+.align 4
+_abi_test_clobber_d12:
+ AARCH64_VALID_CALL_TARGET
+ fmov d12, xzr
+ ret
+
+
+.globl _abi_test_clobber_d13
+.private_extern _abi_test_clobber_d13
+.align 4
+_abi_test_clobber_d13:
+ AARCH64_VALID_CALL_TARGET
+ fmov d13, xzr
+ ret
+
+
+.globl _abi_test_clobber_d14
+.private_extern _abi_test_clobber_d14
+.align 4
+_abi_test_clobber_d14:
+ AARCH64_VALID_CALL_TARGET
+ fmov d14, xzr
+ ret
+
+
+.globl _abi_test_clobber_d15
+.private_extern _abi_test_clobber_d15
+.align 4
+_abi_test_clobber_d15:
+ AARCH64_VALID_CALL_TARGET
+ fmov d15, xzr
+ ret
+
+
+.globl _abi_test_clobber_d16
+.private_extern _abi_test_clobber_d16
+.align 4
+_abi_test_clobber_d16:
+ AARCH64_VALID_CALL_TARGET
+ fmov d16, xzr
+ ret
+
+
+.globl _abi_test_clobber_d17
+.private_extern _abi_test_clobber_d17
+.align 4
+_abi_test_clobber_d17:
+ AARCH64_VALID_CALL_TARGET
+ fmov d17, xzr
+ ret
+
+
+.globl _abi_test_clobber_d18
+.private_extern _abi_test_clobber_d18
+.align 4
+_abi_test_clobber_d18:
+ AARCH64_VALID_CALL_TARGET
+ fmov d18, xzr
+ ret
+
+
+.globl _abi_test_clobber_d19
+.private_extern _abi_test_clobber_d19
+.align 4
+_abi_test_clobber_d19:
+ AARCH64_VALID_CALL_TARGET
+ fmov d19, xzr
+ ret
+
+
+.globl _abi_test_clobber_d20
+.private_extern _abi_test_clobber_d20
+.align 4
+_abi_test_clobber_d20:
+ AARCH64_VALID_CALL_TARGET
+ fmov d20, xzr
+ ret
+
+
+.globl _abi_test_clobber_d21
+.private_extern _abi_test_clobber_d21
+.align 4
+_abi_test_clobber_d21:
+ AARCH64_VALID_CALL_TARGET
+ fmov d21, xzr
+ ret
+
+
+.globl _abi_test_clobber_d22
+.private_extern _abi_test_clobber_d22
+.align 4
+_abi_test_clobber_d22:
+ AARCH64_VALID_CALL_TARGET
+ fmov d22, xzr
+ ret
+
+
+.globl _abi_test_clobber_d23
+.private_extern _abi_test_clobber_d23
+.align 4
+_abi_test_clobber_d23:
+ AARCH64_VALID_CALL_TARGET
+ fmov d23, xzr
+ ret
+
+
+.globl _abi_test_clobber_d24
+.private_extern _abi_test_clobber_d24
+.align 4
+_abi_test_clobber_d24:
+ AARCH64_VALID_CALL_TARGET
+ fmov d24, xzr
+ ret
+
+
+.globl _abi_test_clobber_d25
+.private_extern _abi_test_clobber_d25
+.align 4
+_abi_test_clobber_d25:
+ AARCH64_VALID_CALL_TARGET
+ fmov d25, xzr
+ ret
+
+
+.globl _abi_test_clobber_d26
+.private_extern _abi_test_clobber_d26
+.align 4
+_abi_test_clobber_d26:
+ AARCH64_VALID_CALL_TARGET
+ fmov d26, xzr
+ ret
+
+
+.globl _abi_test_clobber_d27
+.private_extern _abi_test_clobber_d27
+.align 4
+_abi_test_clobber_d27:
+ AARCH64_VALID_CALL_TARGET
+ fmov d27, xzr
+ ret
+
+
+.globl _abi_test_clobber_d28
+.private_extern _abi_test_clobber_d28
+.align 4
+_abi_test_clobber_d28:
+ AARCH64_VALID_CALL_TARGET
+ fmov d28, xzr
+ ret
+
+
+.globl _abi_test_clobber_d29
+.private_extern _abi_test_clobber_d29
+.align 4
+_abi_test_clobber_d29:
+ AARCH64_VALID_CALL_TARGET
+ fmov d29, xzr
+ ret
+
+
+.globl _abi_test_clobber_d30
+.private_extern _abi_test_clobber_d30
+.align 4
+_abi_test_clobber_d30:
+ AARCH64_VALID_CALL_TARGET
+ fmov d30, xzr
+ ret
+
+
+.globl _abi_test_clobber_d31
+.private_extern _abi_test_clobber_d31
+.align 4
+_abi_test_clobber_d31:
+ AARCH64_VALID_CALL_TARGET
+ fmov d31, xzr
+ ret
+
+
+.globl _abi_test_clobber_v8_upper
+.private_extern _abi_test_clobber_v8_upper
+.align 4
+_abi_test_clobber_v8_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v8.d[1], xzr
+ ret
+
+
+.globl _abi_test_clobber_v9_upper
+.private_extern _abi_test_clobber_v9_upper
+.align 4
+_abi_test_clobber_v9_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v9.d[1], xzr
+ ret
+
+
+.globl _abi_test_clobber_v10_upper
+.private_extern _abi_test_clobber_v10_upper
+.align 4
+_abi_test_clobber_v10_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v10.d[1], xzr
+ ret
+
+
+.globl _abi_test_clobber_v11_upper
+.private_extern _abi_test_clobber_v11_upper
+.align 4
+_abi_test_clobber_v11_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v11.d[1], xzr
+ ret
+
+
+.globl _abi_test_clobber_v12_upper
+.private_extern _abi_test_clobber_v12_upper
+.align 4
+_abi_test_clobber_v12_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v12.d[1], xzr
+ ret
+
+
+.globl _abi_test_clobber_v13_upper
+.private_extern _abi_test_clobber_v13_upper
+.align 4
+_abi_test_clobber_v13_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v13.d[1], xzr
+ ret
+
+
+.globl _abi_test_clobber_v14_upper
+.private_extern _abi_test_clobber_v14_upper
+.align 4
+_abi_test_clobber_v14_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v14.d[1], xzr
+ ret
+
+
+.globl _abi_test_clobber_v15_upper
+.private_extern _abi_test_clobber_v15_upper
+.align 4
+_abi_test_clobber_v15_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v15.d[1], xzr
+ ret
+
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
diff --git a/gen/test_support/trampoline-armv8-linux.S b/gen/test_support/trampoline-armv8-linux.S
new file mode 100644
index 0000000..58b4b93
--- /dev/null
+++ b/gen/test_support/trampoline-armv8-linux.S
@@ -0,0 +1,750 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
+#include <openssl/arm_arch.h>
+
+.text
+
+// abi_test_trampoline loads callee-saved registers from |state|, calls |func|
+// with |argv|, then saves the callee-saved registers into |state|. It returns
+// the result of |func|. The |unwind| argument is unused.
+// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state,
+// const uint64_t *argv, size_t argc,
+// uint64_t unwind);
+.type abi_test_trampoline, %function
+.globl abi_test_trampoline
+.hidden abi_test_trampoline
+.align 4
+abi_test_trampoline:
+.Labi_test_trampoline_begin:
+ AARCH64_SIGN_LINK_REGISTER
+ // Stack layout (low to high addresses)
+ // x29,x30 (16 bytes)
+ // d8-d15 (64 bytes)
+ // x19-x28 (80 bytes)
+ // x1 (8 bytes)
+ // padding (8 bytes)
+ stp x29, x30, [sp, #-176]!
+ mov x29, sp
+
+ // Saved callee-saved registers and |state|.
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x19, x20, [sp, #80]
+ stp x21, x22, [sp, #96]
+ stp x23, x24, [sp, #112]
+ stp x25, x26, [sp, #128]
+ stp x27, x28, [sp, #144]
+ str x1, [sp, #160]
+
+ // Load registers from |state|, with the exception of x29. x29 is the
+ // frame pointer and also callee-saved, but AAPCS64 allows platforms to
+ // mandate that x29 always point to a frame. iOS64 does so, which means
+ // we cannot fill x29 with entropy without violating ABI rules
+ // ourselves. x29 is tested separately below.
+ ldp d8, d9, [x1], #16
+ ldp d10, d11, [x1], #16
+ ldp d12, d13, [x1], #16
+ ldp d14, d15, [x1], #16
+ ldp x19, x20, [x1], #16
+ ldp x21, x22, [x1], #16
+ ldp x23, x24, [x1], #16
+ ldp x25, x26, [x1], #16
+ ldp x27, x28, [x1], #16
+
+ // Move parameters into temporary registers.
+ mov x9, x0
+ mov x10, x2
+ mov x11, x3
+
+ // Load parameters into registers.
+ cbz x11, .Largs_done
+ ldr x0, [x10], #8
+ subs x11, x11, #1
+ b.eq .Largs_done
+ ldr x1, [x10], #8
+ subs x11, x11, #1
+ b.eq .Largs_done
+ ldr x2, [x10], #8
+ subs x11, x11, #1
+ b.eq .Largs_done
+ ldr x3, [x10], #8
+ subs x11, x11, #1
+ b.eq .Largs_done
+ ldr x4, [x10], #8
+ subs x11, x11, #1
+ b.eq .Largs_done
+ ldr x5, [x10], #8
+ subs x11, x11, #1
+ b.eq .Largs_done
+ ldr x6, [x10], #8
+ subs x11, x11, #1
+ b.eq .Largs_done
+ ldr x7, [x10], #8
+
+.Largs_done:
+ blr x9
+
+ // Reload |state| and store registers.
+ ldr x1, [sp, #160]
+ stp d8, d9, [x1], #16
+ stp d10, d11, [x1], #16
+ stp d12, d13, [x1], #16
+ stp d14, d15, [x1], #16
+ stp x19, x20, [x1], #16
+ stp x21, x22, [x1], #16
+ stp x23, x24, [x1], #16
+ stp x25, x26, [x1], #16
+ stp x27, x28, [x1], #16
+
+ // |func| is required to preserve x29, the frame pointer. We cannot load
+ // random values into x29 (see comment above), so compare it against the
+ // expected value and zero the field of |state| if corrupted.
+ mov x9, sp
+ cmp x29, x9
+ b.eq .Lx29_ok
+ str xzr, [x1]
+
+.Lx29_ok:
+ // Restore callee-saved registers.
+ ldp d8, d9, [sp, #16]
+ ldp d10, d11, [sp, #32]
+ ldp d12, d13, [sp, #48]
+ ldp d14, d15, [sp, #64]
+ ldp x19, x20, [sp, #80]
+ ldp x21, x22, [sp, #96]
+ ldp x23, x24, [sp, #112]
+ ldp x25, x26, [sp, #128]
+ ldp x27, x28, [sp, #144]
+
+ ldp x29, x30, [sp], #176
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size abi_test_trampoline,.-abi_test_trampoline
+.type abi_test_clobber_x0, %function
+.globl abi_test_clobber_x0
+.hidden abi_test_clobber_x0
+.align 4
+abi_test_clobber_x0:
+ AARCH64_VALID_CALL_TARGET
+ mov x0, xzr
+ ret
+.size abi_test_clobber_x0,.-abi_test_clobber_x0
+.type abi_test_clobber_x1, %function
+.globl abi_test_clobber_x1
+.hidden abi_test_clobber_x1
+.align 4
+abi_test_clobber_x1:
+ AARCH64_VALID_CALL_TARGET
+ mov x1, xzr
+ ret
+.size abi_test_clobber_x1,.-abi_test_clobber_x1
+.type abi_test_clobber_x2, %function
+.globl abi_test_clobber_x2
+.hidden abi_test_clobber_x2
+.align 4
+abi_test_clobber_x2:
+ AARCH64_VALID_CALL_TARGET
+ mov x2, xzr
+ ret
+.size abi_test_clobber_x2,.-abi_test_clobber_x2
+.type abi_test_clobber_x3, %function
+.globl abi_test_clobber_x3
+.hidden abi_test_clobber_x3
+.align 4
+abi_test_clobber_x3:
+ AARCH64_VALID_CALL_TARGET
+ mov x3, xzr
+ ret
+.size abi_test_clobber_x3,.-abi_test_clobber_x3
+.type abi_test_clobber_x4, %function
+.globl abi_test_clobber_x4
+.hidden abi_test_clobber_x4
+.align 4
+abi_test_clobber_x4:
+ AARCH64_VALID_CALL_TARGET
+ mov x4, xzr
+ ret
+.size abi_test_clobber_x4,.-abi_test_clobber_x4
+.type abi_test_clobber_x5, %function
+.globl abi_test_clobber_x5
+.hidden abi_test_clobber_x5
+.align 4
+abi_test_clobber_x5:
+ AARCH64_VALID_CALL_TARGET
+ mov x5, xzr
+ ret
+.size abi_test_clobber_x5,.-abi_test_clobber_x5
+.type abi_test_clobber_x6, %function
+.globl abi_test_clobber_x6
+.hidden abi_test_clobber_x6
+.align 4
+abi_test_clobber_x6:
+ AARCH64_VALID_CALL_TARGET
+ mov x6, xzr
+ ret
+.size abi_test_clobber_x6,.-abi_test_clobber_x6
+.type abi_test_clobber_x7, %function
+.globl abi_test_clobber_x7
+.hidden abi_test_clobber_x7
+.align 4
+abi_test_clobber_x7:
+ AARCH64_VALID_CALL_TARGET
+ mov x7, xzr
+ ret
+.size abi_test_clobber_x7,.-abi_test_clobber_x7
+.type abi_test_clobber_x8, %function
+.globl abi_test_clobber_x8
+.hidden abi_test_clobber_x8
+.align 4
+abi_test_clobber_x8:
+ AARCH64_VALID_CALL_TARGET
+ mov x8, xzr
+ ret
+.size abi_test_clobber_x8,.-abi_test_clobber_x8
+.type abi_test_clobber_x9, %function
+.globl abi_test_clobber_x9
+.hidden abi_test_clobber_x9
+.align 4
+abi_test_clobber_x9:
+ AARCH64_VALID_CALL_TARGET
+ mov x9, xzr
+ ret
+.size abi_test_clobber_x9,.-abi_test_clobber_x9
+.type abi_test_clobber_x10, %function
+.globl abi_test_clobber_x10
+.hidden abi_test_clobber_x10
+.align 4
+abi_test_clobber_x10:
+ AARCH64_VALID_CALL_TARGET
+ mov x10, xzr
+ ret
+.size abi_test_clobber_x10,.-abi_test_clobber_x10
+.type abi_test_clobber_x11, %function
+.globl abi_test_clobber_x11
+.hidden abi_test_clobber_x11
+.align 4
+abi_test_clobber_x11:
+ AARCH64_VALID_CALL_TARGET
+ mov x11, xzr
+ ret
+.size abi_test_clobber_x11,.-abi_test_clobber_x11
+.type abi_test_clobber_x12, %function
+.globl abi_test_clobber_x12
+.hidden abi_test_clobber_x12
+.align 4
+abi_test_clobber_x12:
+ AARCH64_VALID_CALL_TARGET
+ mov x12, xzr
+ ret
+.size abi_test_clobber_x12,.-abi_test_clobber_x12
+.type abi_test_clobber_x13, %function
+.globl abi_test_clobber_x13
+.hidden abi_test_clobber_x13
+.align 4
+abi_test_clobber_x13:
+ AARCH64_VALID_CALL_TARGET
+ mov x13, xzr
+ ret
+.size abi_test_clobber_x13,.-abi_test_clobber_x13
+.type abi_test_clobber_x14, %function
+.globl abi_test_clobber_x14
+.hidden abi_test_clobber_x14
+.align 4
+abi_test_clobber_x14:
+ AARCH64_VALID_CALL_TARGET
+ mov x14, xzr
+ ret
+.size abi_test_clobber_x14,.-abi_test_clobber_x14
+.type abi_test_clobber_x15, %function
+.globl abi_test_clobber_x15
+.hidden abi_test_clobber_x15
+.align 4
+abi_test_clobber_x15:
+ AARCH64_VALID_CALL_TARGET
+ mov x15, xzr
+ ret
+.size abi_test_clobber_x15,.-abi_test_clobber_x15
+.type abi_test_clobber_x16, %function
+.globl abi_test_clobber_x16
+.hidden abi_test_clobber_x16
+.align 4
+abi_test_clobber_x16:
+ AARCH64_VALID_CALL_TARGET
+ mov x16, xzr
+ ret
+.size abi_test_clobber_x16,.-abi_test_clobber_x16
+.type abi_test_clobber_x17, %function
+.globl abi_test_clobber_x17
+.hidden abi_test_clobber_x17
+.align 4
+abi_test_clobber_x17:
+ AARCH64_VALID_CALL_TARGET
+ mov x17, xzr
+ ret
+.size abi_test_clobber_x17,.-abi_test_clobber_x17
+.type abi_test_clobber_x19, %function
+.globl abi_test_clobber_x19
+.hidden abi_test_clobber_x19
+.align 4
+abi_test_clobber_x19:
+ AARCH64_VALID_CALL_TARGET
+ mov x19, xzr
+ ret
+.size abi_test_clobber_x19,.-abi_test_clobber_x19
+.type abi_test_clobber_x20, %function
+.globl abi_test_clobber_x20
+.hidden abi_test_clobber_x20
+.align 4
+abi_test_clobber_x20:
+ AARCH64_VALID_CALL_TARGET
+ mov x20, xzr
+ ret
+.size abi_test_clobber_x20,.-abi_test_clobber_x20
+.type abi_test_clobber_x21, %function
+.globl abi_test_clobber_x21
+.hidden abi_test_clobber_x21
+.align 4
+abi_test_clobber_x21:
+ AARCH64_VALID_CALL_TARGET
+ mov x21, xzr
+ ret
+.size abi_test_clobber_x21,.-abi_test_clobber_x21
+.type abi_test_clobber_x22, %function
+.globl abi_test_clobber_x22
+.hidden abi_test_clobber_x22
+.align 4
+abi_test_clobber_x22:
+ AARCH64_VALID_CALL_TARGET
+ mov x22, xzr
+ ret
+.size abi_test_clobber_x22,.-abi_test_clobber_x22
+.type abi_test_clobber_x23, %function
+.globl abi_test_clobber_x23
+.hidden abi_test_clobber_x23
+.align 4
+abi_test_clobber_x23:
+ AARCH64_VALID_CALL_TARGET
+ mov x23, xzr
+ ret
+.size abi_test_clobber_x23,.-abi_test_clobber_x23
+.type abi_test_clobber_x24, %function
+.globl abi_test_clobber_x24
+.hidden abi_test_clobber_x24
+.align 4
+abi_test_clobber_x24:
+ AARCH64_VALID_CALL_TARGET
+ mov x24, xzr
+ ret
+.size abi_test_clobber_x24,.-abi_test_clobber_x24
+.type abi_test_clobber_x25, %function
+.globl abi_test_clobber_x25
+.hidden abi_test_clobber_x25
+.align 4
+abi_test_clobber_x25:
+ AARCH64_VALID_CALL_TARGET
+ mov x25, xzr
+ ret
+.size abi_test_clobber_x25,.-abi_test_clobber_x25
+.type abi_test_clobber_x26, %function
+.globl abi_test_clobber_x26
+.hidden abi_test_clobber_x26
+.align 4
+abi_test_clobber_x26:
+ AARCH64_VALID_CALL_TARGET
+ mov x26, xzr
+ ret
+.size abi_test_clobber_x26,.-abi_test_clobber_x26
+.type abi_test_clobber_x27, %function
+.globl abi_test_clobber_x27
+.hidden abi_test_clobber_x27
+.align 4
+abi_test_clobber_x27:
+ AARCH64_VALID_CALL_TARGET
+ mov x27, xzr
+ ret
+.size abi_test_clobber_x27,.-abi_test_clobber_x27
+.type abi_test_clobber_x28, %function
+.globl abi_test_clobber_x28
+.hidden abi_test_clobber_x28
+.align 4
+abi_test_clobber_x28:
+ AARCH64_VALID_CALL_TARGET
+ mov x28, xzr
+ ret
+.size abi_test_clobber_x28,.-abi_test_clobber_x28
+.type abi_test_clobber_x29, %function
+.globl abi_test_clobber_x29
+.hidden abi_test_clobber_x29
+.align 4
+abi_test_clobber_x29:
+ AARCH64_VALID_CALL_TARGET
+ mov x29, xzr
+ ret
+.size abi_test_clobber_x29,.-abi_test_clobber_x29
+.type abi_test_clobber_d0, %function
+.globl abi_test_clobber_d0
+.hidden abi_test_clobber_d0
+.align 4
+abi_test_clobber_d0:
+ AARCH64_VALID_CALL_TARGET
+ fmov d0, xzr
+ ret
+.size abi_test_clobber_d0,.-abi_test_clobber_d0
+.type abi_test_clobber_d1, %function
+.globl abi_test_clobber_d1
+.hidden abi_test_clobber_d1
+.align 4
+abi_test_clobber_d1:
+ AARCH64_VALID_CALL_TARGET
+ fmov d1, xzr
+ ret
+.size abi_test_clobber_d1,.-abi_test_clobber_d1
+.type abi_test_clobber_d2, %function
+.globl abi_test_clobber_d2
+.hidden abi_test_clobber_d2
+.align 4
+abi_test_clobber_d2:
+ AARCH64_VALID_CALL_TARGET
+ fmov d2, xzr
+ ret
+.size abi_test_clobber_d2,.-abi_test_clobber_d2
+.type abi_test_clobber_d3, %function
+.globl abi_test_clobber_d3
+.hidden abi_test_clobber_d3
+.align 4
+abi_test_clobber_d3:
+ AARCH64_VALID_CALL_TARGET
+ fmov d3, xzr
+ ret
+.size abi_test_clobber_d3,.-abi_test_clobber_d3
+.type abi_test_clobber_d4, %function
+.globl abi_test_clobber_d4
+.hidden abi_test_clobber_d4
+.align 4
+abi_test_clobber_d4:
+ AARCH64_VALID_CALL_TARGET
+ fmov d4, xzr
+ ret
+.size abi_test_clobber_d4,.-abi_test_clobber_d4
+.type abi_test_clobber_d5, %function
+.globl abi_test_clobber_d5
+.hidden abi_test_clobber_d5
+.align 4
+abi_test_clobber_d5:
+ AARCH64_VALID_CALL_TARGET
+ fmov d5, xzr
+ ret
+.size abi_test_clobber_d5,.-abi_test_clobber_d5
+.type abi_test_clobber_d6, %function
+.globl abi_test_clobber_d6
+.hidden abi_test_clobber_d6
+.align 4
+abi_test_clobber_d6:
+ AARCH64_VALID_CALL_TARGET
+ fmov d6, xzr
+ ret
+.size abi_test_clobber_d6,.-abi_test_clobber_d6
+.type abi_test_clobber_d7, %function
+.globl abi_test_clobber_d7
+.hidden abi_test_clobber_d7
+.align 4
+abi_test_clobber_d7:
+ AARCH64_VALID_CALL_TARGET
+ fmov d7, xzr
+ ret
+.size abi_test_clobber_d7,.-abi_test_clobber_d7
+.type abi_test_clobber_d8, %function
+.globl abi_test_clobber_d8
+.hidden abi_test_clobber_d8
+.align 4
+abi_test_clobber_d8:
+ AARCH64_VALID_CALL_TARGET
+ fmov d8, xzr
+ ret
+.size abi_test_clobber_d8,.-abi_test_clobber_d8
+.type abi_test_clobber_d9, %function
+.globl abi_test_clobber_d9
+.hidden abi_test_clobber_d9
+.align 4
+abi_test_clobber_d9:
+ AARCH64_VALID_CALL_TARGET
+ fmov d9, xzr
+ ret
+.size abi_test_clobber_d9,.-abi_test_clobber_d9
+.type abi_test_clobber_d10, %function
+.globl abi_test_clobber_d10
+.hidden abi_test_clobber_d10
+.align 4
+abi_test_clobber_d10:
+ AARCH64_VALID_CALL_TARGET
+ fmov d10, xzr
+ ret
+.size abi_test_clobber_d10,.-abi_test_clobber_d10
+.type abi_test_clobber_d11, %function
+.globl abi_test_clobber_d11
+.hidden abi_test_clobber_d11
+.align 4
+abi_test_clobber_d11:
+ AARCH64_VALID_CALL_TARGET
+ fmov d11, xzr
+ ret
+.size abi_test_clobber_d11,.-abi_test_clobber_d11
+.type abi_test_clobber_d12, %function
+.globl abi_test_clobber_d12
+.hidden abi_test_clobber_d12
+.align 4
+abi_test_clobber_d12:
+ AARCH64_VALID_CALL_TARGET
+ fmov d12, xzr
+ ret
+.size abi_test_clobber_d12,.-abi_test_clobber_d12
+.type abi_test_clobber_d13, %function
+.globl abi_test_clobber_d13
+.hidden abi_test_clobber_d13
+.align 4
+abi_test_clobber_d13:
+ AARCH64_VALID_CALL_TARGET
+ fmov d13, xzr
+ ret
+.size abi_test_clobber_d13,.-abi_test_clobber_d13
+.type abi_test_clobber_d14, %function
+.globl abi_test_clobber_d14
+.hidden abi_test_clobber_d14
+.align 4
+abi_test_clobber_d14:
+ AARCH64_VALID_CALL_TARGET
+ fmov d14, xzr
+ ret
+.size abi_test_clobber_d14,.-abi_test_clobber_d14
+.type abi_test_clobber_d15, %function
+.globl abi_test_clobber_d15
+.hidden abi_test_clobber_d15
+.align 4
+abi_test_clobber_d15:
+ AARCH64_VALID_CALL_TARGET
+ fmov d15, xzr
+ ret
+.size abi_test_clobber_d15,.-abi_test_clobber_d15
+.type abi_test_clobber_d16, %function
+.globl abi_test_clobber_d16
+.hidden abi_test_clobber_d16
+.align 4
+abi_test_clobber_d16:
+ AARCH64_VALID_CALL_TARGET
+ fmov d16, xzr
+ ret
+.size abi_test_clobber_d16,.-abi_test_clobber_d16
+.type abi_test_clobber_d17, %function
+.globl abi_test_clobber_d17
+.hidden abi_test_clobber_d17
+.align 4
+abi_test_clobber_d17:
+ AARCH64_VALID_CALL_TARGET
+ fmov d17, xzr
+ ret
+.size abi_test_clobber_d17,.-abi_test_clobber_d17
+.type abi_test_clobber_d18, %function
+.globl abi_test_clobber_d18
+.hidden abi_test_clobber_d18
+.align 4
+abi_test_clobber_d18:
+ AARCH64_VALID_CALL_TARGET
+ fmov d18, xzr
+ ret
+.size abi_test_clobber_d18,.-abi_test_clobber_d18
+.type abi_test_clobber_d19, %function
+.globl abi_test_clobber_d19
+.hidden abi_test_clobber_d19
+.align 4
+abi_test_clobber_d19:
+ AARCH64_VALID_CALL_TARGET
+ fmov d19, xzr
+ ret
+.size abi_test_clobber_d19,.-abi_test_clobber_d19
+.type abi_test_clobber_d20, %function
+.globl abi_test_clobber_d20
+.hidden abi_test_clobber_d20
+.align 4
+abi_test_clobber_d20:
+ AARCH64_VALID_CALL_TARGET
+ fmov d20, xzr
+ ret
+.size abi_test_clobber_d20,.-abi_test_clobber_d20
+.type abi_test_clobber_d21, %function
+.globl abi_test_clobber_d21
+.hidden abi_test_clobber_d21
+.align 4
+abi_test_clobber_d21:
+ AARCH64_VALID_CALL_TARGET
+ fmov d21, xzr
+ ret
+.size abi_test_clobber_d21,.-abi_test_clobber_d21
+.type abi_test_clobber_d22, %function
+.globl abi_test_clobber_d22
+.hidden abi_test_clobber_d22
+.align 4
+abi_test_clobber_d22:
+ AARCH64_VALID_CALL_TARGET
+ fmov d22, xzr
+ ret
+.size abi_test_clobber_d22,.-abi_test_clobber_d22
+.type abi_test_clobber_d23, %function
+.globl abi_test_clobber_d23
+.hidden abi_test_clobber_d23
+.align 4
+abi_test_clobber_d23:
+ AARCH64_VALID_CALL_TARGET
+ fmov d23, xzr
+ ret
+.size abi_test_clobber_d23,.-abi_test_clobber_d23
+.type abi_test_clobber_d24, %function
+.globl abi_test_clobber_d24
+.hidden abi_test_clobber_d24
+.align 4
+abi_test_clobber_d24:
+ AARCH64_VALID_CALL_TARGET
+ fmov d24, xzr
+ ret
+.size abi_test_clobber_d24,.-abi_test_clobber_d24
+.type abi_test_clobber_d25, %function
+.globl abi_test_clobber_d25
+.hidden abi_test_clobber_d25
+.align 4
+abi_test_clobber_d25:
+ AARCH64_VALID_CALL_TARGET
+ fmov d25, xzr
+ ret
+.size abi_test_clobber_d25,.-abi_test_clobber_d25
+.type abi_test_clobber_d26, %function
+.globl abi_test_clobber_d26
+.hidden abi_test_clobber_d26
+.align 4
+abi_test_clobber_d26:
+ AARCH64_VALID_CALL_TARGET
+ fmov d26, xzr
+ ret
+.size abi_test_clobber_d26,.-abi_test_clobber_d26
+.type abi_test_clobber_d27, %function
+.globl abi_test_clobber_d27
+.hidden abi_test_clobber_d27
+.align 4
+abi_test_clobber_d27:
+ AARCH64_VALID_CALL_TARGET
+ fmov d27, xzr
+ ret
+.size abi_test_clobber_d27,.-abi_test_clobber_d27
+.type abi_test_clobber_d28, %function
+.globl abi_test_clobber_d28
+.hidden abi_test_clobber_d28
+.align 4
+abi_test_clobber_d28:
+ AARCH64_VALID_CALL_TARGET
+ fmov d28, xzr
+ ret
+.size abi_test_clobber_d28,.-abi_test_clobber_d28
+.type abi_test_clobber_d29, %function
+.globl abi_test_clobber_d29
+.hidden abi_test_clobber_d29
+.align 4
+abi_test_clobber_d29:
+ AARCH64_VALID_CALL_TARGET
+ fmov d29, xzr
+ ret
+.size abi_test_clobber_d29,.-abi_test_clobber_d29
+.type abi_test_clobber_d30, %function
+.globl abi_test_clobber_d30
+.hidden abi_test_clobber_d30
+.align 4
+abi_test_clobber_d30:
+ AARCH64_VALID_CALL_TARGET
+ fmov d30, xzr
+ ret
+.size abi_test_clobber_d30,.-abi_test_clobber_d30
+.type abi_test_clobber_d31, %function
+.globl abi_test_clobber_d31
+.hidden abi_test_clobber_d31
+.align 4
+abi_test_clobber_d31:
+ AARCH64_VALID_CALL_TARGET
+ fmov d31, xzr
+ ret
+.size abi_test_clobber_d31,.-abi_test_clobber_d31
+.type abi_test_clobber_v8_upper, %function
+.globl abi_test_clobber_v8_upper
+.hidden abi_test_clobber_v8_upper
+.align 4
+abi_test_clobber_v8_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v8.d[1], xzr
+ ret
+.size abi_test_clobber_v8_upper,.-abi_test_clobber_v8_upper
+.type abi_test_clobber_v9_upper, %function
+.globl abi_test_clobber_v9_upper
+.hidden abi_test_clobber_v9_upper
+.align 4
+abi_test_clobber_v9_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v9.d[1], xzr
+ ret
+.size abi_test_clobber_v9_upper,.-abi_test_clobber_v9_upper
+.type abi_test_clobber_v10_upper, %function
+.globl abi_test_clobber_v10_upper
+.hidden abi_test_clobber_v10_upper
+.align 4
+abi_test_clobber_v10_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v10.d[1], xzr
+ ret
+.size abi_test_clobber_v10_upper,.-abi_test_clobber_v10_upper
+.type abi_test_clobber_v11_upper, %function
+.globl abi_test_clobber_v11_upper
+.hidden abi_test_clobber_v11_upper
+.align 4
+abi_test_clobber_v11_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v11.d[1], xzr
+ ret
+.size abi_test_clobber_v11_upper,.-abi_test_clobber_v11_upper
+.type abi_test_clobber_v12_upper, %function
+.globl abi_test_clobber_v12_upper
+.hidden abi_test_clobber_v12_upper
+.align 4
+abi_test_clobber_v12_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v12.d[1], xzr
+ ret
+.size abi_test_clobber_v12_upper,.-abi_test_clobber_v12_upper
+.type abi_test_clobber_v13_upper, %function
+.globl abi_test_clobber_v13_upper
+.hidden abi_test_clobber_v13_upper
+.align 4
+abi_test_clobber_v13_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v13.d[1], xzr
+ ret
+.size abi_test_clobber_v13_upper,.-abi_test_clobber_v13_upper
+.type abi_test_clobber_v14_upper, %function
+.globl abi_test_clobber_v14_upper
+.hidden abi_test_clobber_v14_upper
+.align 4
+abi_test_clobber_v14_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v14.d[1], xzr
+ ret
+.size abi_test_clobber_v14_upper,.-abi_test_clobber_v14_upper
+.type abi_test_clobber_v15_upper, %function
+.globl abi_test_clobber_v15_upper
+.hidden abi_test_clobber_v15_upper
+.align 4
+abi_test_clobber_v15_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v15.d[1], xzr
+ ret
+.size abi_test_clobber_v15_upper,.-abi_test_clobber_v15_upper
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
diff --git a/gen/test_support/trampoline-armv8-win.S b/gen/test_support/trampoline-armv8-win.S
new file mode 100644
index 0000000..14773e3
--- /dev/null
+++ b/gen/test_support/trampoline-armv8-win.S
@@ -0,0 +1,750 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
+#include <openssl/arm_arch.h>
+
+.text
+
+// abi_test_trampoline loads callee-saved registers from |state|, calls |func|
+// with |argv|, then saves the callee-saved registers into |state|. It returns
+// the result of |func|. The |unwind| argument is unused.
+// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state,
+// const uint64_t *argv, size_t argc,
+// uint64_t unwind);
+
+.globl abi_test_trampoline
+
+.align 4
+abi_test_trampoline:
+Labi_test_trampoline_begin:
+ AARCH64_SIGN_LINK_REGISTER
+ // Stack layout (low to high addresses)
+ // x29,x30 (16 bytes)
+ // d8-d15 (64 bytes)
+ // x19-x28 (80 bytes)
+ // x1 (8 bytes)
+ // padding (8 bytes)
+ stp x29, x30, [sp, #-176]!
+ mov x29, sp
+
+ // Saved callee-saved registers and |state|.
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x19, x20, [sp, #80]
+ stp x21, x22, [sp, #96]
+ stp x23, x24, [sp, #112]
+ stp x25, x26, [sp, #128]
+ stp x27, x28, [sp, #144]
+ str x1, [sp, #160]
+
+ // Load registers from |state|, with the exception of x29. x29 is the
+ // frame pointer and also callee-saved, but AAPCS64 allows platforms to
+ // mandate that x29 always point to a frame. iOS64 does so, which means
+ // we cannot fill x29 with entropy without violating ABI rules
+ // ourselves. x29 is tested separately below.
+ ldp d8, d9, [x1], #16
+ ldp d10, d11, [x1], #16
+ ldp d12, d13, [x1], #16
+ ldp d14, d15, [x1], #16
+ ldp x19, x20, [x1], #16
+ ldp x21, x22, [x1], #16
+ ldp x23, x24, [x1], #16
+ ldp x25, x26, [x1], #16
+ ldp x27, x28, [x1], #16
+
+ // Move parameters into temporary registers.
+ mov x9, x0
+ mov x10, x2
+ mov x11, x3
+
+ // Load parameters into registers.
+ cbz x11, Largs_done
+ ldr x0, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x1, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x2, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x3, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x4, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x5, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x6, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x7, [x10], #8
+
+Largs_done:
+ blr x9
+
+ // Reload |state| and store registers.
+ ldr x1, [sp, #160]
+ stp d8, d9, [x1], #16
+ stp d10, d11, [x1], #16
+ stp d12, d13, [x1], #16
+ stp d14, d15, [x1], #16
+ stp x19, x20, [x1], #16
+ stp x21, x22, [x1], #16
+ stp x23, x24, [x1], #16
+ stp x25, x26, [x1], #16
+ stp x27, x28, [x1], #16
+
+ // |func| is required to preserve x29, the frame pointer. We cannot load
+ // random values into x29 (see comment above), so compare it against the
+ // expected value and zero the field of |state| if corrupted.
+ mov x9, sp
+ cmp x29, x9
+ b.eq Lx29_ok
+ str xzr, [x1]
+
+Lx29_ok:
+ // Restore callee-saved registers.
+ ldp d8, d9, [sp, #16]
+ ldp d10, d11, [sp, #32]
+ ldp d12, d13, [sp, #48]
+ ldp d14, d15, [sp, #64]
+ ldp x19, x20, [sp, #80]
+ ldp x21, x22, [sp, #96]
+ ldp x23, x24, [sp, #112]
+ ldp x25, x26, [sp, #128]
+ ldp x27, x28, [sp, #144]
+
+ ldp x29, x30, [sp], #176
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.globl abi_test_clobber_x0
+
+.align 4
+abi_test_clobber_x0:
+ AARCH64_VALID_CALL_TARGET
+ mov x0, xzr
+ ret
+
+
+.globl abi_test_clobber_x1
+
+.align 4
+abi_test_clobber_x1:
+ AARCH64_VALID_CALL_TARGET
+ mov x1, xzr
+ ret
+
+
+.globl abi_test_clobber_x2
+
+.align 4
+abi_test_clobber_x2:
+ AARCH64_VALID_CALL_TARGET
+ mov x2, xzr
+ ret
+
+
+.globl abi_test_clobber_x3
+
+.align 4
+abi_test_clobber_x3:
+ AARCH64_VALID_CALL_TARGET
+ mov x3, xzr
+ ret
+
+
+.globl abi_test_clobber_x4
+
+.align 4
+abi_test_clobber_x4:
+ AARCH64_VALID_CALL_TARGET
+ mov x4, xzr
+ ret
+
+
+.globl abi_test_clobber_x5
+
+.align 4
+abi_test_clobber_x5:
+ AARCH64_VALID_CALL_TARGET
+ mov x5, xzr
+ ret
+
+
+.globl abi_test_clobber_x6
+
+.align 4
+abi_test_clobber_x6:
+ AARCH64_VALID_CALL_TARGET
+ mov x6, xzr
+ ret
+
+
+.globl abi_test_clobber_x7
+
+.align 4
+abi_test_clobber_x7:
+ AARCH64_VALID_CALL_TARGET
+ mov x7, xzr
+ ret
+
+
+.globl abi_test_clobber_x8
+
+.align 4
+abi_test_clobber_x8:
+ AARCH64_VALID_CALL_TARGET
+ mov x8, xzr
+ ret
+
+
+.globl abi_test_clobber_x9
+
+.align 4
+abi_test_clobber_x9:
+ AARCH64_VALID_CALL_TARGET
+ mov x9, xzr
+ ret
+
+
+.globl abi_test_clobber_x10
+
+.align 4
+abi_test_clobber_x10:
+ AARCH64_VALID_CALL_TARGET
+ mov x10, xzr
+ ret
+
+
+.globl abi_test_clobber_x11
+
+.align 4
+abi_test_clobber_x11:
+ AARCH64_VALID_CALL_TARGET
+ mov x11, xzr
+ ret
+
+
+.globl abi_test_clobber_x12
+
+.align 4
+abi_test_clobber_x12:
+ AARCH64_VALID_CALL_TARGET
+ mov x12, xzr
+ ret
+
+
+.globl abi_test_clobber_x13
+
+.align 4
+abi_test_clobber_x13:
+ AARCH64_VALID_CALL_TARGET
+ mov x13, xzr
+ ret
+
+
+.globl abi_test_clobber_x14
+
+.align 4
+abi_test_clobber_x14:
+ AARCH64_VALID_CALL_TARGET
+ mov x14, xzr
+ ret
+
+
+.globl abi_test_clobber_x15
+
+.align 4
+abi_test_clobber_x15:
+ AARCH64_VALID_CALL_TARGET
+ mov x15, xzr
+ ret
+
+
+.globl abi_test_clobber_x16
+
+.align 4
+abi_test_clobber_x16:
+ AARCH64_VALID_CALL_TARGET
+ mov x16, xzr
+ ret
+
+
+.globl abi_test_clobber_x17
+
+.align 4
+abi_test_clobber_x17:
+ AARCH64_VALID_CALL_TARGET
+ mov x17, xzr
+ ret
+
+
+.globl abi_test_clobber_x19
+
+.align 4
+abi_test_clobber_x19:
+ AARCH64_VALID_CALL_TARGET
+ mov x19, xzr
+ ret
+
+
+.globl abi_test_clobber_x20
+
+.align 4
+abi_test_clobber_x20:
+ AARCH64_VALID_CALL_TARGET
+ mov x20, xzr
+ ret
+
+
+.globl abi_test_clobber_x21
+
+.align 4
+abi_test_clobber_x21:
+ AARCH64_VALID_CALL_TARGET
+ mov x21, xzr
+ ret
+
+
+.globl abi_test_clobber_x22
+
+.align 4
+abi_test_clobber_x22:
+ AARCH64_VALID_CALL_TARGET
+ mov x22, xzr
+ ret
+
+
+.globl abi_test_clobber_x23
+
+.align 4
+abi_test_clobber_x23:
+ AARCH64_VALID_CALL_TARGET
+ mov x23, xzr
+ ret
+
+
+.globl abi_test_clobber_x24
+
+.align 4
+abi_test_clobber_x24:
+ AARCH64_VALID_CALL_TARGET
+ mov x24, xzr
+ ret
+
+
+.globl abi_test_clobber_x25
+
+.align 4
+abi_test_clobber_x25:
+ AARCH64_VALID_CALL_TARGET
+ mov x25, xzr
+ ret
+
+
+.globl abi_test_clobber_x26
+
+.align 4
+abi_test_clobber_x26:
+ AARCH64_VALID_CALL_TARGET
+ mov x26, xzr
+ ret
+
+
+.globl abi_test_clobber_x27
+
+.align 4
+abi_test_clobber_x27:
+ AARCH64_VALID_CALL_TARGET
+ mov x27, xzr
+ ret
+
+
+.globl abi_test_clobber_x28
+
+.align 4
+abi_test_clobber_x28:
+ AARCH64_VALID_CALL_TARGET
+ mov x28, xzr
+ ret
+
+
+.globl abi_test_clobber_x29
+
+.align 4
+abi_test_clobber_x29:
+ AARCH64_VALID_CALL_TARGET
+ mov x29, xzr
+ ret
+
+
+.globl abi_test_clobber_d0
+
+.align 4
+abi_test_clobber_d0:
+ AARCH64_VALID_CALL_TARGET
+ fmov d0, xzr
+ ret
+
+
+.globl abi_test_clobber_d1
+
+.align 4
+abi_test_clobber_d1:
+ AARCH64_VALID_CALL_TARGET
+ fmov d1, xzr
+ ret
+
+
+.globl abi_test_clobber_d2
+
+.align 4
+abi_test_clobber_d2:
+ AARCH64_VALID_CALL_TARGET
+ fmov d2, xzr
+ ret
+
+
+.globl abi_test_clobber_d3
+
+.align 4
+abi_test_clobber_d3:
+ AARCH64_VALID_CALL_TARGET
+ fmov d3, xzr
+ ret
+
+
+.globl abi_test_clobber_d4
+
+.align 4
+abi_test_clobber_d4:
+ AARCH64_VALID_CALL_TARGET
+ fmov d4, xzr
+ ret
+
+
+.globl abi_test_clobber_d5
+
+.align 4
+abi_test_clobber_d5:
+ AARCH64_VALID_CALL_TARGET
+ fmov d5, xzr
+ ret
+
+
+.globl abi_test_clobber_d6
+
+.align 4
+abi_test_clobber_d6:
+ AARCH64_VALID_CALL_TARGET
+ fmov d6, xzr
+ ret
+
+
+.globl abi_test_clobber_d7
+
+.align 4
+abi_test_clobber_d7:
+ AARCH64_VALID_CALL_TARGET
+ fmov d7, xzr
+ ret
+
+
+.globl abi_test_clobber_d8
+
+.align 4
+abi_test_clobber_d8:
+ AARCH64_VALID_CALL_TARGET
+ fmov d8, xzr
+ ret
+
+
+.globl abi_test_clobber_d9
+
+.align 4
+abi_test_clobber_d9:
+ AARCH64_VALID_CALL_TARGET
+ fmov d9, xzr
+ ret
+
+
+.globl abi_test_clobber_d10
+
+.align 4
+abi_test_clobber_d10:
+ AARCH64_VALID_CALL_TARGET
+ fmov d10, xzr
+ ret
+
+
+.globl abi_test_clobber_d11
+
+.align 4
+abi_test_clobber_d11:
+ AARCH64_VALID_CALL_TARGET
+ fmov d11, xzr
+ ret
+
+
+.globl abi_test_clobber_d12
+
+.align 4
+abi_test_clobber_d12:
+ AARCH64_VALID_CALL_TARGET
+ fmov d12, xzr
+ ret
+
+
+.globl abi_test_clobber_d13
+
+.align 4
+abi_test_clobber_d13:
+ AARCH64_VALID_CALL_TARGET
+ fmov d13, xzr
+ ret
+
+
+.globl abi_test_clobber_d14
+
+.align 4
+abi_test_clobber_d14:
+ AARCH64_VALID_CALL_TARGET
+ fmov d14, xzr
+ ret
+
+
+.globl abi_test_clobber_d15
+
+.align 4
+abi_test_clobber_d15:
+ AARCH64_VALID_CALL_TARGET
+ fmov d15, xzr
+ ret
+
+
+.globl abi_test_clobber_d16
+
+.align 4
+abi_test_clobber_d16:
+ AARCH64_VALID_CALL_TARGET
+ fmov d16, xzr
+ ret
+
+
+.globl abi_test_clobber_d17
+
+.align 4
+abi_test_clobber_d17:
+ AARCH64_VALID_CALL_TARGET
+ fmov d17, xzr
+ ret
+
+
+.globl abi_test_clobber_d18
+
+.align 4
+abi_test_clobber_d18:
+ AARCH64_VALID_CALL_TARGET
+ fmov d18, xzr
+ ret
+
+
+.globl abi_test_clobber_d19
+
+.align 4
+abi_test_clobber_d19:
+ AARCH64_VALID_CALL_TARGET
+ fmov d19, xzr
+ ret
+
+
+.globl abi_test_clobber_d20
+
+.align 4
+abi_test_clobber_d20:
+ AARCH64_VALID_CALL_TARGET
+ fmov d20, xzr
+ ret
+
+
+.globl abi_test_clobber_d21
+
+.align 4
+abi_test_clobber_d21:
+ AARCH64_VALID_CALL_TARGET
+ fmov d21, xzr
+ ret
+
+
+.globl abi_test_clobber_d22
+
+.align 4
+abi_test_clobber_d22:
+ AARCH64_VALID_CALL_TARGET
+ fmov d22, xzr
+ ret
+
+
+.globl abi_test_clobber_d23
+
+.align 4
+abi_test_clobber_d23:
+ AARCH64_VALID_CALL_TARGET
+ fmov d23, xzr
+ ret
+
+
+.globl abi_test_clobber_d24
+
+.align 4
+abi_test_clobber_d24:
+ AARCH64_VALID_CALL_TARGET
+ fmov d24, xzr
+ ret
+
+
+.globl abi_test_clobber_d25
+
+.align 4
+abi_test_clobber_d25:
+ AARCH64_VALID_CALL_TARGET
+ fmov d25, xzr
+ ret
+
+
+.globl abi_test_clobber_d26
+
+.align 4
+abi_test_clobber_d26:
+ AARCH64_VALID_CALL_TARGET
+ fmov d26, xzr
+ ret
+
+
+.globl abi_test_clobber_d27
+
+.align 4
+abi_test_clobber_d27:
+ AARCH64_VALID_CALL_TARGET
+ fmov d27, xzr
+ ret
+
+
+.globl abi_test_clobber_d28
+
+.align 4
+abi_test_clobber_d28:
+ AARCH64_VALID_CALL_TARGET
+ fmov d28, xzr
+ ret
+
+
+.globl abi_test_clobber_d29
+
+.align 4
+abi_test_clobber_d29:
+ AARCH64_VALID_CALL_TARGET
+ fmov d29, xzr
+ ret
+
+
+.globl abi_test_clobber_d30
+
+.align 4
+abi_test_clobber_d30:
+ AARCH64_VALID_CALL_TARGET
+ fmov d30, xzr
+ ret
+
+
+.globl abi_test_clobber_d31
+
+.align 4
+abi_test_clobber_d31:
+ AARCH64_VALID_CALL_TARGET
+ fmov d31, xzr
+ ret
+
+
+.globl abi_test_clobber_v8_upper
+
+.align 4
+abi_test_clobber_v8_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v8.d[1], xzr
+ ret
+
+
+.globl abi_test_clobber_v9_upper
+
+.align 4
+abi_test_clobber_v9_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v9.d[1], xzr
+ ret
+
+
+.globl abi_test_clobber_v10_upper
+
+.align 4
+abi_test_clobber_v10_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v10.d[1], xzr
+ ret
+
+
+.globl abi_test_clobber_v11_upper
+
+.align 4
+abi_test_clobber_v11_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v11.d[1], xzr
+ ret
+
+
+.globl abi_test_clobber_v12_upper
+
+.align 4
+abi_test_clobber_v12_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v12.d[1], xzr
+ ret
+
+
+.globl abi_test_clobber_v13_upper
+
+.align 4
+abi_test_clobber_v13_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v13.d[1], xzr
+ ret
+
+
+.globl abi_test_clobber_v14_upper
+
+.align 4
+abi_test_clobber_v14_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v14.d[1], xzr
+ ret
+
+
+.globl abi_test_clobber_v15_upper
+
+.align 4
+abi_test_clobber_v15_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v15.d[1], xzr
+ ret
+
+#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
diff --git a/gen/test_support/trampoline-x86-apple.S b/gen/test_support/trampoline-x86-apple.S
new file mode 100644
index 0000000..4065b9a
--- /dev/null
+++ b/gen/test_support/trampoline-x86-apple.S
@@ -0,0 +1,168 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
+.text
+.globl _abi_test_trampoline
+.private_extern _abi_test_trampoline
+.align 4
+_abi_test_trampoline:
+L_abi_test_trampoline_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 24(%esp),%ecx
+ movl (%ecx),%esi
+ movl 4(%ecx),%edi
+ movl 8(%ecx),%ebx
+ movl 12(%ecx),%ebp
+ subl $44,%esp
+ movl 72(%esp),%eax
+ xorl %ecx,%ecx
+L000loop:
+ cmpl 76(%esp),%ecx
+ jae L001loop_done
+ movl (%eax,%ecx,4),%edx
+ movl %edx,(%esp,%ecx,4)
+ addl $1,%ecx
+ jmp L000loop
+L001loop_done:
+ call *64(%esp)
+ addl $44,%esp
+ movl 24(%esp),%ecx
+ movl %esi,(%ecx)
+ movl %edi,4(%ecx)
+ movl %ebx,8(%ecx)
+ movl %ebp,12(%ecx)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.globl _abi_test_get_and_clear_direction_flag
+.private_extern _abi_test_get_and_clear_direction_flag
+.align 4
+_abi_test_get_and_clear_direction_flag:
+L_abi_test_get_and_clear_direction_flag_begin:
+ pushfl
+ popl %eax
+ andl $1024,%eax
+ shrl $10,%eax
+ cld
+ ret
+.globl _abi_test_set_direction_flag
+.private_extern _abi_test_set_direction_flag
+.align 4
+_abi_test_set_direction_flag:
+L_abi_test_set_direction_flag_begin:
+ std
+ ret
+.globl _abi_test_clobber_eax
+.private_extern _abi_test_clobber_eax
+.align 4
+_abi_test_clobber_eax:
+L_abi_test_clobber_eax_begin:
+ xorl %eax,%eax
+ ret
+.globl _abi_test_clobber_ebx
+.private_extern _abi_test_clobber_ebx
+.align 4
+_abi_test_clobber_ebx:
+L_abi_test_clobber_ebx_begin:
+ xorl %ebx,%ebx
+ ret
+.globl _abi_test_clobber_ecx
+.private_extern _abi_test_clobber_ecx
+.align 4
+_abi_test_clobber_ecx:
+L_abi_test_clobber_ecx_begin:
+ xorl %ecx,%ecx
+ ret
+.globl _abi_test_clobber_edx
+.private_extern _abi_test_clobber_edx
+.align 4
+_abi_test_clobber_edx:
+L_abi_test_clobber_edx_begin:
+ xorl %edx,%edx
+ ret
+.globl _abi_test_clobber_edi
+.private_extern _abi_test_clobber_edi
+.align 4
+_abi_test_clobber_edi:
+L_abi_test_clobber_edi_begin:
+ xorl %edi,%edi
+ ret
+.globl _abi_test_clobber_esi
+.private_extern _abi_test_clobber_esi
+.align 4
+_abi_test_clobber_esi:
+L_abi_test_clobber_esi_begin:
+ xorl %esi,%esi
+ ret
+.globl _abi_test_clobber_ebp
+.private_extern _abi_test_clobber_ebp
+.align 4
+_abi_test_clobber_ebp:
+L_abi_test_clobber_ebp_begin:
+ xorl %ebp,%ebp
+ ret
+.globl _abi_test_clobber_xmm0
+.private_extern _abi_test_clobber_xmm0
+.align 4
+_abi_test_clobber_xmm0:
+L_abi_test_clobber_xmm0_begin:
+ pxor %xmm0,%xmm0
+ ret
+.globl _abi_test_clobber_xmm1
+.private_extern _abi_test_clobber_xmm1
+.align 4
+_abi_test_clobber_xmm1:
+L_abi_test_clobber_xmm1_begin:
+ pxor %xmm1,%xmm1
+ ret
+.globl _abi_test_clobber_xmm2
+.private_extern _abi_test_clobber_xmm2
+.align 4
+_abi_test_clobber_xmm2:
+L_abi_test_clobber_xmm2_begin:
+ pxor %xmm2,%xmm2
+ ret
+.globl _abi_test_clobber_xmm3
+.private_extern _abi_test_clobber_xmm3
+.align 4
+_abi_test_clobber_xmm3:
+L_abi_test_clobber_xmm3_begin:
+ pxor %xmm3,%xmm3
+ ret
+.globl _abi_test_clobber_xmm4
+.private_extern _abi_test_clobber_xmm4
+.align 4
+_abi_test_clobber_xmm4:
+L_abi_test_clobber_xmm4_begin:
+ pxor %xmm4,%xmm4
+ ret
+.globl _abi_test_clobber_xmm5
+.private_extern _abi_test_clobber_xmm5
+.align 4
+_abi_test_clobber_xmm5:
+L_abi_test_clobber_xmm5_begin:
+ pxor %xmm5,%xmm5
+ ret
+.globl _abi_test_clobber_xmm6
+.private_extern _abi_test_clobber_xmm6
+.align 4
+_abi_test_clobber_xmm6:
+L_abi_test_clobber_xmm6_begin:
+ pxor %xmm6,%xmm6
+ ret
+.globl _abi_test_clobber_xmm7
+.private_extern _abi_test_clobber_xmm7
+.align 4
+_abi_test_clobber_xmm7:
+L_abi_test_clobber_xmm7_begin:
+ pxor %xmm7,%xmm7
+ ret
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__)
diff --git a/gen/test_support/trampoline-x86-linux.S b/gen/test_support/trampoline-x86-linux.S
new file mode 100644
index 0000000..3452c63
--- /dev/null
+++ b/gen/test_support/trampoline-x86-linux.S
@@ -0,0 +1,204 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
+.text
+.globl abi_test_trampoline
+.hidden abi_test_trampoline
+.type abi_test_trampoline,@function
+.align 16
+abi_test_trampoline:
+.L_abi_test_trampoline_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 24(%esp),%ecx
+ movl (%ecx),%esi
+ movl 4(%ecx),%edi
+ movl 8(%ecx),%ebx
+ movl 12(%ecx),%ebp
+ subl $44,%esp
+ movl 72(%esp),%eax
+ xorl %ecx,%ecx
+.L000loop:
+ cmpl 76(%esp),%ecx
+ jae .L001loop_done
+ movl (%eax,%ecx,4),%edx
+ movl %edx,(%esp,%ecx,4)
+ addl $1,%ecx
+ jmp .L000loop
+.L001loop_done:
+ call *64(%esp)
+ addl $44,%esp
+ movl 24(%esp),%ecx
+ movl %esi,(%ecx)
+ movl %edi,4(%ecx)
+ movl %ebx,8(%ecx)
+ movl %ebp,12(%ecx)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size abi_test_trampoline,.-.L_abi_test_trampoline_begin
+.globl abi_test_get_and_clear_direction_flag
+.hidden abi_test_get_and_clear_direction_flag
+.type abi_test_get_and_clear_direction_flag,@function
+.align 16
+abi_test_get_and_clear_direction_flag:
+.L_abi_test_get_and_clear_direction_flag_begin:
+ pushfl
+ popl %eax
+ andl $1024,%eax
+ shrl $10,%eax
+ cld
+ ret
+.size abi_test_get_and_clear_direction_flag,.-.L_abi_test_get_and_clear_direction_flag_begin
+.globl abi_test_set_direction_flag
+.hidden abi_test_set_direction_flag
+.type abi_test_set_direction_flag,@function
+.align 16
+abi_test_set_direction_flag:
+.L_abi_test_set_direction_flag_begin:
+ std
+ ret
+.size abi_test_set_direction_flag,.-.L_abi_test_set_direction_flag_begin
+.globl abi_test_clobber_eax
+.hidden abi_test_clobber_eax
+.type abi_test_clobber_eax,@function
+.align 16
+abi_test_clobber_eax:
+.L_abi_test_clobber_eax_begin:
+ xorl %eax,%eax
+ ret
+.size abi_test_clobber_eax,.-.L_abi_test_clobber_eax_begin
+.globl abi_test_clobber_ebx
+.hidden abi_test_clobber_ebx
+.type abi_test_clobber_ebx,@function
+.align 16
+abi_test_clobber_ebx:
+.L_abi_test_clobber_ebx_begin:
+ xorl %ebx,%ebx
+ ret
+.size abi_test_clobber_ebx,.-.L_abi_test_clobber_ebx_begin
+.globl abi_test_clobber_ecx
+.hidden abi_test_clobber_ecx
+.type abi_test_clobber_ecx,@function
+.align 16
+abi_test_clobber_ecx:
+.L_abi_test_clobber_ecx_begin:
+ xorl %ecx,%ecx
+ ret
+.size abi_test_clobber_ecx,.-.L_abi_test_clobber_ecx_begin
+.globl abi_test_clobber_edx
+.hidden abi_test_clobber_edx
+.type abi_test_clobber_edx,@function
+.align 16
+abi_test_clobber_edx:
+.L_abi_test_clobber_edx_begin:
+ xorl %edx,%edx
+ ret
+.size abi_test_clobber_edx,.-.L_abi_test_clobber_edx_begin
+.globl abi_test_clobber_edi
+.hidden abi_test_clobber_edi
+.type abi_test_clobber_edi,@function
+.align 16
+abi_test_clobber_edi:
+.L_abi_test_clobber_edi_begin:
+ xorl %edi,%edi
+ ret
+.size abi_test_clobber_edi,.-.L_abi_test_clobber_edi_begin
+.globl abi_test_clobber_esi
+.hidden abi_test_clobber_esi
+.type abi_test_clobber_esi,@function
+.align 16
+abi_test_clobber_esi:
+.L_abi_test_clobber_esi_begin:
+ xorl %esi,%esi
+ ret
+.size abi_test_clobber_esi,.-.L_abi_test_clobber_esi_begin
+.globl abi_test_clobber_ebp
+.hidden abi_test_clobber_ebp
+.type abi_test_clobber_ebp,@function
+.align 16
+abi_test_clobber_ebp:
+.L_abi_test_clobber_ebp_begin:
+ xorl %ebp,%ebp
+ ret
+.size abi_test_clobber_ebp,.-.L_abi_test_clobber_ebp_begin
+.globl abi_test_clobber_xmm0
+.hidden abi_test_clobber_xmm0
+.type abi_test_clobber_xmm0,@function
+.align 16
+abi_test_clobber_xmm0:
+.L_abi_test_clobber_xmm0_begin:
+ pxor %xmm0,%xmm0
+ ret
+.size abi_test_clobber_xmm0,.-.L_abi_test_clobber_xmm0_begin
+.globl abi_test_clobber_xmm1
+.hidden abi_test_clobber_xmm1
+.type abi_test_clobber_xmm1,@function
+.align 16
+abi_test_clobber_xmm1:
+.L_abi_test_clobber_xmm1_begin:
+ pxor %xmm1,%xmm1
+ ret
+.size abi_test_clobber_xmm1,.-.L_abi_test_clobber_xmm1_begin
+.globl abi_test_clobber_xmm2
+.hidden abi_test_clobber_xmm2
+.type abi_test_clobber_xmm2,@function
+.align 16
+abi_test_clobber_xmm2:
+.L_abi_test_clobber_xmm2_begin:
+ pxor %xmm2,%xmm2
+ ret
+.size abi_test_clobber_xmm2,.-.L_abi_test_clobber_xmm2_begin
+.globl abi_test_clobber_xmm3
+.hidden abi_test_clobber_xmm3
+.type abi_test_clobber_xmm3,@function
+.align 16
+abi_test_clobber_xmm3:
+.L_abi_test_clobber_xmm3_begin:
+ pxor %xmm3,%xmm3
+ ret
+.size abi_test_clobber_xmm3,.-.L_abi_test_clobber_xmm3_begin
+.globl abi_test_clobber_xmm4
+.hidden abi_test_clobber_xmm4
+.type abi_test_clobber_xmm4,@function
+.align 16
+abi_test_clobber_xmm4:
+.L_abi_test_clobber_xmm4_begin:
+ pxor %xmm4,%xmm4
+ ret
+.size abi_test_clobber_xmm4,.-.L_abi_test_clobber_xmm4_begin
+.globl abi_test_clobber_xmm5
+.hidden abi_test_clobber_xmm5
+.type abi_test_clobber_xmm5,@function
+.align 16
+abi_test_clobber_xmm5:
+.L_abi_test_clobber_xmm5_begin:
+ pxor %xmm5,%xmm5
+ ret
+.size abi_test_clobber_xmm5,.-.L_abi_test_clobber_xmm5_begin
+.globl abi_test_clobber_xmm6
+.hidden abi_test_clobber_xmm6
+.type abi_test_clobber_xmm6,@function
+.align 16
+abi_test_clobber_xmm6:
+.L_abi_test_clobber_xmm6_begin:
+ pxor %xmm6,%xmm6
+ ret
+.size abi_test_clobber_xmm6,.-.L_abi_test_clobber_xmm6_begin
+.globl abi_test_clobber_xmm7
+.hidden abi_test_clobber_xmm7
+.type abi_test_clobber_xmm7,@function
+.align 16
+abi_test_clobber_xmm7:
+.L_abi_test_clobber_xmm7_begin:
+ pxor %xmm7,%xmm7
+ ret
+.size abi_test_clobber_xmm7,.-.L_abi_test_clobber_xmm7_begin
+#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__)
diff --git a/gen/test_support/trampoline-x86-win.asm b/gen/test_support/trampoline-x86-win.asm
new file mode 100644
index 0000000..3ef9917
--- /dev/null
+++ b/gen/test_support/trampoline-x86-win.asm
@@ -0,0 +1,161 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+%ifidn __OUTPUT_FORMAT__, win32
+%ifidn __OUTPUT_FORMAT__,obj
+section code use32 class=code align=64
+%elifidn __OUTPUT_FORMAT__,win32
+$@feat.00 equ 1
+section .text code align=64
+%else
+section .text code
+%endif
+global _abi_test_trampoline
+align 16
+_abi_test_trampoline:
+L$_abi_test_trampoline_begin:
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov ecx,DWORD [24+esp]
+ mov esi,DWORD [ecx]
+ mov edi,DWORD [4+ecx]
+ mov ebx,DWORD [8+ecx]
+ mov ebp,DWORD [12+ecx]
+ sub esp,44
+ mov eax,DWORD [72+esp]
+ xor ecx,ecx
+L$000loop:
+ cmp ecx,DWORD [76+esp]
+ jae NEAR L$001loop_done
+ mov edx,DWORD [ecx*4+eax]
+ mov DWORD [ecx*4+esp],edx
+ add ecx,1
+ jmp NEAR L$000loop
+L$001loop_done:
+ call DWORD [64+esp]
+ add esp,44
+ mov ecx,DWORD [24+esp]
+ mov DWORD [ecx],esi
+ mov DWORD [4+ecx],edi
+ mov DWORD [8+ecx],ebx
+ mov DWORD [12+ecx],ebp
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+global _abi_test_get_and_clear_direction_flag
+align 16
+_abi_test_get_and_clear_direction_flag:
+L$_abi_test_get_and_clear_direction_flag_begin:
+ pushfd
+ pop eax
+ and eax,1024
+ shr eax,10
+ cld
+ ret
+global _abi_test_set_direction_flag
+align 16
+_abi_test_set_direction_flag:
+L$_abi_test_set_direction_flag_begin:
+ std
+ ret
+global _abi_test_clobber_eax
+align 16
+_abi_test_clobber_eax:
+L$_abi_test_clobber_eax_begin:
+ xor eax,eax
+ ret
+global _abi_test_clobber_ebx
+align 16
+_abi_test_clobber_ebx:
+L$_abi_test_clobber_ebx_begin:
+ xor ebx,ebx
+ ret
+global _abi_test_clobber_ecx
+align 16
+_abi_test_clobber_ecx:
+L$_abi_test_clobber_ecx_begin:
+ xor ecx,ecx
+ ret
+global _abi_test_clobber_edx
+align 16
+_abi_test_clobber_edx:
+L$_abi_test_clobber_edx_begin:
+ xor edx,edx
+ ret
+global _abi_test_clobber_edi
+align 16
+_abi_test_clobber_edi:
+L$_abi_test_clobber_edi_begin:
+ xor edi,edi
+ ret
+global _abi_test_clobber_esi
+align 16
+_abi_test_clobber_esi:
+L$_abi_test_clobber_esi_begin:
+ xor esi,esi
+ ret
+global _abi_test_clobber_ebp
+align 16
+_abi_test_clobber_ebp:
+L$_abi_test_clobber_ebp_begin:
+ xor ebp,ebp
+ ret
+global _abi_test_clobber_xmm0
+align 16
+_abi_test_clobber_xmm0:
+L$_abi_test_clobber_xmm0_begin:
+ pxor xmm0,xmm0
+ ret
+global _abi_test_clobber_xmm1
+align 16
+_abi_test_clobber_xmm1:
+L$_abi_test_clobber_xmm1_begin:
+ pxor xmm1,xmm1
+ ret
+global _abi_test_clobber_xmm2
+align 16
+_abi_test_clobber_xmm2:
+L$_abi_test_clobber_xmm2_begin:
+ pxor xmm2,xmm2
+ ret
+global _abi_test_clobber_xmm3
+align 16
+_abi_test_clobber_xmm3:
+L$_abi_test_clobber_xmm3_begin:
+ pxor xmm3,xmm3
+ ret
+global _abi_test_clobber_xmm4
+align 16
+_abi_test_clobber_xmm4:
+L$_abi_test_clobber_xmm4_begin:
+ pxor xmm4,xmm4
+ ret
+global _abi_test_clobber_xmm5
+align 16
+_abi_test_clobber_xmm5:
+L$_abi_test_clobber_xmm5_begin:
+ pxor xmm5,xmm5
+ ret
+global _abi_test_clobber_xmm6
+align 16
+_abi_test_clobber_xmm6:
+L$_abi_test_clobber_xmm6_begin:
+ pxor xmm6,xmm6
+ ret
+global _abi_test_clobber_xmm7
+align 16
+_abi_test_clobber_xmm7:
+L$_abi_test_clobber_xmm7_begin:
+ pxor xmm7,xmm7
+ ret
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/gen/test_support/trampoline-x86_64-apple.S b/gen/test_support/trampoline-x86_64-apple.S
new file mode 100644
index 0000000..7c76d2d
--- /dev/null
+++ b/gen/test_support/trampoline-x86_64-apple.S
@@ -0,0 +1,541 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
+.text
+
+
+
+
+
+
+
+
+
+.globl _abi_test_trampoline
+.private_extern _abi_test_trampoline
+.p2align 4
+_abi_test_trampoline:
+
+
+_CET_ENDBR
+
+
+
+
+
+
+
+
+
+ subq $120,%rsp
+
+
+ movq %r8,48(%rsp)
+ movq %rbx,64(%rsp)
+
+
+ movq %rbp,72(%rsp)
+
+
+ movq %r12,80(%rsp)
+
+
+ movq %r13,88(%rsp)
+
+
+ movq %r14,96(%rsp)
+
+
+ movq %r15,104(%rsp)
+
+
+ movq 0(%rsi),%rbx
+ movq 8(%rsi),%rbp
+ movq 16(%rsi),%r12
+ movq 24(%rsi),%r13
+ movq 32(%rsi),%r14
+ movq 40(%rsi),%r15
+
+ movq %rdi,32(%rsp)
+ movq %rsi,40(%rsp)
+
+
+
+
+ movq %rdx,%r10
+ movq %rcx,%r11
+ decq %r11
+ js L$args_done
+ movq (%r10),%rdi
+ addq $8,%r10
+ decq %r11
+ js L$args_done
+ movq (%r10),%rsi
+ addq $8,%r10
+ decq %r11
+ js L$args_done
+ movq (%r10),%rdx
+ addq $8,%r10
+ decq %r11
+ js L$args_done
+ movq (%r10),%rcx
+ addq $8,%r10
+ decq %r11
+ js L$args_done
+ movq (%r10),%r8
+ addq $8,%r10
+ decq %r11
+ js L$args_done
+ movq (%r10),%r9
+ addq $8,%r10
+ leaq 0(%rsp),%rax
+L$args_loop:
+ decq %r11
+ js L$args_done
+
+
+
+
+
+
+ movq %r11,56(%rsp)
+ movq (%r10),%r11
+ movq %r11,(%rax)
+ movq 56(%rsp),%r11
+
+ addq $8,%r10
+ addq $8,%rax
+ jmp L$args_loop
+
+L$args_done:
+ movq 32(%rsp),%rax
+ movq 48(%rsp),%r10
+ testq %r10,%r10
+ jz L$no_unwind
+
+
+ pushfq
+ orq $0x100,0(%rsp)
+ popfq
+
+
+
+ nop
+.globl _abi_test_unwind_start
+.private_extern _abi_test_unwind_start
+_abi_test_unwind_start:
+
+ call *%rax
+.globl _abi_test_unwind_return
+.private_extern _abi_test_unwind_return
+_abi_test_unwind_return:
+
+
+
+
+ pushfq
+ andq $-0x101,0(%rsp)
+ popfq
+.globl _abi_test_unwind_stop
+.private_extern _abi_test_unwind_stop
+_abi_test_unwind_stop:
+
+ jmp L$call_done
+
+L$no_unwind:
+ call *%rax
+
+L$call_done:
+
+ movq 40(%rsp),%rsi
+ movq %rbx,0(%rsi)
+ movq %rbp,8(%rsi)
+ movq %r12,16(%rsi)
+ movq %r13,24(%rsi)
+ movq %r14,32(%rsi)
+ movq %r15,40(%rsi)
+ movq 64(%rsp),%rbx
+
+ movq 72(%rsp),%rbp
+
+ movq 80(%rsp),%r12
+
+ movq 88(%rsp),%r13
+
+ movq 96(%rsp),%r14
+
+ movq 104(%rsp),%r15
+
+ addq $120,%rsp
+
+
+
+ ret
+
+
+
+
+.globl _abi_test_clobber_rax
+.private_extern _abi_test_clobber_rax
+.p2align 4
+_abi_test_clobber_rax:
+_CET_ENDBR
+ xorq %rax,%rax
+ ret
+
+
+.globl _abi_test_clobber_rbx
+.private_extern _abi_test_clobber_rbx
+.p2align 4
+_abi_test_clobber_rbx:
+_CET_ENDBR
+ xorq %rbx,%rbx
+ ret
+
+
+.globl _abi_test_clobber_rcx
+.private_extern _abi_test_clobber_rcx
+.p2align 4
+_abi_test_clobber_rcx:
+_CET_ENDBR
+ xorq %rcx,%rcx
+ ret
+
+
+.globl _abi_test_clobber_rdx
+.private_extern _abi_test_clobber_rdx
+.p2align 4
+_abi_test_clobber_rdx:
+_CET_ENDBR
+ xorq %rdx,%rdx
+ ret
+
+
+.globl _abi_test_clobber_rdi
+.private_extern _abi_test_clobber_rdi
+.p2align 4
+_abi_test_clobber_rdi:
+_CET_ENDBR
+ xorq %rdi,%rdi
+ ret
+
+
+.globl _abi_test_clobber_rsi
+.private_extern _abi_test_clobber_rsi
+.p2align 4
+_abi_test_clobber_rsi:
+_CET_ENDBR
+ xorq %rsi,%rsi
+ ret
+
+
+.globl _abi_test_clobber_rbp
+.private_extern _abi_test_clobber_rbp
+.p2align 4
+_abi_test_clobber_rbp:
+_CET_ENDBR
+ xorq %rbp,%rbp
+ ret
+
+
+.globl _abi_test_clobber_r8
+.private_extern _abi_test_clobber_r8
+.p2align 4
+_abi_test_clobber_r8:
+_CET_ENDBR
+ xorq %r8,%r8
+ ret
+
+
+.globl _abi_test_clobber_r9
+.private_extern _abi_test_clobber_r9
+.p2align 4
+_abi_test_clobber_r9:
+_CET_ENDBR
+ xorq %r9,%r9
+ ret
+
+
+.globl _abi_test_clobber_r10
+.private_extern _abi_test_clobber_r10
+.p2align 4
+_abi_test_clobber_r10:
+_CET_ENDBR
+ xorq %r10,%r10
+ ret
+
+
+.globl _abi_test_clobber_r11
+.private_extern _abi_test_clobber_r11
+.p2align 4
+_abi_test_clobber_r11:
+_CET_ENDBR
+ xorq %r11,%r11
+ ret
+
+
+.globl _abi_test_clobber_r12
+.private_extern _abi_test_clobber_r12
+.p2align 4
+_abi_test_clobber_r12:
+_CET_ENDBR
+ xorq %r12,%r12
+ ret
+
+
+.globl _abi_test_clobber_r13
+.private_extern _abi_test_clobber_r13
+.p2align 4
+_abi_test_clobber_r13:
+_CET_ENDBR
+ xorq %r13,%r13
+ ret
+
+
+.globl _abi_test_clobber_r14
+.private_extern _abi_test_clobber_r14
+.p2align 4
+_abi_test_clobber_r14:
+_CET_ENDBR
+ xorq %r14,%r14
+ ret
+
+
+.globl _abi_test_clobber_r15
+.private_extern _abi_test_clobber_r15
+.p2align 4
+_abi_test_clobber_r15:
+_CET_ENDBR
+ xorq %r15,%r15
+ ret
+
+
+.globl _abi_test_clobber_xmm0
+.private_extern _abi_test_clobber_xmm0
+.p2align 4
+_abi_test_clobber_xmm0:
+_CET_ENDBR
+ pxor %xmm0,%xmm0
+ ret
+
+
+.globl _abi_test_clobber_xmm1
+.private_extern _abi_test_clobber_xmm1
+.p2align 4
+_abi_test_clobber_xmm1:
+_CET_ENDBR
+ pxor %xmm1,%xmm1
+ ret
+
+
+.globl _abi_test_clobber_xmm2
+.private_extern _abi_test_clobber_xmm2
+.p2align 4
+_abi_test_clobber_xmm2:
+_CET_ENDBR
+ pxor %xmm2,%xmm2
+ ret
+
+
+.globl _abi_test_clobber_xmm3
+.private_extern _abi_test_clobber_xmm3
+.p2align 4
+_abi_test_clobber_xmm3:
+_CET_ENDBR
+ pxor %xmm3,%xmm3
+ ret
+
+
+.globl _abi_test_clobber_xmm4
+.private_extern _abi_test_clobber_xmm4
+.p2align 4
+_abi_test_clobber_xmm4:
+_CET_ENDBR
+ pxor %xmm4,%xmm4
+ ret
+
+
+.globl _abi_test_clobber_xmm5
+.private_extern _abi_test_clobber_xmm5
+.p2align 4
+_abi_test_clobber_xmm5:
+_CET_ENDBR
+ pxor %xmm5,%xmm5
+ ret
+
+
+.globl _abi_test_clobber_xmm6
+.private_extern _abi_test_clobber_xmm6
+.p2align 4
+_abi_test_clobber_xmm6:
+_CET_ENDBR
+ pxor %xmm6,%xmm6
+ ret
+
+
+.globl _abi_test_clobber_xmm7
+.private_extern _abi_test_clobber_xmm7
+.p2align 4
+_abi_test_clobber_xmm7:
+_CET_ENDBR
+ pxor %xmm7,%xmm7
+ ret
+
+
+.globl _abi_test_clobber_xmm8
+.private_extern _abi_test_clobber_xmm8
+.p2align 4
+_abi_test_clobber_xmm8:
+_CET_ENDBR
+ pxor %xmm8,%xmm8
+ ret
+
+
+.globl _abi_test_clobber_xmm9
+.private_extern _abi_test_clobber_xmm9
+.p2align 4
+_abi_test_clobber_xmm9:
+_CET_ENDBR
+ pxor %xmm9,%xmm9
+ ret
+
+
+.globl _abi_test_clobber_xmm10
+.private_extern _abi_test_clobber_xmm10
+.p2align 4
+_abi_test_clobber_xmm10:
+_CET_ENDBR
+ pxor %xmm10,%xmm10
+ ret
+
+
+.globl _abi_test_clobber_xmm11
+.private_extern _abi_test_clobber_xmm11
+.p2align 4
+_abi_test_clobber_xmm11:
+_CET_ENDBR
+ pxor %xmm11,%xmm11
+ ret
+
+
+.globl _abi_test_clobber_xmm12
+.private_extern _abi_test_clobber_xmm12
+.p2align 4
+_abi_test_clobber_xmm12:
+_CET_ENDBR
+ pxor %xmm12,%xmm12
+ ret
+
+
+.globl _abi_test_clobber_xmm13
+.private_extern _abi_test_clobber_xmm13
+.p2align 4
+_abi_test_clobber_xmm13:
+_CET_ENDBR
+ pxor %xmm13,%xmm13
+ ret
+
+
+.globl _abi_test_clobber_xmm14
+.private_extern _abi_test_clobber_xmm14
+.p2align 4
+_abi_test_clobber_xmm14:
+_CET_ENDBR
+ pxor %xmm14,%xmm14
+ ret
+
+
+.globl _abi_test_clobber_xmm15
+.private_extern _abi_test_clobber_xmm15
+.p2align 4
+_abi_test_clobber_xmm15:
+_CET_ENDBR
+ pxor %xmm15,%xmm15
+ ret
+
+
+
+
+
+.globl _abi_test_bad_unwind_wrong_register
+.private_extern _abi_test_bad_unwind_wrong_register
+.p2align 4
+_abi_test_bad_unwind_wrong_register:
+
+
+_CET_ENDBR
+ pushq %r12
+
+
+
+
+
+ nop
+ popq %r12
+
+ ret
+
+
+
+
+
+
+
+
+.globl _abi_test_bad_unwind_temporary
+.private_extern _abi_test_bad_unwind_temporary
+.p2align 4
+_abi_test_bad_unwind_temporary:
+
+
+_CET_ENDBR
+ pushq %r12
+
+
+
+ movq %r12,%rax
+ incq %rax
+ movq %rax,(%rsp)
+
+
+
+ movq %r12,(%rsp)
+
+
+ popq %r12
+
+ ret
+
+
+
+
+
+
+
+
+.globl _abi_test_get_and_clear_direction_flag
+.private_extern _abi_test_get_and_clear_direction_flag
+_abi_test_get_and_clear_direction_flag:
+_CET_ENDBR
+ pushfq
+ popq %rax
+ andq $0x400,%rax
+ shrq $10,%rax
+ cld
+ ret
+
+
+
+
+
+.globl _abi_test_set_direction_flag
+.private_extern _abi_test_set_direction_flag
+_abi_test_set_direction_flag:
+_CET_ENDBR
+ std
+ ret
+
+#endif
diff --git a/gen/test_support/trampoline-x86_64-linux.S b/gen/test_support/trampoline-x86_64-linux.S
new file mode 100644
index 0000000..93af8b9
--- /dev/null
+++ b/gen/test_support/trampoline-x86_64-linux.S
@@ -0,0 +1,545 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#include <openssl/asm_base.h>
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
+.text
+
+
+
+
+
+
+
+
+.type abi_test_trampoline, @function
+.globl abi_test_trampoline
+.hidden abi_test_trampoline
+.align 16
+abi_test_trampoline:
+.cfi_startproc
+
+_CET_ENDBR
+
+
+
+
+
+
+
+
+
+ subq $120,%rsp
+.cfi_adjust_cfa_offset 120
+
+ movq %r8,48(%rsp)
+ movq %rbx,64(%rsp)
+.cfi_offset rbx, -64
+
+ movq %rbp,72(%rsp)
+.cfi_offset rbp, -56
+
+ movq %r12,80(%rsp)
+.cfi_offset r12, -48
+
+ movq %r13,88(%rsp)
+.cfi_offset r13, -40
+
+ movq %r14,96(%rsp)
+.cfi_offset r14, -32
+
+ movq %r15,104(%rsp)
+.cfi_offset r15, -24
+
+ movq 0(%rsi),%rbx
+ movq 8(%rsi),%rbp
+ movq 16(%rsi),%r12
+ movq 24(%rsi),%r13
+ movq 32(%rsi),%r14
+ movq 40(%rsi),%r15
+
+ movq %rdi,32(%rsp)
+ movq %rsi,40(%rsp)
+
+
+
+
+ movq %rdx,%r10
+ movq %rcx,%r11
+ decq %r11
+ js .Largs_done
+ movq (%r10),%rdi
+ addq $8,%r10
+ decq %r11
+ js .Largs_done
+ movq (%r10),%rsi
+ addq $8,%r10
+ decq %r11
+ js .Largs_done
+ movq (%r10),%rdx
+ addq $8,%r10
+ decq %r11
+ js .Largs_done
+ movq (%r10),%rcx
+ addq $8,%r10
+ decq %r11
+ js .Largs_done
+ movq (%r10),%r8
+ addq $8,%r10
+ decq %r11
+ js .Largs_done
+ movq (%r10),%r9
+ addq $8,%r10
+ leaq 0(%rsp),%rax
+.Largs_loop:
+ decq %r11
+ js .Largs_done
+
+
+
+
+
+
+ movq %r11,56(%rsp)
+ movq (%r10),%r11
+ movq %r11,(%rax)
+ movq 56(%rsp),%r11
+
+ addq $8,%r10
+ addq $8,%rax
+ jmp .Largs_loop
+
+.Largs_done:
+ movq 32(%rsp),%rax
+ movq 48(%rsp),%r10
+ testq %r10,%r10
+ jz .Lno_unwind
+
+
+ pushfq
+ orq $0x100,0(%rsp)
+ popfq
+
+
+
+ nop
+.globl abi_test_unwind_start
+.hidden abi_test_unwind_start
+abi_test_unwind_start:
+
+ call *%rax
+.globl abi_test_unwind_return
+.hidden abi_test_unwind_return
+abi_test_unwind_return:
+
+
+
+
+ pushfq
+ andq $-0x101,0(%rsp)
+ popfq
+.globl abi_test_unwind_stop
+.hidden abi_test_unwind_stop
+abi_test_unwind_stop:
+
+ jmp .Lcall_done
+
+.Lno_unwind:
+ call *%rax
+
+.Lcall_done:
+
+ movq 40(%rsp),%rsi
+ movq %rbx,0(%rsi)
+ movq %rbp,8(%rsi)
+ movq %r12,16(%rsi)
+ movq %r13,24(%rsi)
+ movq %r14,32(%rsi)
+ movq %r15,40(%rsi)
+ movq 64(%rsp),%rbx
+.cfi_restore rbx
+ movq 72(%rsp),%rbp
+.cfi_restore rbp
+ movq 80(%rsp),%r12
+.cfi_restore r12
+ movq 88(%rsp),%r13
+.cfi_restore r13
+ movq 96(%rsp),%r14
+.cfi_restore r14
+ movq 104(%rsp),%r15
+.cfi_restore r15
+ addq $120,%rsp
+.cfi_adjust_cfa_offset -120
+
+
+ ret
+.cfi_endproc
+
+.size abi_test_trampoline,.-abi_test_trampoline
+.type abi_test_clobber_rax, @function
+.globl abi_test_clobber_rax
+.hidden abi_test_clobber_rax
+.align 16
+abi_test_clobber_rax:
+_CET_ENDBR
+ xorq %rax,%rax
+ ret
+.size abi_test_clobber_rax,.-abi_test_clobber_rax
+.type abi_test_clobber_rbx, @function
+.globl abi_test_clobber_rbx
+.hidden abi_test_clobber_rbx
+.align 16
+abi_test_clobber_rbx:
+_CET_ENDBR
+ xorq %rbx,%rbx
+ ret
+.size abi_test_clobber_rbx,.-abi_test_clobber_rbx
+.type abi_test_clobber_rcx, @function
+.globl abi_test_clobber_rcx
+.hidden abi_test_clobber_rcx
+.align 16
+abi_test_clobber_rcx:
+_CET_ENDBR
+ xorq %rcx,%rcx
+ ret
+.size abi_test_clobber_rcx,.-abi_test_clobber_rcx
+.type abi_test_clobber_rdx, @function
+.globl abi_test_clobber_rdx
+.hidden abi_test_clobber_rdx
+.align 16
+abi_test_clobber_rdx:
+_CET_ENDBR
+ xorq %rdx,%rdx
+ ret
+.size abi_test_clobber_rdx,.-abi_test_clobber_rdx
+.type abi_test_clobber_rdi, @function
+.globl abi_test_clobber_rdi
+.hidden abi_test_clobber_rdi
+.align 16
+abi_test_clobber_rdi:
+_CET_ENDBR
+ xorq %rdi,%rdi
+ ret
+.size abi_test_clobber_rdi,.-abi_test_clobber_rdi
+.type abi_test_clobber_rsi, @function
+.globl abi_test_clobber_rsi
+.hidden abi_test_clobber_rsi
+.align 16
+abi_test_clobber_rsi:
+_CET_ENDBR
+ xorq %rsi,%rsi
+ ret
+.size abi_test_clobber_rsi,.-abi_test_clobber_rsi
+.type abi_test_clobber_rbp, @function
+.globl abi_test_clobber_rbp
+.hidden abi_test_clobber_rbp
+.align 16
+abi_test_clobber_rbp:
+_CET_ENDBR
+ xorq %rbp,%rbp
+ ret
+.size abi_test_clobber_rbp,.-abi_test_clobber_rbp
+.type abi_test_clobber_r8, @function
+.globl abi_test_clobber_r8
+.hidden abi_test_clobber_r8
+.align 16
+abi_test_clobber_r8:
+_CET_ENDBR
+ xorq %r8,%r8
+ ret
+.size abi_test_clobber_r8,.-abi_test_clobber_r8
+.type abi_test_clobber_r9, @function
+.globl abi_test_clobber_r9
+.hidden abi_test_clobber_r9
+.align 16
+abi_test_clobber_r9:
+_CET_ENDBR
+ xorq %r9,%r9
+ ret
+.size abi_test_clobber_r9,.-abi_test_clobber_r9
+.type abi_test_clobber_r10, @function
+.globl abi_test_clobber_r10
+.hidden abi_test_clobber_r10
+.align 16
+abi_test_clobber_r10:
+_CET_ENDBR
+ xorq %r10,%r10
+ ret
+.size abi_test_clobber_r10,.-abi_test_clobber_r10
+.type abi_test_clobber_r11, @function
+.globl abi_test_clobber_r11
+.hidden abi_test_clobber_r11
+.align 16
+abi_test_clobber_r11:
+_CET_ENDBR
+ xorq %r11,%r11
+ ret
+.size abi_test_clobber_r11,.-abi_test_clobber_r11
+.type abi_test_clobber_r12, @function
+.globl abi_test_clobber_r12
+.hidden abi_test_clobber_r12
+.align 16
+abi_test_clobber_r12:
+_CET_ENDBR
+ xorq %r12,%r12
+ ret
+.size abi_test_clobber_r12,.-abi_test_clobber_r12
+.type abi_test_clobber_r13, @function
+.globl abi_test_clobber_r13
+.hidden abi_test_clobber_r13
+.align 16
+abi_test_clobber_r13:
+_CET_ENDBR
+ xorq %r13,%r13
+ ret
+.size abi_test_clobber_r13,.-abi_test_clobber_r13
+.type abi_test_clobber_r14, @function
+.globl abi_test_clobber_r14
+.hidden abi_test_clobber_r14
+.align 16
+abi_test_clobber_r14:
+_CET_ENDBR
+ xorq %r14,%r14
+ ret
+.size abi_test_clobber_r14,.-abi_test_clobber_r14
+.type abi_test_clobber_r15, @function
+.globl abi_test_clobber_r15
+.hidden abi_test_clobber_r15
+.align 16
+abi_test_clobber_r15:
+_CET_ENDBR
+ xorq %r15,%r15
+ ret
+.size abi_test_clobber_r15,.-abi_test_clobber_r15
+.type abi_test_clobber_xmm0, @function
+.globl abi_test_clobber_xmm0
+.hidden abi_test_clobber_xmm0
+.align 16
+abi_test_clobber_xmm0:
+_CET_ENDBR
+ pxor %xmm0,%xmm0
+ ret
+.size abi_test_clobber_xmm0,.-abi_test_clobber_xmm0
+.type abi_test_clobber_xmm1, @function
+.globl abi_test_clobber_xmm1
+.hidden abi_test_clobber_xmm1
+.align 16
+abi_test_clobber_xmm1:
+_CET_ENDBR
+ pxor %xmm1,%xmm1
+ ret
+.size abi_test_clobber_xmm1,.-abi_test_clobber_xmm1
+.type abi_test_clobber_xmm2, @function
+.globl abi_test_clobber_xmm2
+.hidden abi_test_clobber_xmm2
+.align 16
+abi_test_clobber_xmm2:
+_CET_ENDBR
+ pxor %xmm2,%xmm2
+ ret
+.size abi_test_clobber_xmm2,.-abi_test_clobber_xmm2
+.type abi_test_clobber_xmm3, @function
+.globl abi_test_clobber_xmm3
+.hidden abi_test_clobber_xmm3
+.align 16
+abi_test_clobber_xmm3:
+_CET_ENDBR
+ pxor %xmm3,%xmm3
+ ret
+.size abi_test_clobber_xmm3,.-abi_test_clobber_xmm3
+.type abi_test_clobber_xmm4, @function
+.globl abi_test_clobber_xmm4
+.hidden abi_test_clobber_xmm4
+.align 16
+abi_test_clobber_xmm4:
+_CET_ENDBR
+ pxor %xmm4,%xmm4
+ ret
+.size abi_test_clobber_xmm4,.-abi_test_clobber_xmm4
+.type abi_test_clobber_xmm5, @function
+.globl abi_test_clobber_xmm5
+.hidden abi_test_clobber_xmm5
+.align 16
+abi_test_clobber_xmm5:
+_CET_ENDBR
+ pxor %xmm5,%xmm5
+ ret
+.size abi_test_clobber_xmm5,.-abi_test_clobber_xmm5
+.type abi_test_clobber_xmm6, @function
+.globl abi_test_clobber_xmm6
+.hidden abi_test_clobber_xmm6
+.align 16
+abi_test_clobber_xmm6:
+_CET_ENDBR
+ pxor %xmm6,%xmm6
+ ret
+.size abi_test_clobber_xmm6,.-abi_test_clobber_xmm6
+.type abi_test_clobber_xmm7, @function
+.globl abi_test_clobber_xmm7
+.hidden abi_test_clobber_xmm7
+.align 16
+abi_test_clobber_xmm7:
+_CET_ENDBR
+ pxor %xmm7,%xmm7
+ ret
+.size abi_test_clobber_xmm7,.-abi_test_clobber_xmm7
+.type abi_test_clobber_xmm8, @function
+.globl abi_test_clobber_xmm8
+.hidden abi_test_clobber_xmm8
+.align 16
+abi_test_clobber_xmm8:
+_CET_ENDBR
+ pxor %xmm8,%xmm8
+ ret
+.size abi_test_clobber_xmm8,.-abi_test_clobber_xmm8
+.type abi_test_clobber_xmm9, @function
+.globl abi_test_clobber_xmm9
+.hidden abi_test_clobber_xmm9
+.align 16
+abi_test_clobber_xmm9:
+_CET_ENDBR
+ pxor %xmm9,%xmm9
+ ret
+.size abi_test_clobber_xmm9,.-abi_test_clobber_xmm9
+.type abi_test_clobber_xmm10, @function
+.globl abi_test_clobber_xmm10
+.hidden abi_test_clobber_xmm10
+.align 16
+abi_test_clobber_xmm10:
+_CET_ENDBR
+ pxor %xmm10,%xmm10
+ ret
+.size abi_test_clobber_xmm10,.-abi_test_clobber_xmm10
+.type abi_test_clobber_xmm11, @function
+.globl abi_test_clobber_xmm11
+.hidden abi_test_clobber_xmm11
+.align 16
+abi_test_clobber_xmm11:
+_CET_ENDBR
+ pxor %xmm11,%xmm11
+ ret
+.size abi_test_clobber_xmm11,.-abi_test_clobber_xmm11
+.type abi_test_clobber_xmm12, @function
+.globl abi_test_clobber_xmm12
+.hidden abi_test_clobber_xmm12
+.align 16
+abi_test_clobber_xmm12:
+_CET_ENDBR
+ pxor %xmm12,%xmm12
+ ret
+.size abi_test_clobber_xmm12,.-abi_test_clobber_xmm12
+.type abi_test_clobber_xmm13, @function
+.globl abi_test_clobber_xmm13
+.hidden abi_test_clobber_xmm13
+.align 16
+abi_test_clobber_xmm13:
+_CET_ENDBR
+ pxor %xmm13,%xmm13
+ ret
+.size abi_test_clobber_xmm13,.-abi_test_clobber_xmm13
+.type abi_test_clobber_xmm14, @function
+.globl abi_test_clobber_xmm14
+.hidden abi_test_clobber_xmm14
+.align 16
+abi_test_clobber_xmm14:
+_CET_ENDBR
+ pxor %xmm14,%xmm14
+ ret
+.size abi_test_clobber_xmm14,.-abi_test_clobber_xmm14
+.type abi_test_clobber_xmm15, @function
+.globl abi_test_clobber_xmm15
+.hidden abi_test_clobber_xmm15
+.align 16
+abi_test_clobber_xmm15:
+_CET_ENDBR
+ pxor %xmm15,%xmm15
+ ret
+.size abi_test_clobber_xmm15,.-abi_test_clobber_xmm15
+
+
+
+.type abi_test_bad_unwind_wrong_register, @function
+.globl abi_test_bad_unwind_wrong_register
+.hidden abi_test_bad_unwind_wrong_register
+.align 16
+abi_test_bad_unwind_wrong_register:
+.cfi_startproc
+
+_CET_ENDBR
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-16
+
+
+
+
+ nop
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ ret
+
+.cfi_endproc
+.size abi_test_bad_unwind_wrong_register,.-abi_test_bad_unwind_wrong_register
+
+
+
+
+.type abi_test_bad_unwind_temporary, @function
+.globl abi_test_bad_unwind_temporary
+.hidden abi_test_bad_unwind_temporary
+.align 16
+abi_test_bad_unwind_temporary:
+.cfi_startproc
+
+_CET_ENDBR
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-16
+
+
+ movq %r12,%rax
+ incq %rax
+ movq %rax,(%rsp)
+
+
+
+ movq %r12,(%rsp)
+
+
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ ret
+.cfi_endproc
+
+.size abi_test_bad_unwind_temporary,.-abi_test_bad_unwind_temporary
+
+
+
+
+.type abi_test_set_direction_flag, @function
+.globl abi_test_get_and_clear_direction_flag
+.hidden abi_test_get_and_clear_direction_flag
+abi_test_get_and_clear_direction_flag:
+_CET_ENDBR
+ pushfq
+ popq %rax
+ andq $0x400,%rax
+ shrq $10,%rax
+ cld
+ ret
+.size abi_test_get_and_clear_direction_flag,.-abi_test_get_and_clear_direction_flag
+
+
+
+.type abi_test_set_direction_flag, @function
+.globl abi_test_set_direction_flag
+.hidden abi_test_set_direction_flag
+abi_test_set_direction_flag:
+_CET_ENDBR
+ std
+ ret
+.size abi_test_set_direction_flag,.-abi_test_set_direction_flag
+#endif
diff --git a/gen/test_support/trampoline-x86_64-win.asm b/gen/test_support/trampoline-x86_64-win.asm
new file mode 100644
index 0000000..ae04cbe
--- /dev/null
+++ b/gen/test_support/trampoline-x86_64-win.asm
@@ -0,0 +1,715 @@
+; This file is generated from a similarly-named Perl script in the BoringSSL
+; source tree. Do not edit by hand.
+
+%ifidn __OUTPUT_FORMAT__, win64
+default rel
+%define XMMWORD
+%define YMMWORD
+%define ZMMWORD
+%define _CET_ENDBR
+
+%ifdef BORINGSSL_PREFIX
+%include "boringssl_prefix_symbols_nasm.inc"
+%endif
+section .text code align=64
+
+
+
+
+
+
+
+
+
+
+global abi_test_trampoline
+ALIGN 16
+abi_test_trampoline:
+
+$L$SEH_begin_abi_test_trampoline_1:
+_CET_ENDBR
+
+
+
+
+
+
+
+
+
+ sub rsp,344
+
+$L$SEH_prolog_abi_test_trampoline_2:
+ mov QWORD[112+rsp],rbx
+
+$L$SEH_prolog_abi_test_trampoline_3:
+ mov QWORD[120+rsp],rbp
+
+$L$SEH_prolog_abi_test_trampoline_4:
+ mov QWORD[128+rsp],rdi
+
+$L$SEH_prolog_abi_test_trampoline_5:
+ mov QWORD[136+rsp],rsi
+
+$L$SEH_prolog_abi_test_trampoline_6:
+ mov QWORD[144+rsp],r12
+
+$L$SEH_prolog_abi_test_trampoline_7:
+ mov QWORD[152+rsp],r13
+
+$L$SEH_prolog_abi_test_trampoline_8:
+ mov QWORD[160+rsp],r14
+
+$L$SEH_prolog_abi_test_trampoline_9:
+ mov QWORD[168+rsp],r15
+
+$L$SEH_prolog_abi_test_trampoline_10:
+ movdqa XMMWORD[176+rsp],xmm6
+
+$L$SEH_prolog_abi_test_trampoline_11:
+ movdqa XMMWORD[192+rsp],xmm7
+
+$L$SEH_prolog_abi_test_trampoline_12:
+ movdqa XMMWORD[208+rsp],xmm8
+
+$L$SEH_prolog_abi_test_trampoline_13:
+ movdqa XMMWORD[224+rsp],xmm9
+
+$L$SEH_prolog_abi_test_trampoline_14:
+ movdqa XMMWORD[240+rsp],xmm10
+
+$L$SEH_prolog_abi_test_trampoline_15:
+ movdqa XMMWORD[256+rsp],xmm11
+
+$L$SEH_prolog_abi_test_trampoline_16:
+ movdqa XMMWORD[272+rsp],xmm12
+
+$L$SEH_prolog_abi_test_trampoline_17:
+ movdqa XMMWORD[288+rsp],xmm13
+
+$L$SEH_prolog_abi_test_trampoline_18:
+ movdqa XMMWORD[304+rsp],xmm14
+
+$L$SEH_prolog_abi_test_trampoline_19:
+ movdqa XMMWORD[320+rsp],xmm15
+
+$L$SEH_prolog_abi_test_trampoline_20:
+ mov rbx,QWORD[rdx]
+ mov rbp,QWORD[8+rdx]
+ mov rdi,QWORD[16+rdx]
+ mov rsi,QWORD[24+rdx]
+ mov r12,QWORD[32+rdx]
+ mov r13,QWORD[40+rdx]
+ mov r14,QWORD[48+rdx]
+ mov r15,QWORD[56+rdx]
+ movdqa xmm6,XMMWORD[64+rdx]
+ movdqa xmm7,XMMWORD[80+rdx]
+ movdqa xmm8,XMMWORD[96+rdx]
+ movdqa xmm9,XMMWORD[112+rdx]
+ movdqa xmm10,XMMWORD[128+rdx]
+ movdqa xmm11,XMMWORD[144+rdx]
+ movdqa xmm12,XMMWORD[160+rdx]
+ movdqa xmm13,XMMWORD[176+rdx]
+ movdqa xmm14,XMMWORD[192+rdx]
+ movdqa xmm15,XMMWORD[208+rdx]
+
+ mov QWORD[88+rsp],rcx
+ mov QWORD[96+rsp],rdx
+
+
+
+
+ mov r10,r8
+ mov r11,r9
+ dec r11
+ js NEAR $L$args_done
+ mov rcx,QWORD[r10]
+ add r10,8
+ dec r11
+ js NEAR $L$args_done
+ mov rdx,QWORD[r10]
+ add r10,8
+ dec r11
+ js NEAR $L$args_done
+ mov r8,QWORD[r10]
+ add r10,8
+ dec r11
+ js NEAR $L$args_done
+ mov r9,QWORD[r10]
+ add r10,8
+ lea rax,[32+rsp]
+$L$args_loop:
+ dec r11
+ js NEAR $L$args_done
+
+
+
+
+
+
+ mov QWORD[104+rsp],r11
+ mov r11,QWORD[r10]
+ mov QWORD[rax],r11
+ mov r11,QWORD[104+rsp]
+
+ add r10,8
+ add rax,8
+ jmp NEAR $L$args_loop
+
+$L$args_done:
+ mov rax,QWORD[88+rsp]
+ mov r10,QWORD[384+rsp]
+ test r10,r10
+ jz NEAR $L$no_unwind
+
+
+ pushfq
+ or QWORD[rsp],0x100
+ popfq
+
+
+
+ nop
+global abi_test_unwind_start
+abi_test_unwind_start:
+
+ call rax
+global abi_test_unwind_return
+abi_test_unwind_return:
+
+
+
+
+ pushfq
+ and QWORD[rsp],-0x101
+ popfq
+global abi_test_unwind_stop
+abi_test_unwind_stop:
+
+ jmp NEAR $L$call_done
+
+$L$no_unwind:
+ call rax
+
+$L$call_done:
+
+ mov rdx,QWORD[96+rsp]
+ mov QWORD[rdx],rbx
+ mov QWORD[8+rdx],rbp
+ mov QWORD[16+rdx],rdi
+ mov QWORD[24+rdx],rsi
+ mov QWORD[32+rdx],r12
+ mov QWORD[40+rdx],r13
+ mov QWORD[48+rdx],r14
+ mov QWORD[56+rdx],r15
+ movdqa XMMWORD[64+rdx],xmm6
+ movdqa XMMWORD[80+rdx],xmm7
+ movdqa XMMWORD[96+rdx],xmm8
+ movdqa XMMWORD[112+rdx],xmm9
+ movdqa XMMWORD[128+rdx],xmm10
+ movdqa XMMWORD[144+rdx],xmm11
+ movdqa XMMWORD[160+rdx],xmm12
+ movdqa XMMWORD[176+rdx],xmm13
+ movdqa XMMWORD[192+rdx],xmm14
+ movdqa XMMWORD[208+rdx],xmm15
+ mov rbx,QWORD[112+rsp]
+
+ mov rbp,QWORD[120+rsp]
+
+ mov rdi,QWORD[128+rsp]
+
+ mov rsi,QWORD[136+rsp]
+
+ mov r12,QWORD[144+rsp]
+
+ mov r13,QWORD[152+rsp]
+
+ mov r14,QWORD[160+rsp]
+
+ mov r15,QWORD[168+rsp]
+
+ movdqa xmm6,XMMWORD[176+rsp]
+
+ movdqa xmm7,XMMWORD[192+rsp]
+
+ movdqa xmm8,XMMWORD[208+rsp]
+
+ movdqa xmm9,XMMWORD[224+rsp]
+
+ movdqa xmm10,XMMWORD[240+rsp]
+
+ movdqa xmm11,XMMWORD[256+rsp]
+
+ movdqa xmm12,XMMWORD[272+rsp]
+
+ movdqa xmm13,XMMWORD[288+rsp]
+
+ movdqa xmm14,XMMWORD[304+rsp]
+
+ movdqa xmm15,XMMWORD[320+rsp]
+
+ add rsp,344
+
+
+
+ ret
+
+$L$SEH_end_abi_test_trampoline_21:
+
+
+global abi_test_clobber_rax
+ALIGN 16
+abi_test_clobber_rax:
+_CET_ENDBR
+ xor rax,rax
+ ret
+
+
+global abi_test_clobber_rbx
+ALIGN 16
+abi_test_clobber_rbx:
+_CET_ENDBR
+ xor rbx,rbx
+ ret
+
+
+global abi_test_clobber_rcx
+ALIGN 16
+abi_test_clobber_rcx:
+_CET_ENDBR
+ xor rcx,rcx
+ ret
+
+
+global abi_test_clobber_rdx
+ALIGN 16
+abi_test_clobber_rdx:
+_CET_ENDBR
+ xor rdx,rdx
+ ret
+
+
+global abi_test_clobber_rdi
+ALIGN 16
+abi_test_clobber_rdi:
+_CET_ENDBR
+ xor rdi,rdi
+ ret
+
+
+global abi_test_clobber_rsi
+ALIGN 16
+abi_test_clobber_rsi:
+_CET_ENDBR
+ xor rsi,rsi
+ ret
+
+
+global abi_test_clobber_rbp
+ALIGN 16
+abi_test_clobber_rbp:
+_CET_ENDBR
+ xor rbp,rbp
+ ret
+
+
+global abi_test_clobber_r8
+ALIGN 16
+abi_test_clobber_r8:
+_CET_ENDBR
+ xor r8,r8
+ ret
+
+
+global abi_test_clobber_r9
+ALIGN 16
+abi_test_clobber_r9:
+_CET_ENDBR
+ xor r9,r9
+ ret
+
+
+global abi_test_clobber_r10
+ALIGN 16
+abi_test_clobber_r10:
+_CET_ENDBR
+ xor r10,r10
+ ret
+
+
+global abi_test_clobber_r11
+ALIGN 16
+abi_test_clobber_r11:
+_CET_ENDBR
+ xor r11,r11
+ ret
+
+
+global abi_test_clobber_r12
+ALIGN 16
+abi_test_clobber_r12:
+_CET_ENDBR
+ xor r12,r12
+ ret
+
+
+global abi_test_clobber_r13
+ALIGN 16
+abi_test_clobber_r13:
+_CET_ENDBR
+ xor r13,r13
+ ret
+
+
+global abi_test_clobber_r14
+ALIGN 16
+abi_test_clobber_r14:
+_CET_ENDBR
+ xor r14,r14
+ ret
+
+
+global abi_test_clobber_r15
+ALIGN 16
+abi_test_clobber_r15:
+_CET_ENDBR
+ xor r15,r15
+ ret
+
+
+global abi_test_clobber_xmm0
+ALIGN 16
+abi_test_clobber_xmm0:
+_CET_ENDBR
+ pxor xmm0,xmm0
+ ret
+
+
+global abi_test_clobber_xmm1
+ALIGN 16
+abi_test_clobber_xmm1:
+_CET_ENDBR
+ pxor xmm1,xmm1
+ ret
+
+
+global abi_test_clobber_xmm2
+ALIGN 16
+abi_test_clobber_xmm2:
+_CET_ENDBR
+ pxor xmm2,xmm2
+ ret
+
+
+global abi_test_clobber_xmm3
+ALIGN 16
+abi_test_clobber_xmm3:
+_CET_ENDBR
+ pxor xmm3,xmm3
+ ret
+
+
+global abi_test_clobber_xmm4
+ALIGN 16
+abi_test_clobber_xmm4:
+_CET_ENDBR
+ pxor xmm4,xmm4
+ ret
+
+
+global abi_test_clobber_xmm5
+ALIGN 16
+abi_test_clobber_xmm5:
+_CET_ENDBR
+ pxor xmm5,xmm5
+ ret
+
+
+global abi_test_clobber_xmm6
+ALIGN 16
+abi_test_clobber_xmm6:
+_CET_ENDBR
+ pxor xmm6,xmm6
+ ret
+
+
+global abi_test_clobber_xmm7
+ALIGN 16
+abi_test_clobber_xmm7:
+_CET_ENDBR
+ pxor xmm7,xmm7
+ ret
+
+
+global abi_test_clobber_xmm8
+ALIGN 16
+abi_test_clobber_xmm8:
+_CET_ENDBR
+ pxor xmm8,xmm8
+ ret
+
+
+global abi_test_clobber_xmm9
+ALIGN 16
+abi_test_clobber_xmm9:
+_CET_ENDBR
+ pxor xmm9,xmm9
+ ret
+
+
+global abi_test_clobber_xmm10
+ALIGN 16
+abi_test_clobber_xmm10:
+_CET_ENDBR
+ pxor xmm10,xmm10
+ ret
+
+
+global abi_test_clobber_xmm11
+ALIGN 16
+abi_test_clobber_xmm11:
+_CET_ENDBR
+ pxor xmm11,xmm11
+ ret
+
+
+global abi_test_clobber_xmm12
+ALIGN 16
+abi_test_clobber_xmm12:
+_CET_ENDBR
+ pxor xmm12,xmm12
+ ret
+
+
+global abi_test_clobber_xmm13
+ALIGN 16
+abi_test_clobber_xmm13:
+_CET_ENDBR
+ pxor xmm13,xmm13
+ ret
+
+
+global abi_test_clobber_xmm14
+ALIGN 16
+abi_test_clobber_xmm14:
+_CET_ENDBR
+ pxor xmm14,xmm14
+ ret
+
+
+global abi_test_clobber_xmm15
+ALIGN 16
+abi_test_clobber_xmm15:
+_CET_ENDBR
+ pxor xmm15,xmm15
+ ret
+
+
+
+
+
+global abi_test_bad_unwind_wrong_register
+ALIGN 16
+abi_test_bad_unwind_wrong_register:
+
+$L$SEH_begin_abi_test_bad_unwind_wrong_register_1:
+_CET_ENDBR
+ push r12
+
+$L$SEH_prolog_abi_test_bad_unwind_wrong_register_2:
+
+
+
+ nop
+ pop r12
+
+ ret
+$L$SEH_end_abi_test_bad_unwind_wrong_register_3:
+
+
+
+
+
+
+
+global abi_test_bad_unwind_temporary
+ALIGN 16
+abi_test_bad_unwind_temporary:
+
+$L$SEH_begin_abi_test_bad_unwind_temporary_1:
+_CET_ENDBR
+ push r12
+
+$L$SEH_prolog_abi_test_bad_unwind_temporary_2:
+
+ mov rax,r12
+ inc rax
+ mov QWORD[rsp],rax
+
+
+
+ mov QWORD[rsp],r12
+
+
+ pop r12
+
+ ret
+
+$L$SEH_end_abi_test_bad_unwind_temporary_3:
+
+
+
+
+
+
+global abi_test_get_and_clear_direction_flag
+abi_test_get_and_clear_direction_flag:
+_CET_ENDBR
+ pushfq
+ pop rax
+ and rax,0x400
+ shr rax,10
+ cld
+ ret
+
+
+
+
+
+global abi_test_set_direction_flag
+abi_test_set_direction_flag:
+_CET_ENDBR
+ std
+ ret
+
+
+
+
+
+
+global abi_test_bad_unwind_epilog
+ALIGN 16
+abi_test_bad_unwind_epilog:
+$L$SEH_begin_abi_test_bad_unwind_epilog_1:
+ push r12
+$L$SEH_prolog_abi_test_bad_unwind_epilog_2:
+
+ nop
+
+
+ pop r12
+ nop
+ ret
+$L$SEH_end_abi_test_bad_unwind_epilog_3:
+
+section .pdata rdata align=4
+ALIGN 4
+ DD $L$SEH_begin_abi_test_trampoline_1 wrt ..imagebase
+ DD $L$SEH_end_abi_test_trampoline_21 wrt ..imagebase
+ DD $L$SEH_info_abi_test_trampoline_0 wrt ..imagebase
+
+ DD $L$SEH_begin_abi_test_bad_unwind_wrong_register_1 wrt ..imagebase
+ DD $L$SEH_end_abi_test_bad_unwind_wrong_register_3 wrt ..imagebase
+ DD $L$SEH_info_abi_test_bad_unwind_wrong_register_0 wrt ..imagebase
+
+ DD $L$SEH_begin_abi_test_bad_unwind_temporary_1 wrt ..imagebase
+ DD $L$SEH_end_abi_test_bad_unwind_temporary_3 wrt ..imagebase
+ DD $L$SEH_info_abi_test_bad_unwind_temporary_0 wrt ..imagebase
+
+ DD $L$SEH_begin_abi_test_bad_unwind_epilog_1 wrt ..imagebase
+ DD $L$SEH_end_abi_test_bad_unwind_epilog_3 wrt ..imagebase
+ DD $L$SEH_info_abi_test_bad_unwind_epilog_0 wrt ..imagebase
+
+
+section .xdata rdata align=8
+ALIGN 4
+$L$SEH_info_abi_test_trampoline_0:
+ DB 1
+ DB $L$SEH_prolog_abi_test_trampoline_20-$L$SEH_begin_abi_test_trampoline_1
+ DB 38
+ DB 0
+ DB $L$SEH_prolog_abi_test_trampoline_20-$L$SEH_begin_abi_test_trampoline_1
+ DB 248
+ DW 20
+ DB $L$SEH_prolog_abi_test_trampoline_19-$L$SEH_begin_abi_test_trampoline_1
+ DB 232
+ DW 19
+ DB $L$SEH_prolog_abi_test_trampoline_18-$L$SEH_begin_abi_test_trampoline_1
+ DB 216
+ DW 18
+ DB $L$SEH_prolog_abi_test_trampoline_17-$L$SEH_begin_abi_test_trampoline_1
+ DB 200
+ DW 17
+ DB $L$SEH_prolog_abi_test_trampoline_16-$L$SEH_begin_abi_test_trampoline_1
+ DB 184
+ DW 16
+ DB $L$SEH_prolog_abi_test_trampoline_15-$L$SEH_begin_abi_test_trampoline_1
+ DB 168
+ DW 15
+ DB $L$SEH_prolog_abi_test_trampoline_14-$L$SEH_begin_abi_test_trampoline_1
+ DB 152
+ DW 14
+ DB $L$SEH_prolog_abi_test_trampoline_13-$L$SEH_begin_abi_test_trampoline_1
+ DB 136
+ DW 13
+ DB $L$SEH_prolog_abi_test_trampoline_12-$L$SEH_begin_abi_test_trampoline_1
+ DB 120
+ DW 12
+ DB $L$SEH_prolog_abi_test_trampoline_11-$L$SEH_begin_abi_test_trampoline_1
+ DB 104
+ DW 11
+ DB $L$SEH_prolog_abi_test_trampoline_10-$L$SEH_begin_abi_test_trampoline_1
+ DB 244
+ DW 21
+ DB $L$SEH_prolog_abi_test_trampoline_9-$L$SEH_begin_abi_test_trampoline_1
+ DB 228
+ DW 20
+ DB $L$SEH_prolog_abi_test_trampoline_8-$L$SEH_begin_abi_test_trampoline_1
+ DB 212
+ DW 19
+ DB $L$SEH_prolog_abi_test_trampoline_7-$L$SEH_begin_abi_test_trampoline_1
+ DB 196
+ DW 18
+ DB $L$SEH_prolog_abi_test_trampoline_6-$L$SEH_begin_abi_test_trampoline_1
+ DB 100
+ DW 17
+ DB $L$SEH_prolog_abi_test_trampoline_5-$L$SEH_begin_abi_test_trampoline_1
+ DB 116
+ DW 16
+ DB $L$SEH_prolog_abi_test_trampoline_4-$L$SEH_begin_abi_test_trampoline_1
+ DB 84
+ DW 15
+ DB $L$SEH_prolog_abi_test_trampoline_3-$L$SEH_begin_abi_test_trampoline_1
+ DB 52
+ DW 14
+ DB $L$SEH_prolog_abi_test_trampoline_2-$L$SEH_begin_abi_test_trampoline_1
+ DB 1
+ DW 43
+
+$L$SEH_info_abi_test_bad_unwind_wrong_register_0:
+ DB 1
+ DB $L$SEH_prolog_abi_test_bad_unwind_wrong_register_2-$L$SEH_begin_abi_test_bad_unwind_wrong_register_1
+ DB 1
+ DB 0
+ DB $L$SEH_prolog_abi_test_bad_unwind_wrong_register_2-$L$SEH_begin_abi_test_bad_unwind_wrong_register_1
+ DB 208
+
+$L$SEH_info_abi_test_bad_unwind_temporary_0:
+ DB 1
+ DB $L$SEH_prolog_abi_test_bad_unwind_temporary_2-$L$SEH_begin_abi_test_bad_unwind_temporary_1
+ DB 1
+ DB 0
+ DB $L$SEH_prolog_abi_test_bad_unwind_temporary_2-$L$SEH_begin_abi_test_bad_unwind_temporary_1
+ DB 192
+
+$L$SEH_info_abi_test_bad_unwind_epilog_0:
+ DB 1
+ DB $L$SEH_prolog_abi_test_bad_unwind_epilog_2-$L$SEH_begin_abi_test_bad_unwind_epilog_1
+ DB 1
+ DB 0
+ DB $L$SEH_prolog_abi_test_bad_unwind_epilog_2-$L$SEH_begin_abi_test_bad_unwind_epilog_1
+ DB 192
+%else
+; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
+ret
+%endif
diff --git a/sources.cmake b/sources.cmake
index 94e1f5f..9f9d373 100644
--- a/sources.cmake
+++ b/sources.cmake
@@ -1,7 +1,7 @@
# This file contains source lists that are also consumed by
# generate_build_files.py.
#
-# TODO(davidben): Move the other source lists into this file.
+# TODO(crbug.com/boringssl/542): Move everything here into util/pregenerate.
set(
CRYPTO_TEST_SOURCES
diff --git a/util/generate_build_files.py b/util/generate_build_files.py
index 1b34dc2..d564a17 100644
--- a/util/generate_build_files.py
+++ b/util/generate_build_files.py
@@ -23,46 +23,6 @@
import json
-# OS_ARCH_COMBOS maps from OS and platform to the OpenSSL assembly "style" for
-# that platform and the extension used by asm files.
-#
-# TODO(https://crbug.com/boringssl/542): This probably should be a map, but some
-# downstream scripts import this to find what folders to add/remove from git.
-OS_ARCH_COMBOS = [
- ('apple', 'aarch64', 'ios64', [], 'S'),
- ('apple', 'x86', 'macosx', ['-fPIC'], 'S'),
- ('apple', 'x86_64', 'macosx', [], 'S'),
- ('linux', 'arm', 'linux32', [], 'S'),
- ('linux', 'aarch64', 'linux64', [], 'S'),
- ('linux', 'x86', 'elf', ['-fPIC'], 'S'),
- ('linux', 'x86_64', 'elf', [], 'S'),
- ('win', 'x86', 'win32n', [], 'asm'),
- ('win', 'x86_64', 'nasm', [], 'asm'),
- ('win', 'aarch64', 'win64', [], 'S'),
-]
-
-# NON_PERL_FILES enumerates assembly files that are not processed by the
-# perlasm system.
-NON_PERL_FILES = {
- ('apple', 'x86_64'): [
- 'src/third_party/fiat/asm/fiat_curve25519_adx_mul.S',
- 'src/third_party/fiat/asm/fiat_curve25519_adx_square.S',
- 'src/third_party/fiat/asm/fiat_p256_adx_mul.S',
- 'src/third_party/fiat/asm/fiat_p256_adx_sqr.S',
- ],
- ('linux', 'arm'): [
- 'src/crypto/curve25519/asm/x25519-asm-arm.S',
- 'src/crypto/poly1305/poly1305_arm_asm.S',
- ],
- ('linux', 'x86_64'): [
- 'src/crypto/hrss/asm/poly_rq_mul.S',
- 'src/third_party/fiat/asm/fiat_curve25519_adx_mul.S',
- 'src/third_party/fiat/asm/fiat_curve25519_adx_square.S',
- 'src/third_party/fiat/asm/fiat_p256_adx_mul.S',
- 'src/third_party/fiat/asm/fiat_p256_adx_sqr.S',
- ],
-}
-
PREFIX = None
EMBED_TEST_DATA = True
@@ -569,17 +529,6 @@
with open('sources.json', 'w+') as f:
json.dump(files, f, sort_keys=True, indent=2)
-def FindCMakeFiles(directory):
- """Returns list of all CMakeLists.txt files recursively in directory."""
- cmakefiles = []
-
- for (path, _, filenames) in os.walk(directory):
- for filename in filenames:
- if filename == 'CMakeLists.txt':
- cmakefiles.append(os.path.join(path, filename))
-
- return cmakefiles
-
def OnlyFIPSFragments(path, dent, is_dir):
return is_dir or (path.startswith(
os.path.join('src', 'crypto', 'fipsmodule', '')) and
@@ -679,85 +628,6 @@
return hfiles
-def ExtractPerlAsmFromCMakeFile(cmakefile):
- """Parses the contents of the CMakeLists.txt file passed as an argument and
- returns a list of all the perlasm() directives found in the file."""
- perlasms = []
- with open(cmakefile) as f:
- for line in f:
- line = line.strip()
- if not line.startswith('perlasm('):
- continue
- if not line.endswith(')'):
- raise ValueError('Bad perlasm line in %s' % cmakefile)
- # Remove "perlasm(" from start and ")" from end
- params = line[8:-1].split()
- if len(params) < 4:
- raise ValueError('Bad perlasm line in %s' % cmakefile)
- perlasms.append({
- 'arch': params[1],
- 'output': os.path.join(os.path.dirname(cmakefile), params[2]),
- 'input': os.path.join(os.path.dirname(cmakefile), params[3]),
- 'extra_args': params[4:],
- })
-
- return perlasms
-
-
-def ReadPerlAsmOperations():
- """Returns a list of all perlasm() directives found in CMake config files in
- src/."""
- perlasms = []
- cmakefiles = FindCMakeFiles('src')
-
- for cmakefile in cmakefiles:
- perlasms.extend(ExtractPerlAsmFromCMakeFile(cmakefile))
-
- return perlasms
-
-
-def PerlAsm(output_filename, input_filename, perlasm_style, extra_args):
- """Runs the a perlasm script and puts the output into output_filename."""
- base_dir = os.path.dirname(output_filename)
- if not os.path.isdir(base_dir):
- os.makedirs(base_dir)
- subprocess.check_call(
- ['perl', input_filename, perlasm_style] + extra_args + [output_filename])
-
-
-def WriteAsmFiles(perlasms):
- """Generates asm files from perlasm directives for each supported OS x
- platform combination."""
- asmfiles = {}
-
- for perlasm in perlasms:
- for (osname, arch, perlasm_style, extra_args, asm_ext) in OS_ARCH_COMBOS:
- if arch != perlasm['arch']:
- continue
- # TODO(https://crbug.com/boringssl/542): Now that we incorporate osname in
- # the output filename, the asm files can just go in a single directory.
- # For now, we keep them in target-specific directories to avoid breaking
- # downstream scripts.
- key = (osname, arch)
- outDir = '%s-%s' % key
- output = perlasm['output']
- if not output.startswith('src'):
- raise ValueError('output missing src: %s' % output)
- output = os.path.join(outDir, output[4:])
- output = '%s-%s.%s' % (output, osname, asm_ext)
- PerlAsm(output, perlasm['input'], perlasm_style,
- extra_args + perlasm['extra_args'])
- asmfiles.setdefault(key, []).append(output)
-
- for (key, non_perl_asm_files) in NON_PERL_FILES.items():
- asmfiles.setdefault(key, []).extend(non_perl_asm_files)
-
- for files in asmfiles.values():
- files.sort()
-
- return asmfiles
-
-
def ExtractVariablesFromCMakeFile(cmakefile):
"""Parses the contents of the CMakeLists.txt file passed as an argument and
returns a dictionary of exported source lists."""
@@ -792,7 +662,12 @@
def main(platforms):
+ # TODO(crbug.com/boringssl/542): Move everything to util/pregenerate and the
+ # new JSON file.
cmake = ExtractVariablesFromCMakeFile(os.path.join('src', 'sources.cmake'))
+ with open(os.path.join('src', 'gen', 'sources.json')) as f:
+ sources = json.load(f)
+
crypto_c_files = (FindCFiles(os.path.join('src', 'crypto'), NoTestsNorFIPSFragments) +
FindCFiles(os.path.join('src', 'third_party', 'fiat'), NoTestsNorFIPSFragments))
fips_fragments = FindCFiles(os.path.join('src', 'crypto', 'fipsmodule'), OnlyFIPSFragments)
@@ -805,12 +680,7 @@
os.path.join('src', 'crypto', 'fipsmodule', 'bcm.c')
]
- # Generate err_data.c
- with open('err_data.c', 'w+') as err_data:
- subprocess.check_call(['go', 'run', 'err_data_generate.go'],
- cwd=os.path.join('src', 'crypto', 'err'),
- stdout=err_data)
- crypto_c_files.append('err_data.c')
+ crypto_c_files += PrefixWithSrc(sources['crypto']['srcs'])
crypto_c_files.sort()
test_support_h_files = (
@@ -847,28 +717,19 @@
FindHeaderFiles(os.path.join('src', 'crypto'), NoTests) +
FindHeaderFiles(os.path.join('src', 'third_party', 'fiat'), NoTests))
- asm_outputs = sorted(WriteAsmFiles(ReadPerlAsmOperations()).items())
-
- # Generate combined source lists for gas and nasm. Some files appear in
- # multiple per-platform lists, so we de-duplicate.
- #
- # TODO(https://crbug.com/boringssl/542): It would be simpler to build the
- # combined source lists directly. This is a remnant of the previous assembly
- # strategy. When we move to pre-generated assembly files, this will be
- # removed.
- asm_sources = set()
- nasm_sources = set()
- for ((osname, arch), asm_files) in asm_outputs:
- if (osname, arch) in (('win', 'x86'), ('win', 'x86_64')):
- nasm_sources.update(asm_files)
- else:
- asm_sources.update(asm_files)
+ # TODO(crbug.com/boringssl/542): generate_build_files.py historically reported
+ # all the assembly files as part of libcrypto. Merge them for now, but we
+ # should split them out later.
+ crypto_asm = sorted(sources['bcm']['asm'] + sources['crypto']['asm'] +
+ sources['test_support']['asm'])
+ crypto_nasm = sorted(sources['bcm']['nasm'] + sources['crypto']['nasm'] +
+ sources['test_support']['nasm'])
files = {
'bcm_crypto': bcm_crypto_c_files,
'crypto': crypto_c_files,
- 'crypto_asm': sorted(list(asm_sources)),
- 'crypto_nasm': sorted(list(nasm_sources)),
+ 'crypto_asm': PrefixWithSrc(crypto_asm),
+ 'crypto_nasm': PrefixWithSrc(crypto_nasm),
'crypto_headers': crypto_h_files,
'crypto_internal_headers': crypto_internal_h_files,
'crypto_test': crypto_test_files,
diff --git a/util/pregenerate/build.go b/util/pregenerate/build.go
new file mode 100644
index 0000000..5f60960
--- /dev/null
+++ b/util/pregenerate/build.go
@@ -0,0 +1,284 @@
+// Copyright (c) 2024, Google Inc.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+package main
+
+import (
+ "bytes"
+ "cmp"
+ "encoding/json"
+ "fmt"
+ "path"
+ "slices"
+ "strings"
+)
+
+// An OutputTarget is a build target for consumption by the downstream build
+// systems. All pre-generated files are baked into its source lists.
+type OutputTarget struct {
+ // Srcs is the list of C or C++ files (determined by file extension) that are
+ // built into the target.
+ Srcs []string `json:"srcs,omitempty"`
+ // Hdrs is the list public headers that should be available to external
+ // projects using this target.
+	// Hdrs is the list of public headers that should be available to external
+ // InternalHdrs is the list of internal headers that should be available to
+ // this target, as well as any internal targets using this target.
+ InternalHdrs []string `json:"internal_hdrs,omitempty"`
+	// Asm is the list of assembly files to be passed to a gas-compatible
+ // assembler.
+ Asm []string `json:"asm,omitempty"`
+	// Nasm is the list of assembly files to be passed to a nasm-compatible
+ // assembler.
+ Nasm []string `json:"nasm,omitempty"`
+ // Data is a list of test data files that should be available when the test is
+ // run.
+ Data []string `json:"data,omitempty"`
+}
+
+// An InputTarget is a build target with build inputs that still need to be
+// pregenerated.
+type InputTarget struct {
+ OutputTarget
+ // ErrData contains a list of errordata files to combine into err_data.c.
+ ErrData []string `json:"err_data,omitempty"`
+ // The following fields define perlasm sources for the corresponding
+ // architecture.
+ PerlasmAarch64 []PerlasmSource `json:"perlasm_aarch64,omitempty"`
+ PerlasmArm []PerlasmSource `json:"perlasm_arm,omitempty"`
+ PerlasmX86 []PerlasmSource `json:"perlasm_x86,omitempty"`
+ PerlasmX86_64 []PerlasmSource `json:"perlasm_x86_64,omitempty"`
+}
+
+type PerlasmSource struct {
+	// Src is the path to the input perlasm file.
+ Src string `json:"src"`
+	// Dst, if not empty, is the base name of the destination file. If empty,
+	// it is determined from Src by default. It should be overridden if a single
+ // source file generates multiple functions (e.g. SHA-256 vs SHA-512) or
+ // multiple architectures (e.g. the "armx" files).
+ Dst string `json:"dst,omitempty"`
+ // Args is a list of extra parameters to pass to the script.
+ Args []string `json:"args,omitempty"`
+}
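+
+// For illustration only, an entry in build.json for a hypothetical "bcm"
+// target with a single perlasm source might look roughly like the following
+// (the field names follow the JSON tags above; the paths are placeholders):
+//
+//	"bcm": {
+//	    "srcs": ["crypto/fipsmodule/bcm.c"],
+//	    "perlasm_x86_64": [
+//	        {"src": "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"}
+//	    ]
+//	}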
+
+// Pregenerate converts an input target to an output target. It returns the
+// result alongside a list of tasks that must be run to build the referenced
+// files.
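+//
+// For example, a PerlasmX86_64 entry whose Src is "foo/bar.pl" (a placeholder
+// path) in a target named "bcm" produces gen/bcm/bar-apple.S and
+// gen/bcm/bar-linux.S in Asm, and gen/bcm/bar-win.asm in Nasm.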
+func (in *InputTarget) Pregenerate(name string) (out OutputTarget, tasks []Task) {
+ out = in.OutputTarget
+
+ // Make copies of any fields we will write to.
+ out.Srcs = slices.Clone(out.Srcs)
+ out.Asm = slices.Clone(out.Asm)
+ out.Nasm = slices.Clone(out.Nasm)
+
+ addTask := func(list *[]string, t Task) {
+ tasks = append(tasks, t)
+ *list = append(*list, t.Destination())
+ }
+
+ if len(in.ErrData) != 0 {
+ addTask(&out.Srcs, &ErrDataTask{TargetName: name, Inputs: in.ErrData})
+ }
+
+ addPerlasmTask := func(list *[]string, p *PerlasmSource, fileSuffix string, args []string) {
+ dst := p.Dst
+ if len(p.Dst) == 0 {
+ dst = strings.TrimSuffix(path.Base(p.Src), ".pl")
+ }
+ dst = path.Join("gen", name, dst+fileSuffix)
+ args = append(slices.Clone(args), p.Args...)
+ addTask(list, &PerlasmTask{Src: p.Src, Dst: dst, Args: args})
+ }
+
+ for _, p := range in.PerlasmAarch64 {
+ addPerlasmTask(&out.Asm, &p, "-apple.S", []string{"ios64"})
+ addPerlasmTask(&out.Asm, &p, "-linux.S", []string{"linux64"})
+ addPerlasmTask(&out.Asm, &p, "-win.S", []string{"win64"})
+ }
+ for _, p := range in.PerlasmArm {
+ addPerlasmTask(&out.Asm, &p, "-linux.S", []string{"linux32"})
+ }
+ for _, p := range in.PerlasmX86 {
+ addPerlasmTask(&out.Asm, &p, "-apple.S", []string{"macosx", "-fPIC", "-DOPENSSL_IA32_SSE2"})
+ addPerlasmTask(&out.Asm, &p, "-linux.S", []string{"elf", "-fPIC", "-DOPENSSL_IA32_SSE2"})
+ addPerlasmTask(&out.Nasm, &p, "-win.asm", []string{"win32n", "-fPIC", "-DOPENSSL_IA32_SSE2"})
+ }
+ for _, p := range in.PerlasmX86_64 {
+ addPerlasmTask(&out.Asm, &p, "-apple.S", []string{"macosx"})
+ addPerlasmTask(&out.Asm, &p, "-linux.S", []string{"elf"})
+ addPerlasmTask(&out.Nasm, &p, "-win.asm", []string{"nasm"})
+ }
+
+ // Re-sort the modified fields.
+ slices.Sort(out.Srcs)
+ slices.Sort(out.Asm)
+ slices.Sort(out.Nasm)
+
+ return
+}
+
+func sortedKeys[K cmp.Ordered, V any](m map[K]V) []K {
+ keys := make([]K, 0, len(m))
+ for k := range m {
+ keys = append(keys, k)
+ }
+ slices.Sort(keys)
+ return keys
+}
+
+func writeHeader(b *bytes.Buffer, comment string) {
+ fmt.Fprintf(b, "%s Copyright (c) 2024, Google Inc.\n", comment)
+ fmt.Fprintf(b, "%s\n", comment)
+ fmt.Fprintf(b, "%s Permission to use, copy, modify, and/or distribute this software for any\n", comment)
+ fmt.Fprintf(b, "%s purpose with or without fee is hereby granted, provided that the above\n", comment)
+ fmt.Fprintf(b, "%s copyright notice and this permission notice appear in all copies.\n", comment)
+ fmt.Fprintf(b, "%s\n", comment)
+ fmt.Fprintf(b, "%s THE SOFTWARE IS PROVIDED \"AS IS\" AND THE AUTHOR DISCLAIMS ALL WARRANTIES\n", comment)
+ fmt.Fprintf(b, "%s WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF\n", comment)
+ fmt.Fprintf(b, "%s MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY\n", comment)
+ fmt.Fprintf(b, "%s SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES\n", comment)
+ fmt.Fprintf(b, "%s WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION\n", comment)
+ fmt.Fprintf(b, "%s OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN\n", comment)
+ fmt.Fprintf(b, "%s CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.\n", comment)
+ fmt.Fprintf(b, "%s\n", comment)
+	fmt.Fprintf(b, "%s Generated by go run ./util/pregenerate. Do not edit manually.\n", comment)
+}
+
+func buildVariablesTask(targets map[string]OutputTarget, dst, comment string, writeVariable func(b *bytes.Buffer, name string, val []string)) Task {
+ return NewSimpleTask(dst, func() ([]byte, error) {
+ var b bytes.Buffer
+ writeHeader(&b, comment)
+
+ for _, name := range sortedKeys(targets) {
+ target := targets[name]
+ if len(target.Srcs) != 0 {
+ writeVariable(&b, name+"_sources", target.Srcs)
+ }
+ if len(target.Hdrs) != 0 {
+ writeVariable(&b, name+"_headers", target.Hdrs)
+ }
+ if len(target.InternalHdrs) != 0 {
+ writeVariable(&b, name+"_internal_headers", target.InternalHdrs)
+ }
+ if len(target.Asm) != 0 {
+ writeVariable(&b, name+"_sources_asm", target.Asm)
+ }
+ if len(target.Nasm) != 0 {
+ writeVariable(&b, name+"_sources_nasm", target.Nasm)
+ }
+ if len(target.Data) != 0 {
+ writeVariable(&b, name+"_data", target.Data)
+ }
+ }
+
+ return b.Bytes(), nil
+ })
+}
+
+func writeBazelVariable(b *bytes.Buffer, name string, val []string) {
+ fmt.Fprintf(b, "\n%s = [\n", name)
+ for _, v := range val {
+ fmt.Fprintf(b, " %q,\n", v)
+ }
+ fmt.Fprintf(b, "]\n")
+}
+
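+// For example, writeCMakeVariable(&b, "crypto_sources", []string{"a.c", "b.c"})
+// appends the following to b (preceded by a blank line):
+//
+//	set(
+//	  CRYPTO_SOURCES
+//
+//	  a.c
+//	  b.c
+//	)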
+func writeCMakeVariable(b *bytes.Buffer, name string, val []string) {
+ fmt.Fprintf(b, "\nset(\n")
+ fmt.Fprintf(b, " %s\n\n", strings.ToUpper(name))
+ for _, v := range val {
+ fmt.Fprintf(b, " %s\n", v)
+ }
+ fmt.Fprintf(b, ")\n")
+}
+
+func writeMakeVariable(b *bytes.Buffer, name string, val []string) {
+ fmt.Fprintf(b, "\n%s := \\\n", name)
+ for i, v := range val {
+ if i == len(val)-1 {
+ fmt.Fprintf(b, " %s\n", v)
+ } else {
+ fmt.Fprintf(b, " %s \\\n", v)
+ }
+ }
+}
+
+func writeGNVariable(b *bytes.Buffer, name string, val []string) {
+	// Bazel and GN have similar syntax.
+ writeBazelVariable(b, name, val)
+}
+
+func jsonTask(targets map[string]OutputTarget, dst string) Task {
+ return NewSimpleTask(dst, func() ([]byte, error) {
+ return json.MarshalIndent(targets, "", " ")
+ })
+}
+
+func soongTask(targets map[string]OutputTarget, dst string) Task {
+ return NewSimpleTask(dst, func() ([]byte, error) {
+ var b bytes.Buffer
+ writeHeader(&b, "//")
+
+ writeAttribute := func(indent, name string, val []string) {
+ fmt.Fprintf(&b, "%s%s: [\n", indent, name)
+ for _, v := range val {
+ fmt.Fprintf(&b, "%s %q,\n", indent, v)
+ }
+ fmt.Fprintf(&b, "%s],\n", indent)
+
+ }
+
+ for _, name := range sortedKeys(targets) {
+ target := targets[name]
+ fmt.Fprintf(&b, "\ncc_defaults {\n")
+ fmt.Fprintf(&b, " name: %q\n", "boringssl_"+name+"_sources")
+ if len(target.Srcs) != 0 {
+ writeAttribute(" ", "srcs", target.Srcs)
+ }
+ if len(target.Data) != 0 {
+ writeAttribute(" ", "data", target.Data)
+ }
+ if len(target.Asm) != 0 {
+ fmt.Fprintf(&b, " target: {\n")
+ // Only emit asm for Linux. On Windows, BoringSSL requires NASM, which is
+ // not available in AOSP. On Darwin, the assembly works fine, but it
+ // conflicts with Android's FIPS build. See b/294399371.
+ fmt.Fprintf(&b, " linux: {\n")
+ writeAttribute(" ", "srcs", target.Asm)
+ fmt.Fprintf(&b, " },\n")
+ fmt.Fprintf(&b, " darwin: {\n")
+ fmt.Fprintf(&b, " cflags: [\"-DOPENSSL_NO_ASM\"],\n")
+ fmt.Fprintf(&b, " },\n")
+ fmt.Fprintf(&b, " windows: {\n")
+ fmt.Fprintf(&b, " cflags: [\"-DOPENSSL_NO_ASM\"],\n")
+ fmt.Fprintf(&b, " },\n")
+ fmt.Fprintf(&b, " },\n")
+ }
+ fmt.Fprintf(&b, "},\n")
+ }
+
+ return b.Bytes(), nil
+ })
+}
+
+func MakeBuildFiles(targets map[string]OutputTarget) []Task {
+ // TODO(crbug.com/boringssl/542): Generate the build files for the other
+ // types as well.
+ return []Task{
+ buildVariablesTask(targets, "gen/sources.cmake", "#", writeCMakeVariable),
+ jsonTask(targets, "gen/sources.json"),
+ }
+}
diff --git a/crypto/err/err_data_generate.go b/util/pregenerate/err_data.go
similarity index 85%
rename from crypto/err/err_data_generate.go
rename to util/pregenerate/err_data.go
index d4a7c28..8d89d99 100644
--- a/crypto/err/err_data_generate.go
+++ b/util/pregenerate/err_data.go
@@ -12,25 +12,20 @@
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-//go:build ignore
-
package main
import (
"bufio"
"bytes"
"errors"
- "flag"
"fmt"
"io"
"os"
+ "path"
"sort"
"strconv"
- "strings"
)
-var verbose = flag.Bool("verbose", false, "If true, prints a status message at the end.")
-
// libraryNames must be kept in sync with the enum in err.h. The generated code
// will contain static assertions to enforce this.
var libraryNames = []string{
@@ -129,10 +124,6 @@
func (st *stringList) WriteTo(out stringWriter, name string) {
list := st.buildList()
- if *verbose {
- fmt.Fprintf(os.Stderr, "%s: %d bytes of list and %d bytes of string data.\n", name, 4*len(list), len(st.stringData))
- }
-
values := "kOpenSSL" + name + "Values"
out.WriteString("const uint32_t " + values + "[] = {\n")
for _, v := range list {
@@ -207,9 +198,16 @@
return scanner.Err()
}
-func main() {
- flag.Parse()
+type ErrDataTask struct {
+ TargetName string
+ Inputs []string
+}
+func (t *ErrDataTask) Destination() string {
+ return path.Join("gen", t.TargetName, "err_data.c")
+}
+
+func (t *ErrDataTask) Run() ([]byte, error) {
e := &errorData{
reasons: newStringList(),
libraryMap: make(map[string]uint32),
@@ -218,27 +216,13 @@
e.libraryMap[name] = uint32(i) + 1
}
- cwd, err := os.Open(".")
- if err != nil {
- panic(err)
- }
- names, err := cwd.Readdirnames(-1)
- if err != nil {
- panic(err)
- }
-
- sort.Strings(names)
- for _, name := range names {
- if !strings.HasSuffix(name, ".errordata") {
- continue
- }
- if err := e.readErrorDataFile(name); err != nil {
- panic(err)
+ for _, input := range t.Inputs {
+ if err := e.readErrorDataFile(input); err != nil {
+ return nil, err
}
}
- out := os.Stdout
-
+ var out bytes.Buffer
out.WriteString(`/* Copyright (c) 2015, Google Inc.
*
* Permission to use, copy, modify, and/or distribute this software for any
@@ -253,7 +237,7 @@
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
- /* This file was generated by err_data_generate.go. */
+ /* This file was generated by go run ./util/pregenerate. */
#include <openssl/base.h>
#include <openssl/err.h>
@@ -263,10 +247,11 @@
`)
for i, name := range libraryNames {
- fmt.Fprintf(out, "static_assert(ERR_LIB_%s == %d, \"library value changed\");\n", name, i+1)
+ fmt.Fprintf(&out, "static_assert(ERR_LIB_%s == %d, \"library value changed\");\n", name, i+1)
}
- fmt.Fprintf(out, "static_assert(ERR_NUM_LIBS == %d, \"number of libraries changed\");\n", len(libraryNames)+1)
+ fmt.Fprintf(&out, "static_assert(ERR_NUM_LIBS == %d, \"number of libraries changed\");\n", len(libraryNames)+1)
out.WriteString("\n")
- e.reasons.WriteTo(out, "Reason")
+ e.reasons.WriteTo(&out, "Reason")
+ return out.Bytes(), nil
}
diff --git a/util/pregenerate/pregenerate.go b/util/pregenerate/pregenerate.go
new file mode 100644
index 0000000..ba062c7
--- /dev/null
+++ b/util/pregenerate/pregenerate.go
@@ -0,0 +1,218 @@
+// Copyright (c) 2024, Google Inc.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+// pregenerate manages the pre-generated files in BoringSSL.
+package main
+
+import (
+ "bytes"
+ "encoding/json"
+ "errors"
+ "flag"
+ "fmt"
+ "os"
+ "path/filepath"
+ "runtime"
+ "slices"
+ "strings"
+ "sync"
+)
+
+var (
+ check = flag.Bool("check", false, "Check whether any files need to be updated, without actually updating them")
+ numWorkers = flag.Int("num-workers", runtime.NumCPU(), "Runs the given number of workers")
+ dryRun = flag.Bool("dry-run", false, "Skip actually writing any files")
+ perlPath = flag.String("perl", "perl", "Path to the perl command")
+ list = flag.Bool("list", false, "List all generated files, rather than actually run them")
+)
+
+func runTask(t Task) error {
+ expected, err := t.Run()
+ if err != nil {
+ return err
+ }
+
+ dst := t.Destination()
+ dstPath := filepath.FromSlash(dst)
+ if *check {
+ actual, err := os.ReadFile(dstPath)
+ if err != nil {
+ if os.IsNotExist(err) {
+ err = errors.New("missing file")
+ }
+ return err
+ }
+
+ if !bytes.Equal(expected, actual) {
+ return errors.New("file out of date")
+ }
+ return nil
+ }
+
+ if *dryRun {
+ fmt.Printf("Would write %d bytes to %q\n", len(expected), dst)
+ return nil
+ }
+
+ if err := os.MkdirAll(filepath.Dir(dstPath), 0777); err != nil {
+ return err
+ }
+ return os.WriteFile(dstPath, expected, 0666)
+}
+
+type taskError struct {
+ dst string
+ err error
+}
+
+func worker(taskChan <-chan Task, errorChan chan<- taskError, wg *sync.WaitGroup) {
+ defer wg.Done()
+ for t := range taskChan {
+ if err := runTask(t); err != nil {
+ errorChan <- taskError{t.Destination(), err}
+ }
+ }
+}
+
+func run() error {
+ if _, err := os.Stat("BUILDING.md"); err != nil {
+ return fmt.Errorf("must be run from BoringSSL source root")
+ }
+
+ buildJSON, err := os.ReadFile("build.json")
+ if err != nil {
+ return err
+ }
+
+ // Remove comments. For now, just do a very basic preprocessing step. If
+ // needed, we can switch to something well-defined like one of the many
+ // dozen different extended JSONs like JSON5.
+ lines := bytes.Split(buildJSON, []byte("\n"))
+ for i := range lines {
+ if idx := bytes.Index(lines[i], []byte("//")); idx >= 0 {
+ lines[i] = lines[i][:idx]
+ }
+ }
+ buildJSON = bytes.Join(lines, []byte("\n"))
+
+ var targetsIn map[string]InputTarget
+ if err := json.Unmarshal(buildJSON, &targetsIn); err != nil {
+ return fmt.Errorf("error decoding build config: %s", err)
+ }
+
+ var tasks []Task
+ targetsOut := make(map[string]OutputTarget)
+ for name, targetIn := range targetsIn {
+ targetOut, targetTasks := targetIn.Pregenerate(name)
+ targetsOut[name] = targetOut
+ tasks = append(tasks, targetTasks...)
+ }
+
+ tasks = append(tasks, MakeBuildFiles(targetsOut)...)
+ tasks = append(tasks, NewSimpleTask("gen/README.md", func() ([]byte, error) {
+ return []byte(readme), nil
+ }))
+
+ // Filter tasks by command-line argument.
+ if args := flag.Args(); len(args) != 0 {
+ var filtered []Task
+ for _, t := range tasks {
+ dst := t.Destination()
+ for _, arg := range args {
+ if strings.Contains(dst, arg) {
+ filtered = append(filtered, t)
+ break
+ }
+ }
+ }
+ tasks = filtered
+ }
+
+ if *list {
+ paths := make([]string, len(tasks))
+ for i, t := range tasks {
+ paths[i] = t.Destination()
+ }
+ slices.Sort(paths)
+ for _, p := range paths {
+ fmt.Println(p)
+ }
+ return nil
+ }
+
+ // Schedule tasks in parallel. Perlasm benefits from running in parallel. The
+ // others likely do not, but it is simpler to parallelize them all.
+ var wg sync.WaitGroup
+ taskChan := make(chan Task, *numWorkers)
+ errorChan := make(chan taskError, *numWorkers)
+ for i := 0; i < *numWorkers; i++ {
+ wg.Add(1)
+ go worker(taskChan, errorChan, &wg)
+ }
+
+ go func() {
+ for _, t := range tasks {
+ taskChan <- t
+ }
+ close(taskChan)
+ wg.Wait()
+ close(errorChan)
+ }()
+
+ var failed bool
+ for err := range errorChan {
+ fmt.Fprintf(os.Stderr, "Error in file %q: %s\n", err.dst, err.err)
+ failed = true
+ }
+ if failed {
+ return errors.New("some files had errors")
+ }
+ return nil
+}
+
+func main() {
+ flag.Parse()
+ if err := run(); err != nil {
+ fmt.Fprintf(os.Stderr, "Error: %s\n", err)
+ os.Exit(1)
+ }
+}
+
+const readme = `# Pre-generated files
+
+This directory contains a number of pre-generated build artifacts. To simplify
+downstream builds, they are checked into the repository, rather than dynamically
+generated as part of the build.
+
+When developing on BoringSSL, if any inputs to these files are modified, callers
+must run the following command to update the generated files:
+
+ go run ./util/pregenerate
+
+To check that files are up-to-date without updating them, run:
+
+ go run ./util/pregenerate -check
+
+This is run on CI to ensure the generated files remain up-to-date.
+
+To speed up local iteration, the tool accepts additional arguments to filter the
+files generated. For example, if editing ` + "`aesni-x86_64.pl`" + `, this
+command will only update files with "aesni-x86_64" as a substring.
+
+ go run ./util/pregenerate aesni-x86_64
+
+For convenience, all files in this directory, including this README, are managed
+by the tool. This means the whole directory may be deleted and regenerated from
+scratch at any time.
+`
diff --git a/util/pregenerate/task.go b/util/pregenerate/task.go
new file mode 100644
index 0000000..f04fc43
--- /dev/null
+++ b/util/pregenerate/task.go
@@ -0,0 +1,82 @@
+// Copyright (c) 2024, Google Inc.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+package main
+
+import (
+ "bytes"
+ "os"
+ "os/exec"
+ "path"
+ "path/filepath"
+)
+
+type Task interface {
+ // Destination returns the destination path for this task, using forward
+ // slashes and relative to the source directory. That is, use the "path"
+ // package, not "path/filepath".
+ Destination() string
+
+ // Run computes the output for this task. It should be written to the
+ // destination path.
+ Run() ([]byte, error)
+}
+
+type SimpleTask struct {
+ Dst string
+ RunFunc func() ([]byte, error)
+}
+
+func (t *SimpleTask) Destination() string { return t.Dst }
+func (t *SimpleTask) Run() ([]byte, error) { return t.RunFunc() }
+
+func NewSimpleTask(dst string, runFunc func() ([]byte, error)) *SimpleTask {
+ return &SimpleTask{Dst: dst, RunFunc: runFunc}
+}
+
+type PerlasmTask struct {
+ Src, Dst string
+ Args []string
+}
+
+func (t *PerlasmTask) Destination() string { return t.Dst }
+func (t *PerlasmTask) Run() ([]byte, error) {
+ base := path.Base(t.Dst)
+ out, err := os.CreateTemp("", "*."+base)
+ if err != nil {
+ return nil, err
+ }
+ defer os.Remove(out.Name())
+
+ args := make([]string, 0, 2+len(t.Args))
+ args = append(args, filepath.FromSlash(t.Src))
+ args = append(args, t.Args...)
+ args = append(args, out.Name())
+ cmd := exec.Command(*perlPath, args...)
+ cmd.Stderr = os.Stderr
+ cmd.Stdout = os.Stdout
+ if err := cmd.Run(); err != nil {
+ return nil, err
+ }
+
+ data, err := os.ReadFile(out.Name())
+ if err != nil {
+ return nil, err
+ }
+
+ // On Windows, perl emits CRLF line endings. Normalize this so that the tool
+ // can be run on Windows too.
+ data = bytes.ReplaceAll(data, []byte("\r\n"), []byte("\n"))
+ return data, nil
+}